From bbcc64772580c8a979288791afa02d30bc476d2e Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 21:52:15 -0500 Subject: Update aom to v1.0.0 Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0. --- third_party/aom/av1/av1.cmake | 953 +- third_party/aom/av1/av1_common.mk | 205 - third_party/aom/av1/av1_cx.mk | 176 - third_party/aom/av1/av1_cx_iface.c | 1069 +- third_party/aom/av1/av1_dx.mk | 67 - third_party/aom/av1/av1_dx_iface.c | 908 +- third_party/aom/av1/av1_iface_common.h | 48 +- third_party/aom/av1/common/alloccommon.c | 386 +- third_party/aom/av1/common/alloccommon.h | 8 +- third_party/aom/av1/common/arm/av1_txfm_neon.c | 28 + .../aom/av1/common/arm/blend_a64_hmask_neon.c | 134 + .../aom/av1/common/arm/blend_a64_vmask_neon.c | 141 + third_party/aom/av1/common/arm/cfl_neon.c | 584 + third_party/aom/av1/common/arm/convolve_neon.c | 1134 ++ third_party/aom/av1/common/arm/convolve_neon.h | 228 + third_party/aom/av1/common/arm/intrapred_neon.c | 79 + third_party/aom/av1/common/arm/jnt_convolve_neon.c | 1326 ++ third_party/aom/av1/common/arm/mem_neon.h | 401 + .../aom/av1/common/arm/neon/iht4x4_add_neon.c | 228 - .../aom/av1/common/arm/neon/iht8x8_add_neon.c | 594 - third_party/aom/av1/common/arm/reconinter_neon.c | 86 + third_party/aom/av1/common/arm/transpose_neon.h | 422 + .../aom/av1/common/arm/wiener_convolve_neon.c | 401 + third_party/aom/av1/common/av1_fwd_txfm1d.c | 2355 --- third_party/aom/av1/common/av1_fwd_txfm1d.h | 61 - third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h | 363 - third_party/aom/av1/common/av1_fwd_txfm2d.c | 413 - third_party/aom/av1/common/av1_inv_txfm1d.c | 2726 ++-- third_party/aom/av1/common/av1_inv_txfm1d.h | 66 +- third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 382 +- third_party/aom/av1/common/av1_inv_txfm2d.c | 616 +- third_party/aom/av1/common/av1_loopfilter.c | 4744 ++---- third_party/aom/av1/common/av1_loopfilter.h | 233 +- third_party/aom/av1/common/av1_rtcd.c | 6 +- 
third_party/aom/av1/common/av1_rtcd_defs.pl | 764 +- third_party/aom/av1/common/av1_txfm.c | 110 + third_party/aom/av1/common/av1_txfm.h | 319 +- third_party/aom/av1/common/blockd.c | 255 +- third_party/aom/av1/common/blockd.h | 1538 +- third_party/aom/av1/common/cdef.c | 231 +- third_party/aom/av1/common/cdef.h | 9 +- third_party/aom/av1/common/cdef_block.c | 391 +- third_party/aom/av1/common/cdef_block.h | 49 +- third_party/aom/av1/common/cdef_block_avx2.c | 2 +- third_party/aom/av1/common/cdef_block_neon.c | 2 +- third_party/aom/av1/common/cdef_block_simd.h | 813 +- third_party/aom/av1/common/cdef_block_sse2.c | 2 +- third_party/aom/av1/common/cdef_block_sse4.c | 2 +- third_party/aom/av1/common/cdef_block_ssse3.c | 2 +- third_party/aom/av1/common/cfl.c | 642 +- third_party/aom/av1/common/cfl.h | 274 +- third_party/aom/av1/common/clpf.c | 115 - third_party/aom/av1/common/clpf_neon.c | 14 - third_party/aom/av1/common/clpf_simd.h | 456 - third_party/aom/av1/common/clpf_sse2.c | 14 - third_party/aom/av1/common/clpf_sse4.c | 14 - third_party/aom/av1/common/clpf_ssse3.c | 14 - third_party/aom/av1/common/common.h | 3 + third_party/aom/av1/common/common_data.h | 2023 +-- third_party/aom/av1/common/convolve.c | 2126 ++- third_party/aom/av1/common/convolve.h | 173 +- third_party/aom/av1/common/daala_tx.c | 4331 ------ third_party/aom/av1/common/daala_tx.h | 53 - third_party/aom/av1/common/debugmodes.c | 29 +- third_party/aom/av1/common/entropy.c | 2564 +-- third_party/aom/av1/common/entropy.h | 361 +- third_party/aom/av1/common/entropymode.c | 6996 ++------- third_party/aom/av1/common/entropymode.h | 661 +- third_party/aom/av1/common/entropymv.c | 227 +- third_party/aom/av1/common/entropymv.h | 52 +- third_party/aom/av1/common/enums.h | 624 +- third_party/aom/av1/common/filter.c | 274 +- third_party/aom/av1/common/filter.h | 85 +- third_party/aom/av1/common/frame_buffers.c | 1 + third_party/aom/av1/common/generic_code.c | 112 - third_party/aom/av1/common/generic_code.h | 
81 - third_party/aom/av1/common/idct.c | 3109 +--- third_party/aom/av1/common/idct.h | 57 +- third_party/aom/av1/common/laplace_tables.c | 657 - .../aom/av1/common/mips/msa/av1_idct16x16_msa.c | 81 - .../aom/av1/common/mips/msa/av1_idct4x4_msa.c | 62 - .../aom/av1/common/mips/msa/av1_idct8x8_msa.c | 80 - third_party/aom/av1/common/mv.h | 110 +- third_party/aom/av1/common/mvref_common.c | 2726 ++-- third_party/aom/av1/common/mvref_common.h | 445 +- third_party/aom/av1/common/ncobmc_kernels.c | 1181 -- third_party/aom/av1/common/ncobmc_kernels.h | 22 - third_party/aom/av1/common/obmc.h | 39 +- third_party/aom/av1/common/odintrin.c | 10 - third_party/aom/av1/common/odintrin.h | 180 - third_party/aom/av1/common/onyxc_int.h | 1161 +- third_party/aom/av1/common/partition.c | 256 - third_party/aom/av1/common/partition.h | 40 - third_party/aom/av1/common/ppc/cfl_ppc.c | 153 + third_party/aom/av1/common/pred_common.c | 1329 +- third_party/aom/av1/common/pred_common.h | 385 +- third_party/aom/av1/common/pvq.c | 1007 -- third_party/aom/av1/common/pvq.h | 179 - third_party/aom/av1/common/pvq_state.c | 50 - third_party/aom/av1/common/pvq_state.h | 52 - third_party/aom/av1/common/quant_common.c | 585 +- third_party/aom/av1/common/quant_common.h | 71 +- third_party/aom/av1/common/reconinter.c | 3999 +---- third_party/aom/av1/common/reconinter.h | 563 +- third_party/aom/av1/common/reconintra.c | 3190 +--- third_party/aom/av1/common/reconintra.h | 98 +- third_party/aom/av1/common/resize.c | 769 +- third_party/aom/av1/common/resize.h | 43 +- third_party/aom/av1/common/restoration.c | 2684 ++-- third_party/aom/av1/common/restoration.h | 335 +- third_party/aom/av1/common/scale.c | 152 +- third_party/aom/av1/common/scale.h | 14 +- third_party/aom/av1/common/scan.c | 6307 +------- third_party/aom/av1/common/scan.h | 101 +- third_party/aom/av1/common/seg_common.c | 39 +- third_party/aom/av1/common/seg_common.h | 60 +- third_party/aom/av1/common/thread_common.c | 977 +- 
third_party/aom/av1/common/thread_common.h | 90 +- third_party/aom/av1/common/tile_common.c | 236 +- third_party/aom/av1/common/tile_common.h | 27 +- third_party/aom/av1/common/timing.c | 79 + third_party/aom/av1/common/timing.h | 59 + third_party/aom/av1/common/token_cdfs.h | 8769 +++++------ third_party/aom/av1/common/txb_common.c | 559 +- third_party/aom/av1/common/txb_common.h | 911 +- third_party/aom/av1/common/warped_motion.c | 1048 +- third_party/aom/av1/common/warped_motion.h | 99 +- .../av1/common/x86/av1_convolve_horiz_rs_sse4.c | 228 + .../aom/av1/common/x86/av1_convolve_scale_sse4.c | 665 +- .../aom/av1/common/x86/av1_convolve_ssse3.c | 1034 -- .../aom/av1/common/x86/av1_fwd_txfm1d_sse4.c | 839 - .../aom/av1/common/x86/av1_fwd_txfm2d_sse4.c | 82 - .../aom/av1/common/x86/av1_highbd_convolve_sse4.c | 334 +- third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 1957 +++ third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 210 + .../aom/av1/common/x86/av1_inv_txfm_ssse3.c | 2917 ++++ .../aom/av1/common/x86/av1_inv_txfm_ssse3.h | 236 + third_party/aom/av1/common/x86/av1_txfm1d_sse4.h | 144 - third_party/aom/av1/common/x86/av1_txfm_sse2.h | 317 + third_party/aom/av1/common/x86/av1_txfm_sse4.c | 10 + third_party/aom/av1/common/x86/av1_txfm_sse4.h | 60 + third_party/aom/av1/common/x86/cfl_avx2.c | 491 + third_party/aom/av1/common/x86/cfl_simd.h | 238 + third_party/aom/av1/common/x86/cfl_sse2.c | 89 + third_party/aom/av1/common/x86/cfl_ssse3.c | 393 + third_party/aom/av1/common/x86/convolve_2d_avx2.c | 285 + third_party/aom/av1/common/x86/convolve_2d_sse2.c | 496 +- third_party/aom/av1/common/x86/convolve_avx2.c | 581 +- third_party/aom/av1/common/x86/convolve_sse2.c | 339 + third_party/aom/av1/common/x86/filterintra_sse4.c | 931 +- .../aom/av1/common/x86/highbd_convolve_2d_avx2.c | 327 + .../aom/av1/common/x86/highbd_convolve_2d_sse2.c | 191 + .../aom/av1/common/x86/highbd_convolve_2d_sse4.c | 421 + .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 528 +- 
.../aom/av1/common/x86/highbd_inv_txfm_avx2.c | 339 +- .../aom/av1/common/x86/highbd_inv_txfm_sse4.c | 2179 ++- .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 853 + .../aom/av1/common/x86/highbd_jnt_convolve_sse4.c | 383 + .../aom/av1/common/x86/highbd_txfm_utility_sse4.h | 10 + .../aom/av1/common/x86/highbd_warp_plane_sse4.c | 454 + .../aom/av1/common/x86/highbd_warp_plane_ssse3.c | 365 - .../av1/common/x86/highbd_wiener_convolve_avx2.c | 245 + .../av1/common/x86/highbd_wiener_convolve_ssse3.c | 202 + .../aom/av1/common/x86/hybrid_inv_txfm_avx2.c | 450 - third_party/aom/av1/common/x86/idct_intrin_sse2.c | 1411 -- third_party/aom/av1/common/x86/intra_edge_sse4.c | 12 +- third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 704 + third_party/aom/av1/common/x86/jnt_convolve_sse2.c | 385 + .../aom/av1/common/x86/jnt_convolve_ssse3.c | 232 + third_party/aom/av1/common/x86/pvq_sse4.c | 252 - third_party/aom/av1/common/x86/pvq_sse4.h | 13 - third_party/aom/av1/common/x86/reconinter_avx2.c | 124 + third_party/aom/av1/common/x86/reconinter_sse4.c | 153 + third_party/aom/av1/common/x86/reconinter_ssse3.c | 116 + third_party/aom/av1/common/x86/selfguided_avx2.c | 719 + third_party/aom/av1/common/x86/selfguided_sse4.c | 2254 +-- third_party/aom/av1/common/x86/warp_plane_sse2.c | 359 - third_party/aom/av1/common/x86/warp_plane_sse4.c | 621 + third_party/aom/av1/common/x86/warp_plane_ssse3.c | 535 - .../aom/av1/common/x86/wiener_convolve_avx2.c | 260 + .../aom/av1/common/x86/wiener_convolve_sse2.c | 198 + third_party/aom/av1/common/zigzag.h | 33 - third_party/aom/av1/common/zigzag16.c | 157 - third_party/aom/av1/common/zigzag32.c | 199 - third_party/aom/av1/common/zigzag4.c | 22 - third_party/aom/av1/common/zigzag8.c | 50 - third_party/aom/av1/decoder/accounting.c | 4 +- third_party/aom/av1/decoder/accounting.h | 3 +- third_party/aom/av1/decoder/decint.h | 35 - third_party/aom/av1/decoder/decodeframe.c | 7460 ++++----- third_party/aom/av1/decoder/decodeframe.h | 64 +- 
third_party/aom/av1/decoder/decodemv.c | 2907 +--- third_party/aom/av1/decoder/decodemv.h | 14 +- third_party/aom/av1/decoder/decoder.c | 474 +- third_party/aom/av1/decoder/decoder.h | 198 +- third_party/aom/av1/decoder/decodetxb.c | 775 +- third_party/aom/av1/decoder/decodetxb.h | 25 +- third_party/aom/av1/decoder/detokenize.c | 347 +- third_party/aom/av1/decoder/detokenize.h | 11 +- third_party/aom/av1/decoder/dsubexp.c | 82 - third_party/aom/av1/decoder/dsubexp.h | 32 - third_party/aom/av1/decoder/dthread.c | 17 +- third_party/aom/av1/decoder/dthread.h | 10 +- third_party/aom/av1/decoder/generic_decoder.c | 110 - third_party/aom/av1/decoder/inspection.c | 41 +- third_party/aom/av1/decoder/inspection.h | 47 +- third_party/aom/av1/decoder/laplace_decoder.c | 121 - third_party/aom/av1/decoder/obu.c | 907 ++ third_party/aom/av1/decoder/obu.h | 54 + third_party/aom/av1/decoder/pvq_decoder.c | 378 - third_party/aom/av1/decoder/pvq_decoder.h | 40 - third_party/aom/av1/decoder/symbolrate.h | 88 - .../aom/av1/encoder/ab_partition_model_weights.h | 1318 ++ third_party/aom/av1/encoder/aq_complexity.c | 28 +- third_party/aom/av1/encoder/aq_cyclicrefresh.c | 42 +- third_party/aom/av1/encoder/aq_variance.c | 82 +- third_party/aom/av1/encoder/aq_variance.h | 4 + third_party/aom/av1/encoder/arm/neon/error_neon.c | 42 - third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 1902 +++ third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 49 + third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 19 + third_party/aom/av1/encoder/av1_fwd_txfm2d.c | 431 + third_party/aom/av1/encoder/av1_quantize.c | 1657 +- third_party/aom/av1/encoder/av1_quantize.h | 121 +- third_party/aom/av1/encoder/bgsprite.c | 1257 -- third_party/aom/av1/encoder/bgsprite.h | 30 - third_party/aom/av1/encoder/bitstream.c | 6592 +++----- third_party/aom/av1/encoder/bitstream.h | 26 +- third_party/aom/av1/encoder/block.h | 384 +- third_party/aom/av1/encoder/blockiness.c | 7 +- third_party/aom/av1/encoder/context_tree.c | 278 +- 
third_party/aom/av1/encoder/context_tree.h | 67 +- third_party/aom/av1/encoder/corner_match.c | 3 +- third_party/aom/av1/encoder/cost.c | 65 +- third_party/aom/av1/encoder/cost.h | 34 +- third_party/aom/av1/encoder/daala_compat_enc.c | 30 - third_party/aom/av1/encoder/dct.c | 2797 ---- third_party/aom/av1/encoder/dwt.c | 144 + third_party/aom/av1/encoder/dwt.h | 9 + third_party/aom/av1/encoder/encint.h | 51 - third_party/aom/av1/encoder/encodeframe.c | 7940 ++++------ third_party/aom/av1/encoder/encodeframe.h | 10 +- third_party/aom/av1/encoder/encodemb.c | 1209 +- third_party/aom/av1/encoder/encodemb.h | 55 +- third_party/aom/av1/encoder/encodemv.c | 336 +- third_party/aom/av1/encoder/encodemv.h | 19 +- third_party/aom/av1/encoder/encoder.c | 4002 ++--- third_party/aom/av1/encoder/encoder.h | 440 +- third_party/aom/av1/encoder/encodetxb.c | 3908 ++--- third_party/aom/av1/encoder/encodetxb.h | 66 +- third_party/aom/av1/encoder/ethread.c | 58 +- third_party/aom/av1/encoder/ethread.h | 3 + third_party/aom/av1/encoder/extend.c | 4 - third_party/aom/av1/encoder/firstpass.c | 524 +- third_party/aom/av1/encoder/firstpass.h | 14 +- third_party/aom/av1/encoder/generic_encoder.c | 157 - third_party/aom/av1/encoder/global_motion.c | 109 +- third_party/aom/av1/encoder/global_motion.h | 20 +- third_party/aom/av1/encoder/grain_test_vectors.h | 781 + third_party/aom/av1/encoder/hash.c | 62 +- third_party/aom/av1/encoder/hash.h | 14 +- third_party/aom/av1/encoder/hash_motion.c | 207 +- third_party/aom/av1/encoder/hash_motion.h | 10 +- third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 609 +- third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 2 +- third_party/aom/av1/encoder/k_means_template.h | 70 +- third_party/aom/av1/encoder/laplace_encoder.c | 107 - third_party/aom/av1/encoder/lookahead.c | 29 +- third_party/aom/av1/encoder/lookahead.h | 16 +- third_party/aom/av1/encoder/mbgraph.c | 58 +- third_party/aom/av1/encoder/mbgraph.h | 6 +- third_party/aom/av1/encoder/mcomp.c | 1235 +- 
third_party/aom/av1/encoder/mcomp.h | 56 +- third_party/aom/av1/encoder/mips/msa/error_msa.c | 3 +- .../aom/av1/encoder/mips/msa/fdct16x16_msa.c | 436 - third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c | 52 - third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c | 65 - third_party/aom/av1/encoder/mips/msa/fdct_msa.h | 117 - .../aom/av1/encoder/mips/msa/temporal_filter_msa.c | 3 +- third_party/aom/av1/encoder/ml.c | 57 + third_party/aom/av1/encoder/ml.h | 44 + third_party/aom/av1/encoder/palette.c | 28 +- third_party/aom/av1/encoder/palette.h | 27 +- third_party/aom/av1/encoder/pickcdef.c | 97 +- third_party/aom/av1/encoder/picklpf.c | 388 +- third_party/aom/av1/encoder/pickrst.c | 1499 +- third_party/aom/av1/encoder/pickrst.h | 3 +- third_party/aom/av1/encoder/pustats.h | 229 + third_party/aom/av1/encoder/pvq_encoder.c | 988 -- third_party/aom/av1/encoder/pvq_encoder.h | 53 - third_party/aom/av1/encoder/ransac.c | 313 - third_party/aom/av1/encoder/ransac.h | 9 - third_party/aom/av1/encoder/ratectrl.c | 161 +- third_party/aom/av1/encoder/ratectrl.h | 25 +- third_party/aom/av1/encoder/ratectrl_xiph.c | 1244 -- third_party/aom/av1/encoder/ratectrl_xiph.h | 200 - third_party/aom/av1/encoder/rd.c | 1215 +- third_party/aom/av1/encoder/rd.h | 235 +- third_party/aom/av1/encoder/rdopt.c | 15517 ++++++++----------- third_party/aom/av1/encoder/rdopt.h | 126 +- third_party/aom/av1/encoder/segmentation.c | 255 +- third_party/aom/av1/encoder/segmentation.h | 13 - third_party/aom/av1/encoder/speed_features.c | 354 +- third_party/aom/av1/encoder/speed_features.h | 253 +- third_party/aom/av1/encoder/subexp.c | 164 - third_party/aom/av1/encoder/subexp.h | 42 - third_party/aom/av1/encoder/temporal_filter.c | 332 +- third_party/aom/av1/encoder/temporal_filter.h | 6 +- third_party/aom/av1/encoder/tokenize.c | 820 +- third_party/aom/av1/encoder/tokenize.h | 102 +- third_party/aom/av1/encoder/treewriter.c | 59 - third_party/aom/av1/encoder/treewriter.h | 42 - 
.../aom/av1/encoder/tx_prune_model_weights.h | 2086 +++ .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1205 ++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 306 + .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2889 ++++ .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 117 + .../aom/av1/encoder/x86/av1_highbd_quantize_avx2.c | 84 +- .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 103 +- .../aom/av1/encoder/x86/av1_quantize_avx2.c | 234 +- .../aom/av1/encoder/x86/av1_quantize_sse2.c | 273 +- .../aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 3 + third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 141 + .../aom/av1/encoder/x86/corner_match_sse4.c | 3 +- third_party/aom/av1/encoder/x86/dct_intrin_sse2.c | 3483 ----- third_party/aom/av1/encoder/x86/dct_sse2.asm | 5 - third_party/aom/av1/encoder/x86/encodetxb_sse2.c | 505 + third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 80 + .../aom/av1/encoder/x86/error_intrin_avx2.c | 3 +- third_party/aom/av1/encoder/x86/error_sse2.asm | 46 - third_party/aom/av1/encoder/x86/hash_sse42.c | 51 + .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 1276 +- .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 1627 -- .../av1/encoder/x86/temporal_filter_apply_sse2.asm | 2 + third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 2 +- 335 files changed, 82479 insertions(+), 133716 deletions(-) delete mode 100644 third_party/aom/av1/av1_common.mk delete mode 100644 third_party/aom/av1/av1_cx.mk delete mode 100644 third_party/aom/av1/av1_dx.mk create mode 100644 third_party/aom/av1/common/arm/av1_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_hmask_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_vmask_neon.c create mode 100644 third_party/aom/av1/common/arm/cfl_neon.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/intrapred_neon.c create mode 100644 
third_party/aom/av1/common/arm/jnt_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/mem_neon.h delete mode 100644 third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c delete mode 100644 third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c create mode 100644 third_party/aom/av1/common/arm/reconinter_neon.c create mode 100644 third_party/aom/av1/common/arm/transpose_neon.h create mode 100644 third_party/aom/av1/common/arm/wiener_convolve_neon.c delete mode 100644 third_party/aom/av1/common/av1_fwd_txfm1d.c delete mode 100644 third_party/aom/av1/common/av1_fwd_txfm1d.h delete mode 100644 third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h delete mode 100644 third_party/aom/av1/common/av1_fwd_txfm2d.c create mode 100644 third_party/aom/av1/common/av1_txfm.c delete mode 100644 third_party/aom/av1/common/clpf.c delete mode 100644 third_party/aom/av1/common/clpf_neon.c delete mode 100644 third_party/aom/av1/common/clpf_simd.h delete mode 100644 third_party/aom/av1/common/clpf_sse2.c delete mode 100644 third_party/aom/av1/common/clpf_sse4.c delete mode 100644 third_party/aom/av1/common/clpf_ssse3.c delete mode 100644 third_party/aom/av1/common/daala_tx.c delete mode 100644 third_party/aom/av1/common/daala_tx.h delete mode 100644 third_party/aom/av1/common/generic_code.c delete mode 100644 third_party/aom/av1/common/generic_code.h delete mode 100644 third_party/aom/av1/common/laplace_tables.c delete mode 100644 third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c delete mode 100644 third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c delete mode 100644 third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c delete mode 100644 third_party/aom/av1/common/ncobmc_kernels.c delete mode 100644 third_party/aom/av1/common/ncobmc_kernels.h delete mode 100644 third_party/aom/av1/common/partition.c delete mode 100644 third_party/aom/av1/common/partition.h create mode 100644 third_party/aom/av1/common/ppc/cfl_ppc.c delete mode 100644 third_party/aom/av1/common/pvq.c 
delete mode 100644 third_party/aom/av1/common/pvq.h delete mode 100644 third_party/aom/av1/common/pvq_state.c delete mode 100644 third_party/aom/av1/common/pvq_state.h create mode 100644 third_party/aom/av1/common/timing.c create mode 100644 third_party/aom/av1/common/timing.h create mode 100644 third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c delete mode 100644 third_party/aom/av1/common/x86/av1_convolve_ssse3.c delete mode 100644 third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c delete mode 100644 third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h delete mode 100644 third_party/aom/av1/common/x86/av1_txfm1d_sse4.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse2.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.h create mode 100644 third_party/aom/av1/common/x86/cfl_avx2.c create mode 100644 third_party/aom/av1/common/x86/cfl_simd.h create mode 100644 third_party/aom/av1/common/x86/cfl_sse2.c create mode 100644 third_party/aom/av1/common/x86/cfl_ssse3.c create mode 100644 third_party/aom/av1/common/x86/convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c delete mode 100644 
third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c delete mode 100644 third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c delete mode 100644 third_party/aom/av1/common/x86/idct_intrin_sse2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_ssse3.c delete mode 100644 third_party/aom/av1/common/x86/pvq_sse4.c delete mode 100644 third_party/aom/av1/common/x86/pvq_sse4.h create mode 100644 third_party/aom/av1/common/x86/reconinter_avx2.c create mode 100644 third_party/aom/av1/common/x86/reconinter_sse4.c create mode 100644 third_party/aom/av1/common/x86/reconinter_ssse3.c create mode 100644 third_party/aom/av1/common/x86/selfguided_avx2.c delete mode 100644 third_party/aom/av1/common/x86/warp_plane_sse2.c create mode 100644 third_party/aom/av1/common/x86/warp_plane_sse4.c delete mode 100644 third_party/aom/av1/common/x86/warp_plane_ssse3.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_sse2.c delete mode 100644 third_party/aom/av1/common/zigzag.h delete mode 100644 third_party/aom/av1/common/zigzag16.c delete mode 100644 third_party/aom/av1/common/zigzag32.c delete mode 100644 third_party/aom/av1/common/zigzag4.c delete mode 100644 third_party/aom/av1/common/zigzag8.c delete mode 100644 third_party/aom/av1/decoder/decint.h delete mode 100644 third_party/aom/av1/decoder/dsubexp.c delete mode 100644 third_party/aom/av1/decoder/dsubexp.h delete mode 100644 third_party/aom/av1/decoder/generic_decoder.c delete mode 100644 third_party/aom/av1/decoder/laplace_decoder.c create mode 100644 third_party/aom/av1/decoder/obu.c create mode 100644 
third_party/aom/av1/decoder/obu.h delete mode 100644 third_party/aom/av1/decoder/pvq_decoder.c delete mode 100644 third_party/aom/av1/decoder/pvq_decoder.h delete mode 100644 third_party/aom/av1/decoder/symbolrate.h create mode 100644 third_party/aom/av1/encoder/ab_partition_model_weights.h delete mode 100644 third_party/aom/av1/encoder/arm/neon/error_neon.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm2d.c delete mode 100644 third_party/aom/av1/encoder/bgsprite.c delete mode 100644 third_party/aom/av1/encoder/bgsprite.h delete mode 100644 third_party/aom/av1/encoder/daala_compat_enc.c delete mode 100644 third_party/aom/av1/encoder/dct.c create mode 100644 third_party/aom/av1/encoder/dwt.c create mode 100644 third_party/aom/av1/encoder/dwt.h delete mode 100644 third_party/aom/av1/encoder/encint.h delete mode 100644 third_party/aom/av1/encoder/generic_encoder.c create mode 100644 third_party/aom/av1/encoder/grain_test_vectors.h delete mode 100644 third_party/aom/av1/encoder/laplace_encoder.c delete mode 100644 third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c delete mode 100644 third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c delete mode 100644 third_party/aom/av1/encoder/mips/msa/fdct_msa.h create mode 100644 third_party/aom/av1/encoder/ml.c create mode 100644 third_party/aom/av1/encoder/ml.h create mode 100644 third_party/aom/av1/encoder/pustats.h delete mode 100644 third_party/aom/av1/encoder/pvq_encoder.c delete mode 100644 third_party/aom/av1/encoder/pvq_encoder.h delete mode 100644 third_party/aom/av1/encoder/subexp.c delete mode 100644 third_party/aom/av1/encoder/subexp.h delete mode 100644 third_party/aom/av1/encoder/treewriter.c delete mode 100644 third_party/aom/av1/encoder/treewriter.h create mode 100644 
third_party/aom/av1/encoder/tx_prune_model_weights.h create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h delete mode 100644 third_party/aom/av1/encoder/x86/dct_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/hash_sse42.c delete mode 100644 third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c (limited to 'third_party/aom/av1') diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 140eec815..1c7f937e1 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -1,530 +1,326 @@ -## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## -if (NOT AOM_AV1_AV1_CMAKE_) +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AV1_AV1_CMAKE_) + return() +endif() # AOM_AV1_AV1_CMAKE_ set(AOM_AV1_AV1_CMAKE_ 1) -set(AOM_AV1_COMMON_SOURCES - "${AOM_ROOT}/av1/av1_iface_common.h" - "${AOM_ROOT}/av1/common/alloccommon.c" - "${AOM_ROOT}/av1/common/alloccommon.h" - # TODO(tomfinegan): Foward transform belongs in encoder. - "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.c" - "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.h" - "${AOM_ROOT}/av1/common/av1_fwd_txfm2d.c" - "${AOM_ROOT}/av1/common/av1_fwd_txfm1d_cfg.h" - "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" - "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" - "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" - "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" - "${AOM_ROOT}/av1/common/av1_loopfilter.c" - "${AOM_ROOT}/av1/common/av1_loopfilter.h" - "${AOM_ROOT}/av1/common/av1_txfm.h" - "${AOM_ROOT}/av1/common/blockd.c" - "${AOM_ROOT}/av1/common/blockd.h" - "${AOM_ROOT}/av1/common/common.h" - "${AOM_ROOT}/av1/common/common_data.h" - "${AOM_ROOT}/av1/common/convolve.c" - "${AOM_ROOT}/av1/common/convolve.h" - "${AOM_ROOT}/av1/common/daala_tx.c" - "${AOM_ROOT}/av1/common/daala_tx.h" - "${AOM_ROOT}/av1/common/debugmodes.c" - "${AOM_ROOT}/av1/common/entropy.c" - "${AOM_ROOT}/av1/common/entropy.h" - "${AOM_ROOT}/av1/common/entropymode.c" - "${AOM_ROOT}/av1/common/entropymode.h" - "${AOM_ROOT}/av1/common/entropymv.c" - "${AOM_ROOT}/av1/common/entropymv.h" - "${AOM_ROOT}/av1/common/enums.h" - "${AOM_ROOT}/av1/common/filter.c" - "${AOM_ROOT}/av1/common/filter.h" - "${AOM_ROOT}/av1/common/frame_buffers.c" - "${AOM_ROOT}/av1/common/frame_buffers.h" - "${AOM_ROOT}/av1/common/idct.c" - "${AOM_ROOT}/av1/common/idct.h" - "${AOM_ROOT}/av1/common/mv.h" - "${AOM_ROOT}/av1/common/mvref_common.c" - "${AOM_ROOT}/av1/common/mvref_common.h" - "${AOM_ROOT}/av1/common/odintrin.c" - "${AOM_ROOT}/av1/common/odintrin.h" - 
"${AOM_ROOT}/av1/common/onyxc_int.h" - "${AOM_ROOT}/av1/common/pred_common.c" - "${AOM_ROOT}/av1/common/pred_common.h" - "${AOM_ROOT}/av1/common/quant_common.c" - "${AOM_ROOT}/av1/common/quant_common.h" - "${AOM_ROOT}/av1/common/reconinter.c" - "${AOM_ROOT}/av1/common/reconinter.h" - "${AOM_ROOT}/av1/common/reconintra.c" - "${AOM_ROOT}/av1/common/reconintra.h" - "${AOM_ROOT}/av1/common/resize.c" - "${AOM_ROOT}/av1/common/resize.h" - "${AOM_ROOT}/av1/common/scale.c" - "${AOM_ROOT}/av1/common/scale.h" - "${AOM_ROOT}/av1/common/scan.c" - "${AOM_ROOT}/av1/common/scan.h" - "${AOM_ROOT}/av1/common/seg_common.c" - "${AOM_ROOT}/av1/common/seg_common.h" - "${AOM_ROOT}/av1/common/thread_common.c" - "${AOM_ROOT}/av1/common/thread_common.h" - "${AOM_ROOT}/av1/common/tile_common.c" - "${AOM_ROOT}/av1/common/tile_common.h") - -set(AOM_AV1_DECODER_SOURCES - "${AOM_ROOT}/av1/av1_dx_iface.c" - "${AOM_ROOT}/av1/decoder/decodeframe.c" - "${AOM_ROOT}/av1/decoder/decodeframe.h" - "${AOM_ROOT}/av1/decoder/decodemv.c" - "${AOM_ROOT}/av1/decoder/decodemv.h" - "${AOM_ROOT}/av1/decoder/decoder.c" - "${AOM_ROOT}/av1/decoder/decoder.h" - "${AOM_ROOT}/av1/decoder/detokenize.c" - "${AOM_ROOT}/av1/decoder/detokenize.h" - "${AOM_ROOT}/av1/decoder/dsubexp.c" - "${AOM_ROOT}/av1/decoder/dsubexp.h" - "${AOM_ROOT}/av1/decoder/dthread.c" - "${AOM_ROOT}/av1/decoder/dthread.h" - "${AOM_ROOT}/av1/decoder/symbolrate.h") - -set(AOM_AV1_ENCODER_SOURCES - "${AOM_ROOT}/av1/av1_cx_iface.c" - "${AOM_ROOT}/av1/encoder/aq_complexity.c" - "${AOM_ROOT}/av1/encoder/aq_complexity.h" - "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" - "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" - "${AOM_ROOT}/av1/encoder/aq_variance.c" - "${AOM_ROOT}/av1/encoder/aq_variance.h" - "${AOM_ROOT}/av1/encoder/av1_quantize.c" - "${AOM_ROOT}/av1/encoder/av1_quantize.h" - "${AOM_ROOT}/av1/encoder/bitstream.c" - "${AOM_ROOT}/av1/encoder/bitstream.h" - "${AOM_ROOT}/av1/encoder/block.h" - "${AOM_ROOT}/av1/encoder/context_tree.c" - 
"${AOM_ROOT}/av1/encoder/context_tree.h" - "${AOM_ROOT}/av1/encoder/cost.c" - "${AOM_ROOT}/av1/encoder/cost.h" - "${AOM_ROOT}/av1/encoder/dct.c" - "${AOM_ROOT}/av1/encoder/encodeframe.c" - "${AOM_ROOT}/av1/encoder/encodeframe.h" - "${AOM_ROOT}/av1/encoder/encodemb.c" - "${AOM_ROOT}/av1/encoder/encodemb.h" - "${AOM_ROOT}/av1/encoder/encodemv.c" - "${AOM_ROOT}/av1/encoder/encodemv.h" - "${AOM_ROOT}/av1/encoder/encoder.c" - "${AOM_ROOT}/av1/encoder/encoder.h" - "${AOM_ROOT}/av1/encoder/ethread.c" - "${AOM_ROOT}/av1/encoder/ethread.h" - "${AOM_ROOT}/av1/encoder/extend.c" - "${AOM_ROOT}/av1/encoder/extend.h" - "${AOM_ROOT}/av1/encoder/firstpass.c" - "${AOM_ROOT}/av1/encoder/firstpass.h" - "${AOM_ROOT}/av1/encoder/hash.c" - "${AOM_ROOT}/av1/encoder/hash.h" - "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" - "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" - "${AOM_ROOT}/av1/encoder/lookahead.c" - "${AOM_ROOT}/av1/encoder/lookahead.h" - "${AOM_ROOT}/av1/encoder/mbgraph.c" - "${AOM_ROOT}/av1/encoder/mbgraph.h" - "${AOM_ROOT}/av1/encoder/mcomp.c" - "${AOM_ROOT}/av1/encoder/mcomp.h" - "${AOM_ROOT}/av1/encoder/palette.c" - "${AOM_ROOT}/av1/encoder/palette.h" - "${AOM_ROOT}/av1/encoder/picklpf.c" - "${AOM_ROOT}/av1/encoder/picklpf.h" - "${AOM_ROOT}/av1/encoder/ratectrl.c" - "${AOM_ROOT}/av1/encoder/ratectrl.h" - "${AOM_ROOT}/av1/encoder/rd.c" - "${AOM_ROOT}/av1/encoder/rd.h" - "${AOM_ROOT}/av1/encoder/rdopt.c" - "${AOM_ROOT}/av1/encoder/rdopt.h" - "${AOM_ROOT}/av1/encoder/segmentation.c" - "${AOM_ROOT}/av1/encoder/segmentation.h" - "${AOM_ROOT}/av1/encoder/speed_features.c" - "${AOM_ROOT}/av1/encoder/speed_features.h" - "${AOM_ROOT}/av1/encoder/subexp.c" - "${AOM_ROOT}/av1/encoder/subexp.h" - "${AOM_ROOT}/av1/encoder/temporal_filter.c" - "${AOM_ROOT}/av1/encoder/temporal_filter.h" - "${AOM_ROOT}/av1/encoder/tokenize.c" - "${AOM_ROOT}/av1/encoder/tokenize.h" - "${AOM_ROOT}/av1/encoder/treewriter.c" - "${AOM_ROOT}/av1/encoder/treewriter.h") - -set(AOM_AV1_COMMON_INTRIN_SSE2 - 
"${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c") - -set(AOM_AV1_COMMON_INTRIN_SSSE3 - "${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c") - -set(AOM_AV1_COMMON_INTRIN_SSE4_1 - "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm1d_sse4.c" - "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm2d_sse4.c" - "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c") - -set(AOM_AV1_COMMON_INTRIN_AVX2 - "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" - "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c") - -set(AOM_AV1_COMMON_INTRIN_MSA - "${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c" - "${AOM_ROOT}/av1/common/mips/msa/av1_idct4x4_msa.c" - "${AOM_ROOT}/av1/common/mips/msa/av1_idct8x8_msa.c") - -set(AOM_AV1_ENCODER_ASM_SSE2 - "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" - "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm" - "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm") - -set(AOM_AV1_ENCODER_INTRIN_SSE2 - "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c" - "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" - "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c") - -set(AOM_AV1_ENCODER_ASM_SSSE3_X86_64 - "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") - -set(AOM_AV1_ENCODER_INTRIN_SSE4_1 - ${AOM_AV1_ENCODER_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c" - "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c") - -set(AOM_AV1_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" - "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" - "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" - "${AOM_ROOT}/av1/encoder/x86/hybrid_fwd_txfm_avx2.c") - -set(AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c") - -set(AOM_AV1_ENCODER_INTRIN_MSA - "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c" - "${AOM_ROOT}/av1/encoder/mips/msa/fdct16x16_msa.c" - "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c" - "${AOM_ROOT}/av1/encoder/mips/msa/fdct8x8_msa.c" - "${AOM_ROOT}/av1/encoder/mips/msa/fdct_msa.h" - 
"${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c") - -if (CONFIG_HIGHBITDEPTH) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c") -else () - set(AOM_AV1_COMMON_INTRIN_NEON - ${AOM_AV1_COMMON_INTRIN_NEON} - "${AOM_ROOT}/av1/common/arm/neon/iht4x4_add_neon.c" - "${AOM_ROOT}/av1/common/arm/neon/iht8x8_add_neon.c") - - set(AOM_AV1_ENCODER_INTRIN_NEON - ${AOM_AV1_ENCODER_INTRIN_NEON} - "${AOM_ROOT}/av1/encoder/arm/neon/error_neon.c") -endif () - -if (CONFIG_CDEF) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/cdef.c" - "${AOM_ROOT}/av1/common/cdef.h" - "${AOM_ROOT}/av1/common/cdef_block.c" - "${AOM_ROOT}/av1/common/cdef_block.h") - - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/pickcdef.c") - - set(AOM_AV1_COMMON_INTRIN_SSE2 - ${AOM_AV1_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/av1/common/cdef_block_sse2.c") - - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/cdef_block_ssse3.c") - - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/cdef_block_sse4.c") - - set(AOM_AV1_COMMON_INTRIN_AVX2 - ${AOM_AV1_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/av1/common/cdef_block_avx2.c") - - set(AOM_AV1_COMMON_INTRIN_NEON - ${AOM_AV1_COMMON_INTRIN_NEON} - "${AOM_ROOT}/av1/common/cdef_block_neon.c") - - if (NOT CONFIG_CDEF_SINGLEPASS) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/clpf.c" - "${AOM_ROOT}/av1/common/clpf_simd.h" - "${AOM_ROOT}/av1/common/cdef_block_simd.h") - - set(AOM_AV1_COMMON_INTRIN_SSE2 - ${AOM_AV1_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/av1/common/clpf_sse2.c") - - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/clpf_ssse3.c") - - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/clpf_sse4.c") - - 
set(AOM_AV1_COMMON_INTRIN_NEON - ${AOM_AV1_COMMON_INTRIN_NEON} - "${AOM_ROOT}/av1/common/clpf_neon.c") - endif () -endif () - -if (CONFIG_CONVOLVE_ROUND) - set(AOM_AV1_COMMON_INTRIN_SSE2 - ${AOM_AV1_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c") - if (CONFIG_HIGHBITDEPTH) - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c") - endif () - - if(NOT CONFIG_COMPOUND_ROUND) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c") - endif() - - set(AOM_AV1_COMMON_INTRIN_AVX2 - ${AOM_AV1_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/av1/common/x86/convolve_avx2.c") -endif () - - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/wedge_utils.c") - - set(AOM_AV1_ENCODER_INTRIN_SSE2 - ${AOM_AV1_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") - -if (CONFIG_FILTER_INTRA) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c") -endif () - -if (CONFIG_ACCOUNTING) - set(AOM_AV1_DECODER_SOURCES - ${AOM_AV1_DECODER_SOURCES} - "${AOM_ROOT}/av1/decoder/accounting.c" - "${AOM_ROOT}/av1/decoder/accounting.h") -endif () - -if (CONFIG_BGSPRITE) - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/bgsprite.c" - "${AOM_ROOT}/av1/encoder/bgsprite.h") -endif () - -if (CONFIG_GLOBAL_MOTION) - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/corner_detect.c" - "${AOM_ROOT}/av1/encoder/corner_detect.h" - "${AOM_ROOT}/av1/encoder/corner_match.c" - "${AOM_ROOT}/av1/encoder/corner_match.h" - "${AOM_ROOT}/av1/encoder/global_motion.c" - "${AOM_ROOT}/av1/encoder/global_motion.h" - "${AOM_ROOT}/av1/encoder/ransac.c" - "${AOM_ROOT}/av1/encoder/ransac.h" - "${AOM_ROOT}/third_party/fastfeat/fast_9.c" - "${AOM_ROOT}/third_party/fastfeat/fast.c" - 
"${AOM_ROOT}/third_party/fastfeat/fast.h" - "${AOM_ROOT}/third_party/fastfeat/nonmax.c") - - set(AOM_AV1_ENCODER_INTRIN_SSE4_1 - ${AOM_AV1_ENCODER_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c") -endif () - -if (CONFIG_INSPECTION) - set(AOM_AV1_DECODER_SOURCES - ${AOM_AV1_DECODER_SOURCES} - "${AOM_ROOT}/av1/decoder/inspection.c" - "${AOM_ROOT}/av1/decoder/inspection.h") -endif () - -if (CONFIG_INTERNAL_STATS) - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/blockiness.c") -endif () - -if (CONFIG_LV_MAP) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/txb_common.c" - "${AOM_ROOT}/av1/common/txb_common.h") - - set(AOM_AV1_DECODER_SOURCES - ${AOM_AV1_DECODER_SOURCES} - "${AOM_ROOT}/av1/decoder/decodetxb.c" - "${AOM_ROOT}/av1/decoder/decodetxb.h") - - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/encodetxb.c" - "${AOM_ROOT}/av1/encoder/encodetxb.h") -endif () - -if (CONFIG_CFL) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/cfl.c" - "${AOM_ROOT}/av1/common/cfl.h") -endif () - -if (CONFIG_LOOP_RESTORATION) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/restoration.c" - "${AOM_ROOT}/av1/common/restoration.h") - - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c") - - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/pickrst.c" - "${AOM_ROOT}/av1/encoder/pickrst.h") -endif () - -if (CONFIG_INTRA_EDGE) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c") -endif () - -if (CONFIG_NCOBMC_ADAPT_WEIGHT) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/ncobmc_kernels.c" - "${AOM_ROOT}/av1/common/ncobmc_kernels.h") -endif () - -if (CONFIG_PVQ) - set(AOM_AV1_COMMON_SOURCES - 
${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/laplace_tables.c" - "${AOM_ROOT}/av1/common/pvq.c" - "${AOM_ROOT}/av1/common/pvq.h" - "${AOM_ROOT}/av1/common/pvq_state.c" - "${AOM_ROOT}/av1/common/pvq_state.h" - "${AOM_ROOT}/av1/common/partition.c" - "${AOM_ROOT}/av1/common/partition.h" - "${AOM_ROOT}/av1/common/generic_code.c" - "${AOM_ROOT}/av1/common/generic_code.h" - "${AOM_ROOT}/av1/common/zigzag4.c" - "${AOM_ROOT}/av1/common/zigzag8.c" - "${AOM_ROOT}/av1/common/zigzag16.c" - "${AOM_ROOT}/av1/common/zigzag32.c") - - set(AOM_AV1_DECODER_SOURCES - ${AOM_AV1_DECODER_SOURCES} - "${AOM_ROOT}/av1/decoder/decint.h" - "${AOM_ROOT}/av1/decoder/pvq_decoder.c" - "${AOM_ROOT}/av1/decoder/pvq_decoder.h" - "${AOM_ROOT}/av1/decoder/generic_decoder.c" - "${AOM_ROOT}/av1/decoder/laplace_decoder.c") - - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/daala_compat_enc.c" - "${AOM_ROOT}/av1/encoder/encint.h" - "${AOM_ROOT}/av1/encoder/pvq_encoder.c" - "${AOM_ROOT}/av1/encoder/pvq_encoder.h" - "${AOM_ROOT}/av1/encoder/generic_encoder.c" - "${AOM_ROOT}/av1/encoder/laplace_encoder.c") - - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/pvq_sse4.c" - "${AOM_ROOT}/av1/common/x86/pvq_sse4.h") - - if (NOT CONFIG_AV1_ENCODER) - # TODO(tomfinegan): These should probably be in av1/common, and in a - # common source list. For now this mirrors the original build system. 
- set(AOM_AV1_DECODER_SOURCES - ${AOM_AV1_DECODER_SOURCES} - "${AOM_ROOT}/av1/encoder/dct.c" - "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" - "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h") - - set(AOM_AV1_DECODER_ASM_SSE2 - ${AOM_AV1_DECODER_ASM_SSE2} - "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm") - - set(AOM_AV1_DECODER_INTRIN_SSE2 - ${AOM_AV1_DECODER_INTRIN_SSE2} - "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c") - - endif () -endif () - -if (CONFIG_WARPED_MOTION OR CONFIG_GLOBAL_MOTION) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/warped_motion.c" - "${AOM_ROOT}/av1/common/warped_motion.h") - - set(AOM_AV1_COMMON_INTRIN_SSE2 - ${AOM_AV1_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c") - - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/x86/warp_plane_ssse3.c") - - if (CONFIG_HIGHBITDEPTH) - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_ssse3.c") - endif () -endif () - -if (CONFIG_HASH_ME) - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/hash_motion.h" - "${AOM_ROOT}/av1/encoder/hash_motion.c" - "${AOM_ROOT}/third_party/vector/vector.h" - "${AOM_ROOT}/third_party/vector/vector.c") -endif () - -if (CONFIG_Q_ADAPT_PROBS) - set(AOM_AV1_COMMON_SOURCES - ${AOM_AV1_COMMON_SOURCES} - "${AOM_ROOT}/av1/common/token_cdfs.h") -endif () - -if (CONFIG_XIPHRC) - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/ratectrl_xiph.c" - "${AOM_ROOT}/av1/encoder/ratectrl_xiph.h") -endif () +list(APPEND AOM_AV1_COMMON_SOURCES + "${AOM_ROOT}/av1/av1_iface_common.h" + "${AOM_ROOT}/av1/common/alloccommon.c" + "${AOM_ROOT}/av1/common/alloccommon.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" + 
"${AOM_ROOT}/av1/common/av1_loopfilter.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.h" + "${AOM_ROOT}/av1/common/av1_txfm.c" + "${AOM_ROOT}/av1/common/av1_txfm.h" + "${AOM_ROOT}/av1/common/blockd.c" + "${AOM_ROOT}/av1/common/blockd.h" + "${AOM_ROOT}/av1/common/cdef.c" + "${AOM_ROOT}/av1/common/cdef.h" + "${AOM_ROOT}/av1/common/cdef_block.c" + "${AOM_ROOT}/av1/common/cdef_block.h" + "${AOM_ROOT}/av1/common/cfl.c" + "${AOM_ROOT}/av1/common/cfl.h" + "${AOM_ROOT}/av1/common/common.h" + "${AOM_ROOT}/av1/common/common_data.h" + "${AOM_ROOT}/av1/common/convolve.c" + "${AOM_ROOT}/av1/common/convolve.h" + "${AOM_ROOT}/av1/common/debugmodes.c" + "${AOM_ROOT}/av1/common/entropy.c" + "${AOM_ROOT}/av1/common/entropy.h" + "${AOM_ROOT}/av1/common/entropymode.c" + "${AOM_ROOT}/av1/common/entropymode.h" + "${AOM_ROOT}/av1/common/entropymv.c" + "${AOM_ROOT}/av1/common/entropymv.h" + "${AOM_ROOT}/av1/common/enums.h" + "${AOM_ROOT}/av1/common/filter.c" + "${AOM_ROOT}/av1/common/filter.h" + "${AOM_ROOT}/av1/common/frame_buffers.c" + "${AOM_ROOT}/av1/common/frame_buffers.h" + "${AOM_ROOT}/av1/common/idct.c" + "${AOM_ROOT}/av1/common/idct.h" + "${AOM_ROOT}/av1/common/mv.h" + "${AOM_ROOT}/av1/common/mvref_common.c" + "${AOM_ROOT}/av1/common/mvref_common.h" + "${AOM_ROOT}/av1/common/odintrin.c" + "${AOM_ROOT}/av1/common/odintrin.h" + "${AOM_ROOT}/av1/common/onyxc_int.h" + "${AOM_ROOT}/av1/common/pred_common.c" + "${AOM_ROOT}/av1/common/pred_common.h" + "${AOM_ROOT}/av1/common/quant_common.c" + "${AOM_ROOT}/av1/common/quant_common.h" + "${AOM_ROOT}/av1/common/reconinter.c" + "${AOM_ROOT}/av1/common/reconinter.h" + "${AOM_ROOT}/av1/common/reconintra.c" + "${AOM_ROOT}/av1/common/reconintra.h" + "${AOM_ROOT}/av1/common/resize.c" + "${AOM_ROOT}/av1/common/resize.h" + "${AOM_ROOT}/av1/common/restoration.c" + "${AOM_ROOT}/av1/common/restoration.h" + "${AOM_ROOT}/av1/common/scale.c" + "${AOM_ROOT}/av1/common/scale.h" + "${AOM_ROOT}/av1/common/scan.c" + "${AOM_ROOT}/av1/common/scan.h" + 
"${AOM_ROOT}/av1/common/seg_common.c" + "${AOM_ROOT}/av1/common/seg_common.h" + "${AOM_ROOT}/av1/common/thread_common.c" + "${AOM_ROOT}/av1/common/thread_common.h" + "${AOM_ROOT}/av1/common/tile_common.c" + "${AOM_ROOT}/av1/common/tile_common.h" + "${AOM_ROOT}/av1/common/timing.h" + "${AOM_ROOT}/av1/common/timing.c" + "${AOM_ROOT}/av1/common/token_cdfs.h" + "${AOM_ROOT}/av1/common/txb_common.c" + "${AOM_ROOT}/av1/common/txb_common.h" + "${AOM_ROOT}/av1/common/warped_motion.c" + "${AOM_ROOT}/av1/common/warped_motion.h") + +list(APPEND AOM_AV1_DECODER_SOURCES + "${AOM_ROOT}/av1/av1_dx_iface.c" + "${AOM_ROOT}/av1/decoder/decodeframe.c" + "${AOM_ROOT}/av1/decoder/decodeframe.h" + "${AOM_ROOT}/av1/decoder/decodemv.c" + "${AOM_ROOT}/av1/decoder/decodemv.h" + "${AOM_ROOT}/av1/decoder/decoder.c" + "${AOM_ROOT}/av1/decoder/decoder.h" + "${AOM_ROOT}/av1/decoder/decodetxb.c" + "${AOM_ROOT}/av1/decoder/decodetxb.h" + "${AOM_ROOT}/av1/decoder/detokenize.c" + "${AOM_ROOT}/av1/decoder/detokenize.h" + "${AOM_ROOT}/av1/decoder/dthread.c" + "${AOM_ROOT}/av1/decoder/dthread.h" + "${AOM_ROOT}/av1/decoder/obu.h" + "${AOM_ROOT}/av1/decoder/obu.c") + +list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/av1_cx_iface.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.h" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" + "${AOM_ROOT}/av1/encoder/aq_variance.c" + "${AOM_ROOT}/av1/encoder/aq_variance.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.h" + "${AOM_ROOT}/av1/encoder/bitstream.c" + "${AOM_ROOT}/av1/encoder/bitstream.h" + "${AOM_ROOT}/av1/encoder/block.h" + "${AOM_ROOT}/av1/encoder/context_tree.c" + "${AOM_ROOT}/av1/encoder/context_tree.h" + "${AOM_ROOT}/av1/encoder/corner_detect.c" 
+ "${AOM_ROOT}/av1/encoder/corner_detect.h" + "${AOM_ROOT}/av1/encoder/corner_match.c" + "${AOM_ROOT}/av1/encoder/corner_match.h" + "${AOM_ROOT}/av1/encoder/cost.c" + "${AOM_ROOT}/av1/encoder/cost.h" + "${AOM_ROOT}/av1/encoder/encodeframe.c" + "${AOM_ROOT}/av1/encoder/encodeframe.h" + "${AOM_ROOT}/av1/encoder/encodemb.c" + "${AOM_ROOT}/av1/encoder/encodemb.h" + "${AOM_ROOT}/av1/encoder/encodemv.c" + "${AOM_ROOT}/av1/encoder/encodemv.h" + "${AOM_ROOT}/av1/encoder/encoder.c" + "${AOM_ROOT}/av1/encoder/encoder.h" + "${AOM_ROOT}/av1/encoder/encodetxb.c" + "${AOM_ROOT}/av1/encoder/encodetxb.h" + "${AOM_ROOT}/av1/encoder/ethread.c" + "${AOM_ROOT}/av1/encoder/ethread.h" + "${AOM_ROOT}/av1/encoder/extend.c" + "${AOM_ROOT}/av1/encoder/extend.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/global_motion.c" + "${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" + "${AOM_ROOT}/av1/encoder/hash.c" + "${AOM_ROOT}/av1/encoder/hash.h" + "${AOM_ROOT}/av1/encoder/hash_motion.c" + "${AOM_ROOT}/av1/encoder/hash_motion.h" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" + "${AOM_ROOT}/av1/encoder/lookahead.c" + "${AOM_ROOT}/av1/encoder/lookahead.h" + "${AOM_ROOT}/av1/encoder/mbgraph.c" + "${AOM_ROOT}/av1/encoder/mbgraph.h" + "${AOM_ROOT}/av1/encoder/mcomp.c" + "${AOM_ROOT}/av1/encoder/mcomp.h" + "${AOM_ROOT}/av1/encoder/ml.c" + "${AOM_ROOT}/av1/encoder/ml.h" + "${AOM_ROOT}/av1/encoder/palette.c" + "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/pickcdef.c" + "${AOM_ROOT}/av1/encoder/picklpf.c" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.c" + "${AOM_ROOT}/av1/encoder/pickrst.h" + "${AOM_ROOT}/av1/encoder/ransac.c" + "${AOM_ROOT}/av1/encoder/ransac.h" + "${AOM_ROOT}/av1/encoder/ratectrl.c" + "${AOM_ROOT}/av1/encoder/ratectrl.h" + "${AOM_ROOT}/av1/encoder/rd.c" + "${AOM_ROOT}/av1/encoder/rd.h" + 
"${AOM_ROOT}/av1/encoder/rdopt.c" + "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/segmentation.c" + "${AOM_ROOT}/av1/encoder/segmentation.h" + "${AOM_ROOT}/av1/encoder/speed_features.c" + "${AOM_ROOT}/av1/encoder/speed_features.h" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/tokenize.c" + "${AOM_ROOT}/av1/encoder/tokenize.h" + "${AOM_ROOT}/av1/encoder/wedge_utils.c" + "${AOM_ROOT}/third_party/fastfeat/fast.c" + "${AOM_ROOT}/third_party/fastfeat/fast.h" + "${AOM_ROOT}/third_party/fastfeat/fast_9.c" + "${AOM_ROOT}/third_party/fastfeat/nonmax.c" + "${AOM_ROOT}/third_party/vector/vector.c" + "${AOM_ROOT}/third_party/vector/vector.h" + "${AOM_ROOT}/av1/encoder/dwt.c" + "${AOM_ROOT}/av1/encoder/dwt.h") + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/cdef_block_sse2.c" + "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h") + +list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/cdef_block_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" + "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/cdef_block_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c" + 
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h" + "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c" + "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c" + "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c" + "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/cdef_block_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h" + "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" + "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") + 
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_MSA + "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/cfl_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.h" + "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/mem_neon.h" + "${AOM_ROOT}/av1/common/arm/transpose_neon.h" + "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c" + "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" + "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" + "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/intrapred_neon.c" + "${AOM_ROOT}/av1/common/cdef_block_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 + "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") + +if(CONFIG_ACCOUNTING) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c" + "${AOM_ROOT}/av1/decoder/accounting.h") +endif() + +if(CONFIG_INSPECTION) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c" + 
"${AOM_ROOT}/av1/decoder/inspection.h") +endif() + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c") +endif() # Setup AV1 common/decoder/encoder targets. The libaom target must exist before # this function is called. -function (setup_av1_targets) +function(setup_av1_targets) add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_av1_common) @@ -537,104 +333,117 @@ function (setup_av1_targets) # dummy source file to the aom_av1 target. add_dummy_source_file_to_target("aom_av1" "c") - if (CONFIG_AV1_DECODER) + if(CONFIG_AV1_DECODER) add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder) target_sources(aom PRIVATE $) - endif () + endif() - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder) target_sources(aom PRIVATE $) - endif () + endif() - if (HAVE_SSE2) + if(HAVE_SSE2) require_compiler_flag_nomsvc("-msse2" NO) add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSE2" "aom") - if (CONFIG_AV1_DECODER) - if (AOM_AV1_DECODER_ASM_SSE2) + if(CONFIG_AV1_DECODER) + if(AOM_AV1_DECODER_ASM_SSE2) add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom") - endif () + endif() - if (AOM_AV1_DECODER_INTRIN_SSE2) + if(AOM_AV1_DECODER_INTRIN_SSE2) add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder" "AOM_AV1_DECODER_INTRIN_SSE2" "aom") - endif () - endif () + endif() + endif() - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE2" "aom") - endif () - endif () + endif() + endif() - if (HAVE_SSSE3) + if(HAVE_SSSE3) require_compiler_flag_nomsvc("-mssse3" NO) add_intrinsics_object_library("-mssse3" 
"ssse3" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSSE3" "aom") - if (CONFIG_AV1_DECODER) - if (AOM_AV1_DECODER_INTRIN_SSSE3) + if(CONFIG_AV1_DECODER) + if(AOM_AV1_DECODER_INTRIN_SSSE3) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder" "AOM_AV1_DECODER_INTRIN_SSSE3" "aom") - endif () - endif () - endif () + endif() + endif() + endif() - if (HAVE_SSE4_1) + if(HAVE_SSE4_1) require_compiler_flag_nomsvc("-msse4.1" NO) add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSE4_1" "aom") - if (CONFIG_AV1_ENCODER) - if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") add_asm_library("aom_av1_encoder_ssse3" "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom") - endif () + endif() - if (AOM_AV1_ENCODER_INTRIN_SSE4_1) + if(AOM_AV1_ENCODER_INTRIN_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE4_1" "aom") - endif () - endif () - endif () + endif() + endif() + endif() - if (HAVE_AVX2) + if(HAVE_SSE4_2) + require_compiler_flag_nomsvc("-msse4.2" NO) + if(CONFIG_AV1_ENCODER) + if(AOM_AV1_ENCODER_INTRIN_SSE4_2) + add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom") + endif() + endif() + endif() + + if(HAVE_AVX2) require_compiler_flag_nomsvc("-mavx2" NO) add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_AVX2" "aom") - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_AVX2" "aom") - endif () - endif () + endif() + endif() - if (HAVE_NEON) - if (AOM_AV1_COMMON_INTRIN_NEON) - add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" - "neon" + if(HAVE_NEON) + if(AOM_AV1_COMMON_INTRIN_NEON) + add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON" "aom") - endif () + endif() - if (AOM_AV1_ENCODER_INTRIN_NEON) - 
add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" - "neon" + if(AOM_AV1_ENCODER_INTRIN_NEON) + add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_NEON" "aom") - endif () - endif () + endif() + endif() - if (HAVE_MSA) - add_intrinsics_object_library("" "msa" "aom_av1_common" - "AOM_AV1_COMMON_INTRIN_MSA" "aom") + if(HAVE_VSX) + if(AOM_AV1_COMMON_INTRIN_VSX) + add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_VSX" "aom") + endif() + endif() + + if(HAVE_MSA) add_intrinsics_object_library("" "msa" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_MSA" "aom") - endif () + endif() target_sources(aom PRIVATE $) target_sources(aom PRIVATE $) @@ -642,9 +451,7 @@ function (setup_av1_targets) # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) -endfunction () - -function (setup_av1_test_targets) -endfunction () +endfunction() -endif () # AOM_AV1_AV1_CMAKE_ +function(setup_av1_test_targets) +endfunction() diff --git a/third_party/aom/av1/av1_common.mk b/third_party/aom/av1/av1_common.mk deleted file mode 100644 index 35466ac88..000000000 --- a/third_party/aom/av1/av1_common.mk +++ /dev/null @@ -1,205 +0,0 @@ -## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-## - -AV1_COMMON_SRCS-yes += av1_common.mk -AV1_COMMON_SRCS-yes += av1_iface_common.h -AV1_COMMON_SRCS-yes += common/alloccommon.c -AV1_COMMON_SRCS-yes += common/av1_loopfilter.c -AV1_COMMON_SRCS-yes += common/av1_loopfilter.h -AV1_COMMON_SRCS-yes += common/blockd.c -AV1_COMMON_SRCS-yes += common/debugmodes.c -AV1_COMMON_SRCS-yes += common/entropy.c -AV1_COMMON_SRCS-yes += common/entropymode.c -AV1_COMMON_SRCS-yes += common/entropymv.c -AV1_COMMON_SRCS-yes += common/frame_buffers.c -AV1_COMMON_SRCS-yes += common/frame_buffers.h -AV1_COMMON_SRCS-yes += common/alloccommon.h -AV1_COMMON_SRCS-yes += common/blockd.h -AV1_COMMON_SRCS-yes += common/common.h -AV1_COMMON_SRCS-yes += common/daala_tx.c -AV1_COMMON_SRCS-yes += common/daala_tx.h -AV1_COMMON_SRCS-yes += common/entropy.h -AV1_COMMON_SRCS-yes += common/entropymode.h -AV1_COMMON_SRCS-yes += common/entropymv.h -AV1_COMMON_SRCS-yes += common/enums.h -AV1_COMMON_SRCS-yes += common/filter.h -AV1_COMMON_SRCS-yes += common/filter.c -AV1_COMMON_SRCS-yes += common/idct.h -AV1_COMMON_SRCS-yes += common/idct.c -AV1_COMMON_SRCS-yes += common/thread_common.h -AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.h -AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.c -AV1_COMMON_SRCS-yes += common/mv.h -AV1_COMMON_SRCS-yes += common/onyxc_int.h -AV1_COMMON_SRCS-yes += common/pred_common.h -AV1_COMMON_SRCS-yes += common/pred_common.c -AV1_COMMON_SRCS-yes += common/quant_common.h -AV1_COMMON_SRCS-yes += common/reconinter.h -AV1_COMMON_SRCS-yes += common/reconintra.h -AV1_COMMON_SRCS-yes += common/av1_rtcd.c -AV1_COMMON_SRCS-yes += common/av1_rtcd_defs.pl -AV1_COMMON_SRCS-yes += common/scale.h -AV1_COMMON_SRCS-yes += common/scale.c -AV1_COMMON_SRCS-yes += common/seg_common.h -AV1_COMMON_SRCS-yes += common/seg_common.c -AV1_COMMON_SRCS-yes += common/tile_common.h -AV1_COMMON_SRCS-yes += common/tile_common.c -AV1_COMMON_SRCS-yes += common/thread_common.c -AV1_COMMON_SRCS-yes += common/mvref_common.c -AV1_COMMON_SRCS-yes += 
common/mvref_common.h -AV1_COMMON_SRCS-yes += common/quant_common.c -AV1_COMMON_SRCS-yes += common/reconinter.c -AV1_COMMON_SRCS-yes += common/reconintra.c -AV1_COMMON_SRCS-yes += common/resize.c -AV1_COMMON_SRCS-yes += common/resize.h -AV1_COMMON_SRCS-yes += common/common_data.h -AV1_COMMON_SRCS-yes += common/scan.c -AV1_COMMON_SRCS-yes += common/scan.h -# TODO(angiebird) the forward transform belongs under encoder/ -AV1_COMMON_SRCS-yes += common/av1_txfm.h -AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h -AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c -AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h -AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.c -AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c -AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d_cfg.h -AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c -AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h -AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c -ifeq ($(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND),yesx) -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_convolve_scale_sse4.c -endif -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c -endif -AV1_COMMON_SRCS-yes += common/convolve.c -AV1_COMMON_SRCS-yes += common/convolve.h -ifeq ($(CONFIG_LOOP_RESTORATION),yes) -AV1_COMMON_SRCS-yes += common/restoration.h -AV1_COMMON_SRCS-yes += common/restoration.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c -endif -ifeq ($(CONFIG_INTRA_EDGE),yes) -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/intra_edge_sse4.c -endif -ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes)) -AV1_COMMON_SRCS-yes += common/warped_motion.h -AV1_COMMON_SRCS-yes += common/warped_motion.c -endif -ifeq ($(CONFIG_CDEF),yes) -ifeq ($(CONFIG_CDEF_SINGLEPASS),yes) -AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cdef_block_avx2.c -else -AV1_COMMON_SRCS-yes += common/clpf.c -AV1_COMMON_SRCS-yes += 
common/clpf_simd.h -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c -AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c -endif -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cdef_block_sse2.c -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/cdef_block_ssse3.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/cdef_block_sse4.c -AV1_COMMON_SRCS-$(HAVE_NEON) += common/cdef_block_neon.c -AV1_COMMON_SRCS-yes += common/cdef_block.c -AV1_COMMON_SRCS-yes += common/cdef_block.h -AV1_COMMON_SRCS-yes += common/cdef_block_simd.h -AV1_COMMON_SRCS-yes += common/cdef.c -AV1_COMMON_SRCS-yes += common/cdef.h -endif -AV1_COMMON_SRCS-yes += common/odintrin.c -AV1_COMMON_SRCS-yes += common/odintrin.h - -ifeq ($(CONFIG_CFL),yes) -AV1_COMMON_SRCS-yes += common/cfl.h -AV1_COMMON_SRCS-yes += common/cfl.c -endif - -ifeq ($(CONFIG_MOTION_VAR),yes) -AV1_COMMON_SRCS-yes += common/obmc.h -endif - -ifeq ($(CONFIG_PVQ),yes) -# PVQ from daala -AV1_COMMON_SRCS-yes += common/pvq.c -AV1_COMMON_SRCS-yes += common/partition.c -AV1_COMMON_SRCS-yes += common/partition.h -AV1_COMMON_SRCS-yes += common/zigzag4.c -AV1_COMMON_SRCS-yes += common/zigzag8.c -AV1_COMMON_SRCS-yes += common/zigzag16.c -AV1_COMMON_SRCS-yes += common/zigzag32.c -AV1_COMMON_SRCS-yes += common/zigzag.h -AV1_COMMON_SRCS-yes += common/generic_code.c -AV1_COMMON_SRCS-yes += common/pvq_state.c -AV1_COMMON_SRCS-yes += common/laplace_tables.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.h -endif -ifneq ($(findstring yes,$(CONFIG_PVQ)$(CONFIG_DAALA_DIST)$(CONFIG_XIPHRC)),) -AV1_COMMON_SRCS-yes += common/pvq.h -AV1_COMMON_SRCS-yes += common/pvq_state.h -AV1_COMMON_SRCS-yes += common/generic_code.h -endif - -# common (msa) -AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct4x4_msa.c -AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct8x8_msa.c -AV1_COMMON_SRCS-$(HAVE_MSA) += 
common/mips/msa/av1_idct16x16_msa.c - -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c -AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c - -ifeq ($(CONFIG_AV1_ENCODER),yes) -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c -endif - -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c -AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_inv_txfm_avx2.c - -ifneq ($(CONFIG_HIGHBITDEPTH),yes) -AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c -AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c -endif - -ifeq ($(CONFIG_FILTER_INTRA),yes) -AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c -endif - -ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),) -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/warp_plane_ssse3.c -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_warp_plane_ssse3.c -endif -endif - -ifeq ($(CONFIG_CONVOLVE_ROUND),yes) -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c -endif -endif - - -ifeq ($(CONFIG_Q_ADAPT_PROBS),yes) -AV1_COMMON_SRCS-yes += common/token_cdfs.h -endif - -ifeq ($(CONFIG_NCOBMC_ADAPT_WEIGHT),yes) -AV1_COMMON_SRCS-yes += common/ncobmc_kernels.h -AV1_COMMON_SRCS-yes += common/ncobmc_kernels.c -endif - -$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl)) diff --git a/third_party/aom/av1/av1_cx.mk b/third_party/aom/av1/av1_cx.mk deleted file mode 100644 index 13f297403..000000000 --- a/third_party/aom/av1/av1_cx.mk +++ /dev/null @@ -1,176 +0,0 @@ -## -## Copyright (c) 2016, Alliance 
for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## - -AV1_CX_EXPORTS += exports_enc - -AV1_CX_SRCS-yes += $(AV1_COMMON_SRCS-yes) -AV1_CX_SRCS-no += $(AV1_COMMON_SRCS-no) -AV1_CX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes) -AV1_CX_SRCS_REMOVE-no += $(AV1_COMMON_SRCS_REMOVE-no) - -AV1_CX_SRCS-yes += av1_cx_iface.c - -AV1_CX_SRCS-yes += encoder/av1_quantize.c -AV1_CX_SRCS-yes += encoder/av1_quantize.h -AV1_CX_SRCS-yes += encoder/bitstream.c -AV1_CX_SRCS-$(CONFIG_BGSPRITE) += encoder/bgsprite.c -AV1_CX_SRCS-$(CONFIG_BGSPRITE) += encoder/bgsprite.h -AV1_CX_SRCS-yes += encoder/context_tree.c -AV1_CX_SRCS-yes += encoder/context_tree.h -AV1_CX_SRCS-yes += encoder/cost.h -AV1_CX_SRCS-yes += encoder/cost.c -AV1_CX_SRCS-yes += encoder/dct.c -AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.c -AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.h -AV1_CX_SRCS-yes += encoder/encodeframe.c -AV1_CX_SRCS-yes += encoder/encodeframe.h -AV1_CX_SRCS-yes += encoder/encodemb.c -AV1_CX_SRCS-yes += encoder/encodemv.c -AV1_CX_SRCS-yes += encoder/ethread.h -AV1_CX_SRCS-yes += encoder/ethread.c -AV1_CX_SRCS-yes += encoder/extend.c -AV1_CX_SRCS-yes += encoder/firstpass.c -AV1_CX_SRCS-yes += encoder/mathutils.h -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.h -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/nonmax.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast_9.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.c 
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.h -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.h -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.h -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.c -AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.h -AV1_CX_SRCS-yes += encoder/block.h -AV1_CX_SRCS-yes += encoder/bitstream.h -AV1_CX_SRCS-yes += encoder/encodemb.h -AV1_CX_SRCS-yes += encoder/encodemv.h -AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.c -AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.h -AV1_CX_SRCS-yes += encoder/extend.h -AV1_CX_SRCS-yes += encoder/firstpass.h -AV1_CX_SRCS-yes += encoder/lookahead.c -AV1_CX_SRCS-yes += encoder/lookahead.h -AV1_CX_SRCS-yes += encoder/mcomp.h -AV1_CX_SRCS-yes += encoder/encoder.h -AV1_CX_SRCS-yes += encoder/random.h -AV1_CX_SRCS-yes += encoder/ratectrl.h -ifeq ($(CONFIG_XIPHRC),yes) -AV1_CX_SRCS-yes += encoder/ratectrl_xiph.h -endif -AV1_CX_SRCS-yes += encoder/rd.h -AV1_CX_SRCS-yes += encoder/rdopt.h -AV1_CX_SRCS-yes += encoder/tokenize.h -AV1_CX_SRCS-yes += encoder/treewriter.h -AV1_CX_SRCS-yes += encoder/mcomp.c -AV1_CX_SRCS-yes += encoder/encoder.c -AV1_CX_SRCS-yes += encoder/k_means_template.h -AV1_CX_SRCS-yes += encoder/palette.h -AV1_CX_SRCS-yes += encoder/palette.c -AV1_CX_SRCS-yes += encoder/picklpf.c -AV1_CX_SRCS-yes += encoder/picklpf.h -AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.c -AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.h -AV1_CX_SRCS-yes += encoder/ratectrl.c -ifeq ($(CONFIG_XIPHRC),yes) -AV1_CX_SRCS-yes += encoder/ratectrl_xiph.c -endif -AV1_CX_SRCS-yes += encoder/rd.c -AV1_CX_SRCS-yes += encoder/rdopt.c -AV1_CX_SRCS-yes += encoder/segmentation.c -AV1_CX_SRCS-yes += encoder/segmentation.h -AV1_CX_SRCS-yes += encoder/speed_features.c -AV1_CX_SRCS-yes += encoder/speed_features.h 
-AV1_CX_SRCS-yes += encoder/subexp.c -AV1_CX_SRCS-yes += encoder/subexp.h -AV1_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/blockiness.c - -AV1_CX_SRCS-yes += encoder/tokenize.c -AV1_CX_SRCS-yes += encoder/treewriter.c -AV1_CX_SRCS-yes += encoder/aq_variance.c -AV1_CX_SRCS-yes += encoder/aq_variance.h -AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.c -AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.h -AV1_CX_SRCS-yes += encoder/aq_complexity.c -AV1_CX_SRCS-yes += encoder/aq_complexity.h -AV1_CX_SRCS-yes += encoder/temporal_filter.c -AV1_CX_SRCS-yes += encoder/temporal_filter.h -AV1_CX_SRCS-yes += encoder/mbgraph.c -AV1_CX_SRCS-yes += encoder/mbgraph.h -AV1_CX_SRCS-yes += encoder/hash.c -AV1_CX_SRCS-yes += encoder/hash.h -ifeq ($(CONFIG_HASH_ME),yes) -AV1_CX_SRCS-yes += ../third_party/vector/vector.h -AV1_CX_SRCS-yes += ../third_party/vector/vector.c -AV1_CX_SRCS-yes += encoder/hash_motion.c -AV1_CX_SRCS-yes += encoder/hash_motion.h -endif -ifeq ($(CONFIG_CDEF),yes) -AV1_CX_SRCS-yes += encoder/pickcdef.c -endif -ifeq ($(CONFIG_PVQ),yes) -# PVQ from daala -AV1_CX_SRCS-yes += encoder/daala_compat_enc.c -AV1_CX_SRCS-yes += encoder/pvq_encoder.c -AV1_CX_SRCS-yes += encoder/pvq_encoder.h -AV1_CX_SRCS-yes += encoder/generic_encoder.c -AV1_CX_SRCS-yes += encoder/laplace_encoder.c -endif -ifneq ($(findstring yes,$(CONFIG_XIPHRC)$(CONFIG_PVQ)),) -AV1_CX_SRCS-yes += encoder/encint.h -endif - -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c -AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_quantize_avx2.c -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm - -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c -AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_highbd_quantize_avx2.c - - -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm - -ifeq ($(ARCH_X86_64),yes) -AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/av1_quantize_ssse3_x86_64.asm -endif - -AV1_CX_SRCS-$(HAVE_SSE2) += 
encoder/x86/dct_intrin_sse2.c -AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c - -AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/av1_highbd_quantize_sse4.c - -AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c - -AV1_CX_SRCS-yes += encoder/wedge_utils.c -AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c - -AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c - -ifneq ($(CONFIG_HIGHBITDEPTH),yes) -AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c -endif -AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c - -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h -AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c - -ifeq ($(CONFIG_GLOBAL_MOTION),yes) -AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/corner_match_sse4.c -endif - -AV1_CX_SRCS-yes := $(filter-out $(AV1_CX_SRCS_REMOVE-yes),$(AV1_CX_SRCS-yes)) diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 0f6c1c4d7..9d5414c1e 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -11,37 +11,33 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/aom_version.h" + #include "aom/aom_encoder.h" #include "aom_ports/aom_once.h" #include "aom_ports/system_state.h" #include "aom/internal/aom_codec_internal.h" -#include "./aom_version.h" #include "av1/encoder/encoder.h" #include "aom/aomcx.h" #include "av1/encoder/firstpass.h" #include "av1/av1_iface_common.h" +#include "av1/encoder/bitstream.h" +#include "aom_ports/mem_ops.h" #define MAG_SIZE (4) -#define MAX_INDEX_SIZE (256) +#define MAX_NUM_ENHANCEMENT_LAYERS 3 struct av1_extracfg { int cpu_used; // available cpu percentage in 1/16 + int dev_sf; unsigned 
int enable_auto_alt_ref; -#if CONFIG_EXT_REFS unsigned int enable_auto_bwd_ref; -#endif // CONFIG_EXT_REFS unsigned int noise_sensitivity; unsigned int sharpness; unsigned int static_thresh; unsigned int tile_columns; // log2 number of tile columns unsigned int tile_rows; // log2 number of tile rows -#if CONFIG_DEPENDENT_HORZTILES - unsigned int dependent_horz_tiles; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - unsigned int loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; @@ -52,107 +48,118 @@ struct av1_extracfg { unsigned int rc_max_inter_bitrate_pct; unsigned int gf_cbr_boost_pct; unsigned int lossless; -#if CONFIG_AOM_QM + unsigned int enable_cdef; + unsigned int enable_restoration; + unsigned int disable_trellis_quant; unsigned int enable_qm; + unsigned int qm_y; + unsigned int qm_u; + unsigned int qm_v; unsigned int qm_min; unsigned int qm_max; -#endif #if CONFIG_DIST_8X8 unsigned int enable_dist_8x8; #endif unsigned int num_tg; unsigned int mtu_size; -#if CONFIG_TEMPMV_SIGNALING - unsigned int disable_tempmv; -#endif + + aom_timing_info_type_t timing_info_type; unsigned int frame_parallel_decoding_mode; + int use_dual_filter; AQ_MODE aq_mode; -#if CONFIG_EXT_DELTA_Q DELTAQ_MODE deltaq_mode; -#endif unsigned int frame_periodic_boost; aom_bit_depth_t bit_depth; aom_tune_content content; - aom_color_space_t color_space; - aom_transfer_function_t transfer_function; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; aom_chroma_sample_position_t chroma_sample_position; int color_range; int render_width; int render_height; aom_superblock_size_t superblock_size; -#if CONFIG_ANS && ANS_MAX_SYMBOLS - int ans_window_size_log2; -#endif -#if CONFIG_EXT_TILE unsigned int single_tile_decoding; -#endif // CONFIG_EXT_TILE + int error_resilient_mode; + int 
s_frame_mode; + int film_grain_test_vector; + const char *film_grain_table_filename; unsigned int motion_vector_unit_test; + unsigned int cdf_update_mode; + int enable_order_hint; + int enable_jnt_comp; + int enable_ref_frame_mvs; // sequence level + int allow_ref_frame_mvs; // frame level + int enable_warped_motion; // sequence level + int allow_warped_motion; // frame level + int enable_superres; }; static struct av1_extracfg default_extra_cfg = { - 0, // cpu_used - 1, // enable_auto_alt_ref -#if CONFIG_EXT_REFS - 0, // enable_auto_bwd_ref -#endif // CONFIG_EXT_REFS - 0, // noise_sensitivity - 0, // sharpness - 0, // static_thresh - 0, // tile_columns - 0, // tile_rows -#if CONFIG_DEPENDENT_HORZTILES - 0, // Dependent Horizontal tiles -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - 1, // loop_filter_across_tiles_enabled -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - 7, // arnr_max_frames - 5, // arnr_strength - 0, // min_gf_interval; 0 -> default decision - 0, // max_gf_interval; 0 -> default decision - AOM_TUNE_PSNR, // tuning - 10, // cq_level - 0, // rc_max_intra_bitrate_pct - 0, // rc_max_inter_bitrate_pct - 0, // gf_cbr_boost_pct - 0, // lossless -#if CONFIG_AOM_QM + 0, // cpu_used + 0, // dev_sf + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 0, // tile_columns + 0, // tile_rows + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + AOM_TUNE_PSNR, // tuning + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // disable_trellis_quant 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v DEFAULT_QM_FIRST, // qm_min DEFAULT_QM_LAST, // qm_max -#endif #if CONFIG_DIST_8X8 0, #endif - 1, // max number of tile groups - 0, // mtu_size -#if 
CONFIG_TEMPMV_SIGNALING - 0, // disable temporal mv prediction -#endif - 1, // frame_parallel_decoding_mode - NO_AQ, // aq_mode -#if CONFIG_EXT_DELTA_Q - NO_DELTA_Q, // deltaq_mode -#endif - CONFIG_XIPHRC, // frame_periodic_delta_q + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 1, // frame_parallel_decoding_mode + 1, // enable dual filter + NO_AQ, // aq_mode + NO_DELTA_Q, // deltaq_mode + 0, // frame_periodic_delta_q AOM_BITS_8, // Bit depth AOM_CONTENT_DEFAULT, // content - AOM_CS_UNKNOWN, // color space - AOM_TF_UNKNOWN, // transfer function + AOM_CICP_CP_UNSPECIFIED, // CICP color space + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients AOM_CSP_UNKNOWN, // chroma sample position 0, // color range 0, // render width 0, // render height AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size -#if CONFIG_ANS && ANS_MAX_SYMBOLS - 23, // ans_window_size_log2 -#endif -#if CONFIG_EXT_TILE - 0, // Single tile decoding is off by default. -#endif // CONFIG_EXT_TILE - - 0, // motion_vector_unit_test + 0, // Single tile decoding is off by default. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. + 0, // film_grain_test_vector + 0, // film_grain_table_filename + 0, // motion_vector_unit_test + 1, // CDF update mode + 1, // frame order hint + 1, // jnt_comp + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // superres }; struct aom_codec_alg_priv { @@ -204,11 +211,6 @@ static aom_codec_err_t update_error_state( if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." 
#hi "]"); \ } while (0) -#define RANGE_CHECK_LO(p, memb, lo) \ - do { \ - if (!((p)->memb >= (lo))) ERROR(#memb " out of range [" #lo "..]"); \ - } while (0) - #define RANGE_CHECK_BOOL(p, memb) \ do { \ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ @@ -221,15 +223,13 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); - RANGE_CHECK_HI(cfg, g_profile, 3); + RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); -#if CONFIG_EXT_DELTA_Q RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1); -#endif RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); RANGE_CHECK_HI(cfg, g_threads, 64); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); @@ -255,7 +255,6 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, SCALE_NUMERATOR << 1); RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); -#if CONFIG_FRAME_SUPERRES RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1); RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); @@ -263,7 +262,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, SCALE_NUMERATOR << 1); RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63); RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63); -#endif // CONFIG_FRAME_SUPERRES + RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2); // AV1 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. 
@@ -275,53 +274,25 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2); RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2); -#if CONFIG_EXT_REFS RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); -#endif // CONFIG_EXT_REFS RANGE_CHECK(extra_cfg, cpu_used, 0, 8); + RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, AOM_SUPERBLOCK_SIZE_DYNAMIC); -#if CONFIG_EXT_TILE RANGE_CHECK_HI(cfg, large_scale_tile, 1); RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); - if (cfg->large_scale_tile) { -// TODO(any): Waring. If CONFIG_EXT_TILE is true, tile_columns really -// means tile_width, and tile_rows really means tile_hight. The interface -// should be sanitized. -#if CONFIG_EXT_PARTITION - if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { - if (extra_cfg->tile_columns != 0) - RANGE_CHECK(extra_cfg, tile_columns, 1, 32); - if (extra_cfg->tile_rows != 0) RANGE_CHECK(extra_cfg, tile_rows, 1, 32); - } else { -#endif // CONFIG_EXT_PARTITION - if (extra_cfg->tile_columns != 0) - RANGE_CHECK(extra_cfg, tile_columns, 1, 64); - if (extra_cfg->tile_rows != 0) RANGE_CHECK(extra_cfg, tile_rows, 1, 64); -#if CONFIG_EXT_PARTITION - } -#endif // CONFIG_EXT_PARTITION - } else { -#endif // CONFIG_EXT_TILE -#if CONFIG_MAX_TILE - RANGE_CHECK_HI(extra_cfg, tile_columns, 6); - RANGE_CHECK_HI(extra_cfg, tile_rows, 6); -#else // CONFIG_MAX_TILE RANGE_CHECK_HI(extra_cfg, tile_columns, 6); - RANGE_CHECK_HI(extra_cfg, tile_rows, 2); -#endif // CONFIG_MAX_TILE -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE + RANGE_CHECK_HI(extra_cfg, tile_rows, 6); + + RANGE_CHECK_HI(cfg, monochrome, 1); + + if (cfg->large_scale_tile && extra_cfg->aq_mode) + ERROR( + "Adaptive quantization are not supported in large scale tile " + "coding."); -#if CONFIG_DEPENDENT_HORZTILES - RANGE_CHECK_HI(extra_cfg, 
dependent_horz_tiles, 1); -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - RANGE_CHECK_HI(extra_cfg, loop_filter_across_tiles_enabled, 1); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES RANGE_CHECK_HI(extra_cfg, sharpness, 7); RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15); RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); @@ -334,25 +305,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, if (extra_cfg->tuning == AOM_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in AV1."); -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if CONFIG_PVQ - if (extra_cfg->content == AOM_CONTENT_SCREEN) - ERROR( - "Option --tune-content=screen is not currently supported when PVQ is " - "enabled."); -#endif // CONFIG_PVQ - if (cfg->g_pass == AOM_RC_LAST_PASS) { -#if !CONFIG_XIPHRC const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); const FIRSTPASS_STATS *stats; -#endif if (cfg->rc_twopass_stats_in.buf == NULL) ERROR("rc_twopass_stats_in.buf not set."); -#if !CONFIG_XIPHRC if (cfg->rc_twopass_stats_in.sz % packet_sz) ERROR("rc_twopass_stats_in.sz indicates truncated packet."); @@ -364,37 +324,46 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, if ((int)(stats->count + 0.5) != n_packets - 1) ERROR("rc_twopass_stats_in missing EOS stats packet"); -#endif } -#if !CONFIG_HIGHBITDEPTH - if (cfg->g_profile > (unsigned int)PROFILE_1) { - ERROR("Profile > 1 not supported in this build configuration"); - } -#endif if (cfg->g_profile <= (unsigned int)PROFILE_1 && - cfg->g_bit_depth > AOM_BITS_8) { - ERROR("Codec high bit-depth not supported in profile < 2"); - } - if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_input_bit_depth > 8) { - ERROR("Source high bit-depth not supported in profile < 2"); + cfg->g_bit_depth > AOM_BITS_10) { + ERROR("Codec bit-depth 12 not supported in profile < 2"); } - if (cfg->g_profile > (unsigned int)PROFILE_1 && - 
cfg->g_bit_depth == AOM_BITS_8) { - ERROR("Codec bit-depth 8 not supported in profile > 1"); + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_input_bit_depth > 10) { + ERROR("Source bit-depth 12 not supported in profile < 2"); } -#if CONFIG_COLORSPACE_HEADERS - RANGE_CHECK(extra_cfg, color_space, AOM_CS_UNKNOWN, AOM_CS_ICTCP); - RANGE_CHECK(extra_cfg, transfer_function, AOM_TF_UNKNOWN, AOM_TF_HLG); - RANGE_CHECK(extra_cfg, chroma_sample_position, AOM_CSP_UNKNOWN, - AOM_CSP_COLOCATED); + + RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709, + AOM_CICP_CP_EBU_3213); // Need to check range more precisely to + // check for reserved values? + RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709, + AOM_CICP_TC_HLG); + RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY, + AOM_CICP_MC_ICTCP); + RANGE_CHECK(extra_cfg, color_range, 0, 1); + +#if CONFIG_DIST_8X8 + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_DAALA_DIST); #else - RANGE_CHECK(extra_cfg, color_space, AOM_CS_UNKNOWN, AOM_CS_SRGB); + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM); #endif - RANGE_CHECK(extra_cfg, color_range, 0, 1); -#if CONFIG_ANS && ANS_MAX_SYMBOLS - RANGE_CHECK(extra_cfg, ans_window_size_log2, 8, 23); + + RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, + AOM_TIMING_DEC_MODEL); + + RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16); + + if (extra_cfg->lossless) { + if (extra_cfg->aq_mode != 0) + ERROR("Only --aq_mode=0 can be used with --lossless=1."); +#if CONFIG_DIST_8X8 + if (extra_cfg->enable_dist_8x8) + ERROR("dist-8x8 cannot be used with lossless compression."); #endif + } + return AOM_CODEC_OK; } @@ -404,23 +373,17 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_I420: case AOM_IMG_FMT_I42016: break; - case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I444: - case AOM_IMG_FMT_I440: - if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) { - ERROR( - 
"Invalid image format. I422, I444, I440 images are " - "not supported in profile."); + case AOM_IMG_FMT_I44416: + if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 && + !ctx->cfg.monochrome) { + ERROR("Invalid image format. I444 images not supported in profile."); } break; + case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I42216: - case AOM_IMG_FMT_I44416: - case AOM_IMG_FMT_I44016: - if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 && - ctx->cfg.g_profile != (unsigned int)PROFILE_3) { - ERROR( - "Invalid image format. 16-bit I422, I444, I440 images are " - "not supported in profile."); + if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) { + ERROR("Invalid image format. I422 images not supported in profile."); } break; default: @@ -442,31 +405,74 @@ static int get_image_bps(const aom_image_t *img) { case AOM_IMG_FMT_I420: return 12; case AOM_IMG_FMT_I422: return 16; case AOM_IMG_FMT_I444: return 24; - case AOM_IMG_FMT_I440: return 16; case AOM_IMG_FMT_I42016: return 24; case AOM_IMG_FMT_I42216: return 32; case AOM_IMG_FMT_I44416: return 48; - case AOM_IMG_FMT_I44016: return 32; default: assert(0 && "Invalid image format"); break; } return 0; } +// Set appropriate options to disable frame super-resolution. 
+static void disable_superres(AV1EncoderConfig *const oxcf) { + oxcf->superres_mode = SUPERRES_NONE; + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_qthresh = 255; + oxcf->superres_kf_qthresh = 255; +} + static aom_codec_err_t set_encoder_config( AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg, const struct av1_extracfg *extra_cfg) { const int is_vbr = cfg->rc_end_usage == AOM_VBR; oxcf->profile = cfg->g_profile; + oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled; oxcf->max_threads = (int)cfg->g_threads; oxcf->width = cfg->g_w; oxcf->height = cfg->g_h; + oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width; + oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height; oxcf->bit_depth = cfg->g_bit_depth; oxcf->input_bit_depth = cfg->g_input_bit_depth; // guess a frame rate if out of whack, use 30 oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; - if (oxcf->init_framerate > 180) oxcf->init_framerate = 30; - + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL || + extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + oxcf->timing_info_present = 1; + oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num; + oxcf->timing_info.time_scale = cfg->g_timebase.den; + oxcf->timing_info.num_ticks_per_picture = 1; + } else { + oxcf->timing_info_present = 0; + } + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) { + oxcf->timing_info.equal_picture_interval = 1; + oxcf->decoder_model_info_present_flag = 0; + oxcf->display_model_info_present_flag = 1; + } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + // if( extra_cfg->arnr_strength > 0 ) + // { + // printf("Only --arnr-strength=0 can currently be used with + // --timing-info=model."); return AOM_CODEC_INVALID_PARAM; + // } + // if( extra_cfg->enable_superres) + // { + // printf("Only --superres-mode=0 can currently be used with + // --timing-info=model."); return 
AOM_CODEC_INVALID_PARAM; + // } + oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num; + oxcf->timing_info.equal_picture_interval = 0; + oxcf->decoder_model_info_present_flag = 1; + oxcf->buffer_removal_delay_present = 1; + oxcf->display_model_info_present_flag = 1; + } + if (oxcf->init_framerate > 180) { + oxcf->init_framerate = 30; + oxcf->timing_info_present = 0; + } oxcf->mode = GOOD; + oxcf->cfg = &cfg->cfg; switch (cfg->g_pass) { case AOM_RC_ONE_PASS: oxcf->pass = 0; break; @@ -491,11 +497,15 @@ static aom_codec_err_t set_encoder_config( oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); oxcf->fixed_q = -1; -#if CONFIG_AOM_QM + oxcf->enable_cdef = extra_cfg->enable_cdef; + oxcf->enable_restoration = extra_cfg->enable_restoration; + oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant; oxcf->using_qm = extra_cfg->enable_qm; + oxcf->qm_y = extra_cfg->qm_y; + oxcf->qm_u = extra_cfg->qm_u; + oxcf->qm_v = extra_cfg->qm_v; oxcf->qm_minlevel = extra_cfg->qm_min; oxcf->qm_maxlevel = extra_cfg->qm_max; -#endif #if CONFIG_DIST_8X8 oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8; if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST || @@ -503,15 +513,16 @@ static aom_codec_err_t set_encoder_config( oxcf->using_dist_8x8 = 1; #endif oxcf->num_tile_groups = extra_cfg->num_tg; -#if CONFIG_EXT_TILE // In large-scale tile encoding mode, num_tile_groups is always 1. if (cfg->large_scale_tile) oxcf->num_tile_groups = 1; -#endif // CONFIG_EXT_TILE oxcf->mtu = extra_cfg->mtu_size; -#if CONFIG_TEMPMV_SIGNALING - oxcf->disable_tempmv = extra_cfg->disable_tempmv; -#endif + // FIXME(debargha): Should this be: + // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & + // extra_cfg->enable_order_hint ? + // Disallow using temporal MVs while large_scale_tile = 1. 
+ oxcf->allow_ref_frame_mvs = + extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile; oxcf->under_shoot_pct = cfg->rc_undershoot_pct; oxcf->over_shoot_pct = cfg->rc_overshoot_pct; @@ -523,26 +534,26 @@ static aom_codec_err_t set_encoder_config( oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR) oxcf->resize_mode = RESIZE_NONE; -#if CONFIG_FRAME_SUPERRES - oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode; - oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator; - oxcf->superres_kf_scale_denominator = - (uint8_t)cfg->rc_superres_kf_denominator; - oxcf->superres_qthresh = - extra_cfg->lossless ? 255 - : av1_quantizer_to_qindex(cfg->rc_superres_qthresh); - oxcf->superres_kf_qthresh = - extra_cfg->lossless - ? 255 - : av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); - if (oxcf->superres_mode == SUPERRES_FIXED && - oxcf->superres_scale_denominator == SCALE_NUMERATOR && - oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) - oxcf->superres_mode = SUPERRES_NONE; - if (oxcf->superres_mode == SUPERRES_QTHRESH && - oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) - oxcf->superres_mode = SUPERRES_NONE; -#endif // CONFIG_FRAME_SUPERRES + if (extra_cfg->lossless || cfg->large_scale_tile) { + disable_superres(oxcf); + } else { + oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode; + oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator; + oxcf->superres_kf_scale_denominator = + (uint8_t)cfg->rc_superres_kf_denominator; + oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh); + oxcf->superres_kf_qthresh = + av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); + if (oxcf->superres_mode == SUPERRES_FIXED && + oxcf->superres_scale_denominator == SCALE_NUMERATOR && + oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) { + disable_superres(oxcf); + } + if (oxcf->superres_mode == SUPERRES_QTHRESH && + oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) { + 
disable_superres(oxcf); + } + } oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; @@ -558,12 +569,13 @@ static aom_codec_err_t set_encoder_config( cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; oxcf->key_freq = cfg->kf_max_dist; - + oxcf->sframe_dist = cfg->sframe_dist; + oxcf->sframe_mode = cfg->sframe_mode; + oxcf->sframe_enabled = cfg->sframe_dist != 0; oxcf->speed = extra_cfg->cpu_used; + oxcf->dev_sf = extra_cfg->dev_sf; oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; -#if CONFIG_EXT_REFS oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref; -#endif // CONFIG_EXT_REFS oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; oxcf->sharpness = extra_cfg->sharpness; @@ -573,64 +585,68 @@ static aom_codec_err_t set_encoder_config( oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; #endif - oxcf->color_space = extra_cfg->color_space; - -#if CONFIG_COLORSPACE_HEADERS - oxcf->transfer_function = extra_cfg->transfer_function; + oxcf->color_primaries = extra_cfg->color_primaries; + oxcf->transfer_characteristics = extra_cfg->transfer_characteristics; + oxcf->matrix_coefficients = extra_cfg->matrix_coefficients; oxcf->chroma_sample_position = extra_cfg->chroma_sample_position; -#else - if (extra_cfg->transfer_function != AOM_TF_UNKNOWN) - return AOM_CODEC_UNSUP_FEATURE; - if (extra_cfg->chroma_sample_position != AOM_CSP_UNKNOWN) - return AOM_CODEC_UNSUP_FEATURE; -#endif oxcf->color_range = extra_cfg->color_range; oxcf->render_width = extra_cfg->render_width; oxcf->render_height = extra_cfg->render_height; oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; + // Adjust g_lag_in_frames down if not needed + oxcf->lag_in_frames = + AOMMIN(MAX_GF_INTERVAL + oxcf->arnr_max_frames / 2, oxcf->lag_in_frames); oxcf->arnr_strength = extra_cfg->arnr_strength; oxcf->min_gf_interval = extra_cfg->min_gf_interval; oxcf->max_gf_interval = 
extra_cfg->max_gf_interval; oxcf->tuning = extra_cfg->tuning; oxcf->content = extra_cfg->content; - -#if CONFIG_EXT_PARTITION + oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode; oxcf->superblock_size = extra_cfg->superblock_size; -#endif // CONFIG_EXT_PARTITION -#if CONFIG_ANS && ANS_MAX_SYMBOLS - oxcf->ans_window_size_log2 = extra_cfg->ans_window_size_log2; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - -#if CONFIG_EXT_TILE + if (cfg->large_scale_tile) { + oxcf->film_grain_test_vector = 0; + oxcf->film_grain_table_filename = NULL; + } else { + oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector; + oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename; + } oxcf->large_scale_tile = cfg->large_scale_tile; oxcf->single_tile_decoding = (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0; if (oxcf->large_scale_tile) { -#if CONFIG_EXT_PARTITION - const unsigned int max = - extra_cfg->superblock_size == AOM_SUPERBLOCK_SIZE_64X64 ? 64 : 32; -#else - const unsigned int max = 64; -#endif // CONFIG_EXT_PARTITION - // If tile size is not set, set it to the default value. - const unsigned int tc = - (!extra_cfg->tile_columns) ? UINT_MAX : extra_cfg->tile_columns; - const unsigned int tr = - (!extra_cfg->tile_rows) ? UINT_MAX : extra_cfg->tile_rows; - - oxcf->tile_columns = AOMMIN(tc, max); - oxcf->tile_rows = AOMMIN(tr, max); - } else { -#endif // CONFIG_EXT_TILE - oxcf->tile_columns = extra_cfg->tile_columns; - oxcf->tile_rows = extra_cfg->tile_rows; -#if CONFIG_EXT_TILE + // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or + // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If + // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to + // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile). 
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 && + extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128) + oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; + } + + oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->tile_rows = extra_cfg->tile_rows; + + oxcf->monochrome = cfg->monochrome; + oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr; + oxcf->enable_dual_filter = extra_cfg->use_dual_filter; + oxcf->enable_order_hint = extra_cfg->enable_order_hint; + oxcf->enable_jnt_comp = + extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint; + oxcf->enable_ref_frame_mvs = + extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; + + oxcf->enable_warped_motion = extra_cfg->enable_warped_motion; + oxcf->allow_warped_motion = + extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion; + + oxcf->enable_superres = + (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres; + if (!oxcf->enable_superres) { + disable_superres(oxcf); } -#endif // CONFIG_EXT_TILE -#if CONFIG_MAX_TILE oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); for (int i = 0; i < oxcf->tile_width_count; i++) { @@ -639,25 +655,28 @@ static aom_codec_err_t set_encoder_config( for (int i = 0; i < oxcf->tile_height_count; i++) { oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1); } -#endif -#if CONFIG_DEPENDENT_HORZTILES - oxcf->dependent_horz_tiles = -#if CONFIG_EXT_TILE - (cfg->large_scale_tile) ? 
0 : -#endif // CONFIG_EXT_TILE - extra_cfg->dependent_horz_tiles; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - oxcf->loop_filter_across_tiles_enabled = - extra_cfg->loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - oxcf->error_resilient_mode = cfg->g_error_resilient; + oxcf->error_resilient_mode = + cfg->g_error_resilient | extra_cfg->error_resilient_mode; + oxcf->s_frame_mode = extra_cfg->s_frame_mode; oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; + if (cfg->g_pass == AOM_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + oxcf->limit = n_packets - 1; + } else { + oxcf->limit = cfg->g_limit; + } + + if (oxcf->limit == 1) { + // still picture mode, display model and timing is meaningless + oxcf->display_model_info_present_flag = 0; + oxcf->timing_info_present = 0; + } oxcf->aq_mode = extra_cfg->aq_mode; -#if CONFIG_EXT_DELTA_Q oxcf->deltaq_mode = extra_cfg->deltaq_mode; -#endif + + oxcf->save_as_annexb = cfg->save_as_annexb; oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; @@ -734,6 +753,12 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -741,14 +766,12 @@ static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_EXT_REFS static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx, va_list args) { struct 
av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args); return update_extra_cfg(ctx, &extra_cfg); } -#endif // CONFIG_EXT_REFS static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx, va_list args) { @@ -785,24 +808,6 @@ static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_DEPENDENT_HORZTILES -static aom_codec_err_t ctrl_set_tile_dependent_rows(aom_codec_alg_priv_t *ctx, - va_list args) { - struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.dependent_horz_tiles = CAST(AV1E_SET_TILE_DEPENDENT_ROWS, args); - return update_extra_cfg(ctx, &extra_cfg); -} -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES -static aom_codec_err_t ctrl_set_tile_loopfilter(aom_codec_alg_priv_t *ctx, - va_list args) { - struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.loop_filter_across_tiles_enabled = - CAST(AV1E_SET_TILE_LOOPFILTER, args); - return update_extra_cfg(ctx, &extra_cfg); -} -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -861,14 +866,48 @@ static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_AOM_QM +static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, + va_list args) { + 
struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); return update_extra_cfg(ctx, &extra_cfg); } - +static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args); + return update_extra_cfg(ctx, &extra_cfg); +} static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -882,7 +921,6 @@ static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx, extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args); return update_extra_cfg(ctx, &extra_cfg); } -#endif #if CONFIG_DIST_8X8 static aom_codec_err_t ctrl_set_enable_dist_8x8(aom_codec_alg_priv_t *ctx, va_list args) { @@ -903,14 +941,83 @@ static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) { extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args); return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_TEMPMV_SIGNALING -static aom_codec_err_t ctrl_set_disable_tempmv(aom_codec_alg_priv_t *ctx, - va_list args) { +static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, + va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.disable_tempmv = 
CAST(AV1E_SET_DISABLE_TEMPMV, args); + extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args); return update_extra_cfg(ctx, &extra_cfg); } -#endif + +static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -919,14 +1026,12 @@ static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_EXT_TILE static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args); return update_extra_cfg(ctx, &extra_cfg); } -#endif // CONFIG_EXT_TILE static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, va_list args) { @@ -935,14 +1040,28 @@ static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_EXT_DELTA_Q +static aom_codec_err_t ctrl_set_film_grain_test_vector( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.film_grain_test_vector = + CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t 
ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } -#endif + static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1006,10 +1125,8 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx, if (res == AOM_CODEC_OK) { set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); -#if CONFIG_HIGHBITDEPTH priv->oxcf.use_highbitdepth = (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0; -#endif priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = AOM_CODEC_MEM_ERROR; @@ -1032,108 +1149,6 @@ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) { return AOM_CODEC_OK; } -static void pick_quickcompress_mode(aom_codec_alg_priv_t *ctx, - unsigned long deadline) { - MODE new_mode = GOOD; - - switch (ctx->cfg.g_pass) { - case AOM_RC_ONE_PASS: - switch (deadline) { - default: new_mode = GOOD; break; - } - break; - case AOM_RC_FIRST_PASS: break; - case AOM_RC_LAST_PASS: new_mode = GOOD; - } - - if (ctx->oxcf.mode != new_mode) { - ctx->oxcf.mode = new_mode; - av1_change_config(ctx->cpi, &ctx->oxcf); - } -} - -// Turn on to test if supplemental superframe data breaks decoding -#define TEST_SUPPLEMENTAL_SUPERFRAME_DATA 0 - -static int write_superframe_index(aom_codec_alg_priv_t *ctx) { - uint8_t marker = 0xc0; - size_t max_frame_sz = 0; - - assert(ctx->pending_frame_count); - assert(ctx->pending_frame_count <= 8); - - // Add the number of frames to the marker byte - marker |= ctx->pending_frame_count - 1; 
- for (int i = 0; i < ctx->pending_frame_count - 1; i++) { - const size_t frame_sz = ctx->pending_frame_sizes[i] - 1; - max_frame_sz = AOMMAX(frame_sz, max_frame_sz); - } - - // Choose the magnitude - int mag; - unsigned int mask; - for (mag = 0, mask = 0xff; mag < MAG_SIZE; mag++) { - if (max_frame_sz <= mask) break; - mask <<= 8; - mask |= 0xff; - } - marker |= mag << 3; - - // Write the index - uint8_t buffer[MAX_INDEX_SIZE]; - uint8_t *x = buffer; - - if (TEST_SUPPLEMENTAL_SUPERFRAME_DATA) { - uint8_t marker_test = 0xc0; - int mag_test = 2; // 1 - 4 - int frames_test = 4; // 1 - 8 - marker_test |= frames_test - 1; - marker_test |= (mag_test - 1) << 3; - *x++ = marker_test; - for (int i = 0; i < mag_test * frames_test; ++i) - *x++ = 0; // fill up with arbitrary data - *x++ = marker_test; - printf("Added supplemental superframe data\n"); - } - - *x++ = marker; - for (int i = 0; i < ctx->pending_frame_count - 1; i++) { - assert(ctx->pending_frame_sizes[i] > 0); - unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1; - for (int j = 0; j <= mag; j++) { - *x++ = this_sz & 0xff; - this_sz >>= 8; - } - } - *x++ = marker; - - const size_t index_sz = x - buffer; - assert(index_sz < MAX_INDEX_SIZE); - assert(ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz); - - // move the frame to make room for the index - memmove(ctx->pending_cx_data + index_sz, ctx->pending_cx_data, - ctx->pending_cx_data_sz); - memcpy(ctx->pending_cx_data, buffer, index_sz); - ctx->pending_cx_data_sz += index_sz; - - return (int)index_sz; -} - -// av1 uses 10,000,000 ticks/second as time stamp -#define TICKS_PER_SEC 10000000LL - -static int64_t timebase_units_to_ticks(const aom_rational_t *timebase, - int64_t n) { - return n * TICKS_PER_SEC * timebase->num / timebase->den; -} - -static int64_t ticks_to_timebase_units(const aom_rational_t *timebase, - int64_t n) { - const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; - return (n * timebase->den + round) / timebase->num / 
TICKS_PER_SEC; -} - static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, unsigned int lib_flags) { aom_codec_frame_flags_t flags = lib_flags << 16; @@ -1149,8 +1164,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, - aom_enc_frame_flags_t enc_flags, - unsigned long deadline) { + aom_enc_frame_flags_t enc_flags) { const size_t kMinCompressedSize = 8192; volatile aom_codec_err_t res = AOM_CODEC_OK; AV1_COMP *const cpi = ctx->cpi; @@ -1163,17 +1177,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // TODO(jzern) the checks related to cpi's validity should be treated as a // failure condition, encoder setup is done fully in init() currently. if (res == AOM_CODEC_OK) { -#if CONFIG_EXT_REFS size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) * ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img); -#else - // There's no codec control for multiple alt-refs so check the encoder - // instance for its status to determine the compressed data size. - size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) * - ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * - get_image_bps(img) / 8 * - (cpi->multi_arf_allowed ? 
8 : 2); -#endif // CONFIG_EXT_REFS if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; @@ -1186,18 +1191,15 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } } - pick_quickcompress_mode(ctx, deadline); + if (ctx->oxcf.mode != GOOD) { + ctx->oxcf.mode = GOOD; + av1_change_config(ctx->cpi, &ctx->oxcf); + } + aom_codec_pkt_list_init(&ctx->pkt_list); volatile aom_enc_frame_flags_t flags = enc_flags; - // Handle Flags - if (((flags & AOM_EFLAG_NO_UPD_GF) && (flags & AOM_EFLAG_FORCE_GF)) || - ((flags & AOM_EFLAG_NO_UPD_ARF) && (flags & AOM_EFLAG_FORCE_ARF))) { - ctx->base.err_detail = "Conflicting flags."; - return AOM_CODEC_INVALID_PARAM; - } - if (setjmp(cpi->common.error.jmp)) { cpi->common.error.setjmp = 0; res = update_error_state(ctx, &cpi->common.error); @@ -1206,6 +1208,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } cpi->common.error.setjmp = 1; + // Note(yunqing): While applying encoding flags, always start from enabling + // all, and then modifying according to the flags. Previous frame's flags are + // overwritten. 
av1_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals @@ -1267,18 +1272,66 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, !is_frame_visible && -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp, &dst_end_time_stamp, - !img)) { -#if CONFIG_REFERENCE_BUFFER + !img, timebase)) { if (cpi->common.seq_params.frame_id_numbers_present_flag) { - if (cpi->common.invalid_delta_frame_id_minus1) { - ctx->base.err_detail = "Invalid delta_frame_id_minus1"; + if (cpi->common.invalid_delta_frame_id_minus_1) { + ctx->base.err_detail = "Invalid delta_frame_id_minus_1"; return AOM_CODEC_ERROR; } } -#endif // CONFIG_REFERENCE_BUFFER + cpi->seq_params_locked = 1; if (frame_size) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; + const int write_temporal_delimiter = + !cpi->common.spatial_layer_id && !ctx->pending_frame_count; + + if (write_temporal_delimiter) { + uint32_t obu_header_size = 1; + const uint32_t obu_payload_size = 0; + const size_t length_field_size = + aom_uleb_size_in_bytes(obu_payload_size); + + if (ctx->pending_cx_data) { + const size_t move_offset = length_field_size + 1; + memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data, + frame_size); + } + const uint32_t obu_header_offset = 0; + obu_header_size = write_obu_header( + OBU_TEMPORAL_DELIMITER, 0, + (uint8_t *)(ctx->pending_cx_data + obu_header_offset)); + + // OBUs are preceded/succeeded by an unsigned leb128 coded integer. 
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, + ctx->pending_cx_data) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + frame_size += obu_header_size + obu_payload_size + length_field_size; + } + + if (ctx->oxcf.save_as_annexb) { + size_t curr_frame_size = frame_size; + if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + frame_size = curr_frame_size; + + // B_PRIME (add frame size) + const size_t length_field_size = aom_uleb_size_in_bytes(frame_size); + if (ctx->pending_cx_data) { + const size_t move_offset = length_field_size; + memmove(cx_data + move_offset, cx_data, frame_size); + } + if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + frame_size += length_field_size; + } + ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size; ctx->pending_cx_data_sz += frame_size; @@ -1291,23 +1344,31 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } } if (is_frame_visible) { - // insert superframe index if needed - if (ctx->pending_frame_count > 1) { -#if CONFIG_DEBUG - assert(index_size >= write_superframe_index(ctx)); -#else - write_superframe_index(ctx); -#endif - } - // Add the frame packet to the list of returned packets. 
aom_codec_cx_pkt_t pkt; + if (ctx->oxcf.save_as_annexb) { + // B_PRIME (add TU size) + size_t tu_size = ctx->pending_cx_data_sz; + const size_t length_field_size = aom_uleb_size_in_bytes(tu_size); + if (ctx->pending_cx_data) { + const size_t move_offset = length_field_size; + memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data, + tu_size); + } + if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + ctx->pending_cx_data_sz += length_field_size; + } + pkt.kind = AOM_CODEC_CX_FRAME_PKT; pkt.data.frame.buf = ctx->pending_cx_data; pkt.data.frame.sz = ctx->pending_cx_data_sz; pkt.data.frame.partition_id = -1; + pkt.data.frame.vis_frame_size = frame_size; pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); @@ -1394,6 +1455,25 @@ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, } } +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *const new_img = va_arg(args, aom_image_t *); + + if (new_img != NULL) { + YV12_BUFFER_CONFIG new_frame; + + if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(new_img, &sd); + return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx, va_list args) { (void)ctx; @@ -1473,6 +1553,24 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx, } } +static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx, + va_list args) { + const int spatial_layer_id = va_arg(args, int); + if (spatial_layer_id > MAX_NUM_ENHANCEMENT_LAYERS) + return AOM_CODEC_INVALID_PARAM; + ctx->cpi->common.spatial_layer_id = spatial_layer_id; + return AOM_CODEC_OK; +} + +static 
aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + const int number_spatial_layers = va_arg(args, int); + if (number_spatial_layers > MAX_NUM_ENHANCEMENT_LAYERS) + return AOM_CODEC_INVALID_PARAM; + ctx->cpi->common.number_spatial_layers = number_spatial_layers; + return AOM_CODEC_OK; +} + static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1480,38 +1578,41 @@ static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -static aom_codec_err_t ctrl_set_color_space(aom_codec_alg_priv_t *ctx, - va_list args) { +static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx, + va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.color_space = CAST(AV1E_SET_COLOR_SPACE, args); + extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } -static aom_codec_err_t ctrl_set_transfer_function(aom_codec_alg_priv_t *ctx, - va_list args) { -#if CONFIG_COLORSPACE_HEADERS +static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx, + va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.transfer_function = CAST(AV1E_SET_TRANSFER_FUNCTION, args); + extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_transfer_characteristics( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.transfer_characteristics = + CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args); return 
update_extra_cfg(ctx, &extra_cfg); -#else - (void)ctx; - (void)args; - return AOM_CODEC_UNSUP_FEATURE; -#endif } static aom_codec_err_t ctrl_set_chroma_sample_position( aom_codec_alg_priv_t *ctx, va_list args) { -#if CONFIG_COLORSPACE_HEADERS struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.chroma_sample_position = CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args); return update_extra_cfg(ctx, &extra_cfg); -#else - (void)ctx; - (void)args; - return AOM_CODEC_UNSUP_FEATURE; -#endif } static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx, @@ -1537,15 +1638,6 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -#if CONFIG_ANS && ANS_MAX_SYMBOLS -static aom_codec_err_t ctrl_set_ans_window_size_log2(aom_codec_alg_priv_t *ctx, - va_list args) { - struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.ans_window_size_log2 = CAST(AV1E_SET_ANS_WINDOW_SIZE_LOG2, args); - return update_extra_cfg(ctx, &extra_cfg); -} -#endif - static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, { AOME_USE_REFERENCE, ctrl_use_reference }, @@ -1556,51 +1648,58 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AOME_SET_ROI_MAP, ctrl_set_roi_map }, { AOME_SET_ACTIVEMAP, ctrl_set_active_map }, { AOME_SET_SCALEMODE, ctrl_set_scale_mode }, + { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id }, { AOME_SET_CPUUSED, ctrl_set_cpuused }, + { AOME_SET_DEVSF, ctrl_set_devsf }, { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref }, -#if CONFIG_EXT_REFS { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref }, -#endif // CONFIG_EXT_REFS { AOME_SET_SHARPNESS, ctrl_set_sharpness }, { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows }, -#if CONFIG_DEPENDENT_HORZTILES - { AV1E_SET_TILE_DEPENDENT_ROWS, ctrl_set_tile_dependent_rows }, -#endif -#if 
CONFIG_LOOPFILTERING_ACROSS_TILES - { AV1E_SET_TILE_LOOPFILTER, ctrl_set_tile_loopfilter }, -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { AOME_SET_TUNING, ctrl_set_tuning }, { AOME_SET_CQ_LEVEL, ctrl_set_cq_level }, { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct }, + { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers }, { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct }, { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, { AV1E_SET_LOSSLESS, ctrl_set_lossless }, -#if CONFIG_AOM_QM + { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef }, + { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration }, + { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant }, { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm }, + { AV1E_SET_QM_Y, ctrl_set_qm_y }, + { AV1E_SET_QM_U, ctrl_set_qm_u }, + { AV1E_SET_QM_V, ctrl_set_qm_v }, { AV1E_SET_QM_MIN, ctrl_set_qm_min }, { AV1E_SET_QM_MAX, ctrl_set_qm_max }, -#endif #if CONFIG_DIST_8X8 { AV1E_SET_ENABLE_DIST_8X8, ctrl_set_enable_dist_8x8 }, #endif { AV1E_SET_NUM_TG, ctrl_set_num_tg }, { AV1E_SET_MTU, ctrl_set_mtu }, -#if CONFIG_TEMPMV_SIGNALING - { AV1E_SET_DISABLE_TEMPMV, ctrl_set_disable_tempmv }, -#endif + { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type }, { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode }, + { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode }, + { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode }, + { AV1E_SET_ENABLE_DF, ctrl_set_enable_df }, + { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint }, + { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp }, + { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs }, + { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs }, + { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion }, + { AV1E_SET_ALLOW_WARPED_MOTION, 
ctrl_set_allow_warped_motion }, + { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres }, { AV1E_SET_AQ_MODE, ctrl_set_aq_mode }, -#if CONFIG_EXT_DELTA_Q { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode }, -#endif { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost }, { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content }, - { AV1E_SET_COLOR_SPACE, ctrl_set_color_space }, - { AV1E_SET_TRANSFER_FUNCTION, ctrl_set_transfer_function }, + { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode }, + { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries }, + { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics }, + { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients }, { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position }, { AV1E_SET_COLOR_RANGE, ctrl_set_color_range }, { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity }, @@ -1608,12 +1707,9 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval }, { AV1E_SET_RENDER_SIZE, ctrl_set_render_size }, { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, -#if CONFIG_ANS && ANS_MAX_SYMBOLS - { AV1E_SET_ANS_WINDOW_SIZE_LOG2, ctrl_set_ans_window_size_log2 }, -#endif -#if CONFIG_EXT_TILE { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, -#endif // CONFIG_EXT_TILE + { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, + { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, // Getters @@ -1622,6 +1718,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_GET_REFERENCE, ctrl_get_reference }, { AV1E_GET_ACTIVEMAP, ctrl_get_active_map }, { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, { -1, NULL }, }; @@ -1636,6 +1733,9 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 320, // g_width 240, // g_height + 0, // g_limit + 0, // 
g_forced_max_frame_width + 0, // g_forced_max_frame_height AOM_BITS_8, // g_bit_depth 8, // g_input_bit_depth @@ -1645,7 +1745,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { AOM_RC_ONE_PASS, // g_pass - 17, // g_lag_in_frames + 19, // g_lag_in_frames 0, // rc_dropframe_thresh RESIZE_NONE, // rc_resize_mode @@ -1676,14 +1776,21 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 2000, // rc_two_pass_vbrmax_section // keyframing settings (kf) + 0, // fwd_kf_enabled AOM_KF_AUTO, // g_kfmode 0, // kf_min_dist 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb 0, // tile_width_count 0, // tile_height_count { 0 }, // tile_widths { 0 }, // tile_heights + { 1 }, // config file } }, }; @@ -1693,13 +1800,11 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { CODEC_INTERFACE(aom_codec_av1_cx) = { "AOMedia Project AV1 Encoder" VERSION_STRING, AOM_CODEC_INTERNAL_ABI_VERSION, -#if CONFIG_HIGHBITDEPTH - AOM_CODEC_CAP_HIGHBITDEPTH | -#endif - AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR, // aom_codec_caps_t - encoder_init, // aom_codec_init_fn_t - encoder_destroy, // aom_codec_destroy_fn_t - encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER | + AOM_CODEC_CAP_PSNR, // aom_codec_caps_t + encoder_init, // aom_codec_init_fn_t + encoder_destroy, // aom_codec_destroy_fn_t + encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t { // NOLINT NULL, // aom_codec_peek_si_fn_t diff --git a/third_party/aom/av1/av1_dx.mk b/third_party/aom/av1/av1_dx.mk deleted file mode 100644 index 6f113c3c6..000000000 --- a/third_party/aom/av1/av1_dx.mk +++ /dev/null @@ -1,67 +0,0 @@ -## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## - -AV1_DX_EXPORTS += exports_dec - -AV1_DX_SRCS-yes += $(AV1_COMMON_SRCS-yes) -AV1_DX_SRCS-no += $(AV1_COMMON_SRCS-no) -AV1_DX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes) -AV1_DX_SRCS_REMOVE-no += $(AV1_COMMON_SRCS_REMOVE-no) - -AV1_DX_SRCS-yes += av1_dx_iface.c - -AV1_DX_SRCS-yes += decoder/decodemv.c -AV1_DX_SRCS-yes += decoder/decodeframe.c -AV1_DX_SRCS-yes += decoder/decodeframe.h -AV1_DX_SRCS-yes += decoder/detokenize.c -AV1_DX_SRCS-yes += decoder/decodemv.h -AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.c -AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.h -AV1_DX_SRCS-yes += decoder/detokenize.h -AV1_DX_SRCS-yes += decoder/dthread.c -AV1_DX_SRCS-yes += decoder/dthread.h -AV1_DX_SRCS-yes += decoder/decoder.c -AV1_DX_SRCS-yes += decoder/decoder.h -AV1_DX_SRCS-yes += decoder/dsubexp.c -AV1_DX_SRCS-yes += decoder/dsubexp.h -AV1_DX_SRCS-yes += decoder/symbolrate.h - -ifeq ($(CONFIG_ACCOUNTING),yes) -AV1_DX_SRCS-yes += decoder/accounting.h -AV1_DX_SRCS-yes += decoder/accounting.c -endif - -ifeq ($(CONFIG_INSPECTION),yes) -AV1_DX_SRCS-yes += decoder/inspection.c -AV1_DX_SRCS-yes += decoder/inspection.h -endif - -ifeq ($(CONFIG_PVQ),yes) -# PVQ from daala -AV1_DX_SRCS-yes += decoder/pvq_decoder.c -AV1_DX_SRCS-yes += decoder/pvq_decoder.h -AV1_DX_SRCS-yes += decoder/decint.h -AV1_DX_SRCS-yes += decoder/generic_decoder.c -AV1_DX_SRCS-yes += decoder/laplace_decoder.c -AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.c -AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.h - -AV1_DX_SRCS-yes += encoder/dct.c -AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm -AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c - -AV1_DX_SRCS-$(HAVE_MSA) += 
encoder/mips/msa/fdct4x4_msa.c -AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c -AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c -AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h -endif - -AV1_DX_SRCS-yes := $(filter-out $(AV1_DX_SRCS_REMOVE-yes),$(AV1_DX_SRCS-yes)) diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c index c2f433d38..db338f7e3 100644 --- a/third_party/aom/av1/av1_dx_iface.c +++ b/third_party/aom/av1/av1_dx_iface.c @@ -12,14 +12,15 @@ #include #include -#include "./aom_config.h" -#include "./aom_version.h" +#include "config/aom_config.h" +#include "config/aom_version.h" #include "aom/internal/aom_codec_internal.h" #include "aom/aomdx.h" #include "aom/aom_decoder.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem_ops.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" @@ -28,26 +29,16 @@ #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" +#include "av1/decoder/obu.h" #include "av1/av1_iface_common.h" -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - aom_image_t img; -} cache_frame; - struct aom_codec_alg_priv { aom_codec_priv_t base; aom_codec_dec_cfg_t cfg; aom_codec_stream_info_t si; int postproc_cfg_set; aom_postproc_cfg_t postproc_cfg; - aom_decrypt_cb decrypt_cb; - void *decrypt_state; aom_image_t img; int img_avail; int flushed; @@ -57,19 +48,20 @@ struct aom_codec_alg_priv { int skip_loop_filter; int decode_tile_row; int decode_tile_col; + unsigned int tile_mode; + unsigned int ext_tile_debug; + EXTERNAL_REFERENCES ext_refs; + unsigned int is_annexb; + int operating_point; + int output_all_layers; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. 
AVxWorker *frame_workers; int num_frame_workers; int next_submit_worker_id; int last_submit_worker_id; int next_output_worker_id; int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; + aom_image_t *image_with_grain; int need_resync; // wait for key/intra-only frame // BufferPool that holds all reference frames. Shared by all the FrameWorkers. BufferPool *buffer_pool; @@ -100,18 +92,16 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx, ctx->priv = (aom_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; priv->flushed = 0; - // Only do frame parallel decode when threads > 1. - priv->frame_parallel_decode = - (ctx->config.dec && (ctx->config.dec->threads > 1) && - (ctx->init_flags & AOM_CODEC_USE_FRAME_THREADING)) - ? 1 - : 0; + // TODO(tdaede): this should not be exposed to the API priv->cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; + // default values + priv->cfg.cfg.ext_partition = 1; } + priv->image_with_grain = NULL; } return AOM_CODEC_OK; @@ -125,10 +115,10 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; aom_get_worker_interface()->end(worker); + aom_free(frame_worker_data->pbi->common.tpl_mvs); + frame_worker_data->pbi->common.tpl_mvs = NULL; av1_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_LOOP_RESTORATION av1_free_restoration_buffers(&frame_worker_data->pbi->common); -#endif // CONFIG_LOOP_RESTORATION av1_decoder_remove(frame_worker_data->pbi); aom_free(frame_worker_data->scratch_buffer); #if CONFIG_MULTITHREAD @@ -149,176 +139,143 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { aom_free(ctx->frame_workers); aom_free(ctx->buffer_pool); + if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain); aom_free(ctx); return AOM_CODEC_OK; } 
-#if !CONFIG_OBU -static int parse_bitdepth_colorspace_sampling(BITSTREAM_PROFILE profile, - struct aom_read_bit_buffer *rb) { - aom_color_space_t color_space; -#if CONFIG_COLORSPACE_HEADERS - int subsampling_x = 0; - int subsampling_y = 0; -#endif - - if (profile >= PROFILE_2) rb->bit_offset += 1; // Bit-depth 10 or 12. -#if CONFIG_COLORSPACE_HEADERS - color_space = (aom_color_space_t)aom_rb_read_literal(rb, 5); - rb->bit_offset += 5; // Transfer function -#else - color_space = (aom_color_space_t)aom_rb_read_literal(rb, 3); -#endif - if (color_space != AOM_CS_SRGB) { - rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. +// Parses the operating points (including operating_point_idc, seq_level_idx, +// and seq_tier) and then sets si->number_spatial_layers and +// si->number_temporal_layers based on operating_point_idc[0]. +static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb, + int is_reduced_header, + aom_codec_stream_info_t *si) { + int operating_point_idc0 = 0; - if (profile == PROFILE_1 || profile == PROFILE_3) { -#if CONFIG_COLORSPACE_HEADERS - subsampling_x = aom_rb_read_bit(rb); - subsampling_y = aom_rb_read_bit(rb); -#else - rb->bit_offset += 2; // subsampling x/y. -#endif - rb->bit_offset += 1; // unused. -#if CONFIG_COLORSPACE_HEADERS - } else { - subsampling_x = 1; - subsampling_y = 1; - } - if (subsampling_x == 1 && subsampling_y == 1) { - rb->bit_offset += 2; - } -#else - } -#endif + if (is_reduced_header) { + aom_rb_read_literal(rb, LEVEL_BITS); // level } else { - if (profile == PROFILE_1 || profile == PROFILE_3) { - rb->bit_offset += 1; // unused - } else { - // RGB is only available in version 1. 
- return 0; + const uint8_t operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) { + int operating_point_idc; + operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (i == 0) operating_point_idc0 = operating_point_idc; + int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level + if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier } } - return 1; -} -#endif - -static aom_codec_err_t decoder_peek_si_internal( - const uint8_t *data, unsigned int data_sz, aom_codec_stream_info_t *si, - int *is_intra_only, aom_decrypt_cb decrypt_cb, void *decrypt_state) { - int intra_only_flag = 0; - uint8_t clear_buffer[9]; - if (data + data_sz <= data) return AOM_CODEC_INVALID_PARAM; - - si->is_kf = 0; - si->w = si->h = 0; - - if (decrypt_cb) { - data_sz = AOMMIN(sizeof(clear_buffer), data_sz); - decrypt_cb(decrypt_state, data, clear_buffer, data_sz); - data = clear_buffer; + if (aom_get_num_layers_from_operating_point_idc( + operating_point_idc0, &si->number_spatial_layers, + &si->number_temporal_layers) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; } - // skip a potential superframe index - { - uint32_t frame_sizes[8]; - int frame_count; - int index_size = 0; - aom_codec_err_t res = av1_parse_superframe_index( - data, data_sz, frame_sizes, &frame_count, &index_size, NULL, NULL); - if (res != AOM_CODEC_OK) return res; + return AOM_CODEC_OK; +} - data += index_size; - data_sz -= index_size; -#if CONFIG_OBU - if (data + data_sz <= data) return AOM_CODEC_INVALID_PARAM; -#endif +static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, + size_t data_sz, + aom_codec_stream_info_t *si, + int *is_intra_only) { + int intra_only_flag = 0; + int got_sequence_header = 0; + int found_keyframe = 0; + + if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM; + + si->w = 0; + si->h = 0; + si->is_kf = 0; // is_kf indicates whether the current packet 
contains a RAP + + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + size_t payload_size = 0; + size_t bytes_read = 0; + int reduced_still_picture_hdr = 0; + aom_codec_err_t status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; + + // If the first OBU is a temporal delimiter, skip over it and look at the next + // OBU in the bitstream + if (obu_header.type == OBU_TEMPORAL_DELIMITER) { + // Skip any associated payload (there shouldn't be one, but just in case) + if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME; + data += bytes_read + payload_size; + data_sz -= bytes_read + payload_size; + + status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; } - - { -#if CONFIG_OBU - // Proper fix needed - si->is_kf = 1; - intra_only_flag = 1; - si->h = 1; -#else - int show_frame; - int error_resilient; - struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; - const int frame_marker = aom_rb_read_literal(&rb, 2); - const BITSTREAM_PROFILE profile = av1_read_profile(&rb); -#if CONFIG_EXT_TILE - unsigned int large_scale_tile; -#endif // CONFIG_EXT_TILE - - if (frame_marker != AOM_FRAME_MARKER) return AOM_CODEC_UNSUP_BITSTREAM; - - if (profile >= MAX_PROFILES) return AOM_CODEC_UNSUP_BITSTREAM; - - if ((profile >= 2 && data_sz <= 1) || data_sz < 1) - return AOM_CODEC_UNSUP_BITSTREAM; - -#if CONFIG_EXT_TILE - large_scale_tile = aom_rb_read_literal(&rb, 1); -#endif // CONFIG_EXT_TILE - - if (aom_rb_read_bit(&rb)) { // show an existing frame - aom_rb_read_literal(&rb, 3); // Frame buffer to show. - return AOM_CODEC_OK; - } - - if (data_sz <= 8) return AOM_CODEC_UNSUP_BITSTREAM; - - si->is_kf = !aom_rb_read_bit(&rb); - show_frame = aom_rb_read_bit(&rb); - if (!si->is_kf) { - if (!show_frame) intra_only_flag = show_frame ? 
0 : aom_rb_read_bit(&rb); - } - error_resilient = aom_rb_read_bit(&rb); -#if CONFIG_REFERENCE_BUFFER - SequenceHeader seq_params = { 0, 0, 0 }; - if (si->is_kf) { - /* TODO: Move outside frame loop or inside key-frame branch */ - read_sequence_header(&seq_params, &rb); -#if CONFIG_EXT_TILE - if (large_scale_tile) seq_params.frame_id_numbers_present_flag = 0; -#endif // CONFIG_EXT_TILE - } -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_REFERENCE_BUFFER - if (seq_params.frame_id_numbers_present_flag) { - int frame_id_len; - frame_id_len = seq_params.frame_id_length_minus7 + 7; - aom_rb_read_literal(&rb, frame_id_len); - } -#endif // CONFIG_REFERENCE_BUFFER - if (si->is_kf) { - if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + while (1) { + data += bytes_read; + data_sz -= bytes_read; + const uint8_t *payload_start = data; + // Check that the selected OBU is a sequence header + if (obu_header.type == OBU_SEQUENCE_HEADER) { + // Sanity check on sequence header size + if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME; + // Read a few values from the sequence header payload + struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + + av1_read_profile(&rb); // profile + const int still_picture = aom_rb_read_bit(&rb); + reduced_still_picture_hdr = aom_rb_read_bit(&rb); + + if (!still_picture && reduced_still_picture_hdr) { return AOM_CODEC_UNSUP_BITSTREAM; - av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); - } else { - rb.bit_offset += error_resilient ? 
0 : 2; // reset_frame_context + } - if (intra_only_flag) { - if (profile > PROFILE_0) { - if (!parse_bitdepth_colorspace_sampling(profile, &rb)) - return AOM_CODEC_UNSUP_BITSTREAM; + if (parse_operating_points(&rb, reduced_still_picture_hdr, si) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + int num_bits_width = aom_rb_read_literal(&rb, 4) + 1; + int num_bits_height = aom_rb_read_literal(&rb, 4) + 1; + int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1; + int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1; + si->w = max_frame_width; + si->h = max_frame_height; + got_sequence_header = 1; + } else if (obu_header.type == OBU_FRAME_HEADER || + obu_header.type == OBU_FRAME) { + if (got_sequence_header && reduced_still_picture_hdr) { + found_keyframe = 1; + break; + } else { + // make sure we have enough bits to get the frame type out + if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME; + struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int show_existing_frame = aom_rb_read_bit(&rb); + if (!show_existing_frame) { + const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2); + if (frame_type == KEY_FRAME) { + found_keyframe = 1; + break; // Stop here as no further OBUs will change the outcome. 
+ } } - rb.bit_offset += REF_FRAMES; // refresh_frame_flags - av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); } } -#endif // CONFIG_OBU + // skip past any unread OBU header data + data = payload_start + payload_size; + data_sz -= payload_size; + if (data_sz <= 0) break; // exit if we're out of OBUs + status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; } + if (got_sequence_header && found_keyframe) si->is_kf = 1; if (is_intra_only != NULL) *is_intra_only = intra_only_flag; return AOM_CODEC_OK; } -static aom_codec_err_t decoder_peek_si(const uint8_t *data, - unsigned int data_sz, +static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si) { - return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL); + return decoder_peek_si_internal(data, data_sz, si, NULL); } static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx, @@ -386,25 +343,7 @@ static int frame_worker_hook(void *arg1, void *arg2) { frame_worker_data->pbi, frame_worker_data->data_size, &data); frame_worker_data->data_end = data; - if (frame_worker_data->pbi->common.frame_parallel_decode) { - // In frame parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - AVxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. 
- av1_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - av1_frameworker_signal_stats(worker); - av1_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { + if (frame_worker_data->result != 0) { // Check decode result in serial decode. frame_worker_data->pbi->cur_buf->buf.corrupted = 1; frame_worker_data->pbi->need_resync = 1; @@ -420,12 +359,8 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { ctx->next_submit_worker_id = 0; ctx->last_submit_worker_id = 0; ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1; + ctx->num_frame_workers = 1; if (ctx->num_frame_workers > MAX_DECODE_THREADS) ctx->num_frame_workers = MAX_DECODE_THREADS; ctx->available_threads = ctx->num_frame_workers; @@ -463,6 +398,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { set_error_detail(ctx, "Failed to allocate frame_worker_data"); return AOM_CODEC_MEM_ERROR; } + frame_worker_data->pbi->common.options = &ctx->cfg.cfg; frame_worker_data->pbi->frame_worker_owner = worker; frame_worker_data->worker_id = i; frame_worker_data->scratch_buffer = NULL; @@ -484,12 +420,16 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { // If decoding in serial mode, FrameWorker thread could create tile worker // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? 
ctx->cfg.threads : 0; - + frame_worker_data->pbi->max_threads = ctx->cfg.threads; frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; + frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode; + frame_worker_data->pbi->common.is_annexb = ctx->is_annexb; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->operating_point = ctx->operating_point; + frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + worker->hook = (AVxWorkerHook)frame_worker_hook; if (!winterface->reset(worker)) { set_error_detail(ctx, "Frame Worker thread creation failed"); @@ -516,137 +456,82 @@ static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, } static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, - const uint8_t **data, unsigned int data_sz, - void *user_priv, int64_t deadline) { + const uint8_t **data, size_t data_sz, + void *user_priv) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. 
if (!ctx->si.h) { int is_intra_only = 0; + ctx->si.is_annexb = ctx->is_annexb; const aom_codec_err_t res = - decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only, - ctx->decrypt_cb, ctx->decrypt_state); + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only); if (res != AOM_CODEC_OK) return res; if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - AVxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; - - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; + AVxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; + frame_worker_data->received_frame = 1; + #if CONFIG_INSPECTION - frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; - frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; + frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; + frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; #endif -#if CONFIG_EXT_TILE - frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; - frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; -#endif // CONFIG_EXT_TILE + frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->ext_refs = ctx->ext_refs; - worker->had_error = 
0; - winterface->execute(worker); + frame_worker_data->pbi->common.is_annexb = ctx->is_annexb; - // Update data pointer after decode. - *data = frame_worker_data->data_end; + worker->had_error = 0; + winterface->execute(worker); - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); + // Update data pointer after decode. + *data = frame_worker_data->data_end; - check_resync(ctx, frame_worker_data->pbi); - } else { - AVxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - av1_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. - // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. 
- if (frame_worker_data->scratch_buffer_size < data_sz) { - aom_free(frame_worker_data->scratch_buffer); - frame_worker_data->scratch_buffer = (uint8_t *)aom_malloc(data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return AOM_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); - } - - return AOM_CODEC_OK; -} - -static void wait_worker_and_cache_frame(aom_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. 
- winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; + if (worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); check_resync(ctx, frame_worker_data->pbi); - if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) { - AV1_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } + return AOM_CODEC_OK; } static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, - const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + const uint8_t *data, size_t data_sz, + void *user_priv) { const uint8_t *data_start = data; - const uint8_t *const data_end = data + data_sz; - aom_codec_err_t res; - uint32_t frame_sizes[8]; - int frame_count; + const uint8_t *data_end = data + data_sz; + aom_codec_err_t res = AOM_CODEC_OK; + + // Release any pending output frames from the previous decoder call. 
+ // We need to do this even if the decoder is being flushed + if (ctx->frame_workers) { + BufferPool *const pool = ctx->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + lock_buffer_pool(pool); + for (int i = 0; i < ctx->num_frame_workers; ++i) { + AVxWorker *const worker = &ctx->frame_workers[i]; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + struct AV1Decoder *pbi = frame_worker_data->pbi; + for (size_t j = 0; j < pbi->num_output_frames; j++) { + decrease_ref_count((int)pbi->output_frame_index[j], frame_bufs, pool); + } + pbi->num_output_frames = 0; + } + unlock_buffer_pool(ctx->buffer_pool); + } if (data == NULL && data_sz == 0) { ctx->flushed = 1; @@ -662,142 +547,91 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, if (res != AOM_CODEC_OK) return res; } - int index_size = 0; - res = av1_parse_superframe_index(data, data_sz, frame_sizes, &frame_count, - &index_size, ctx->decrypt_cb, - ctx->decrypt_state); - if (res != AOM_CODEC_OK) return res; - - data_start += index_size; - - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. - if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return AOM_CODEC_CORRUPT_FRAME; - } - - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. 
- if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return AOM_CODEC_ERROR; - } - } + if (ctx->is_annexb) { + // read the size of this temporal unit + size_t length_of_size; + uint64_t temporal_unit_size; + if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, + &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (temporal_unit_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + data_end = data_start + temporal_unit_size; + } - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != AOM_CODEC_OK) return res; - data_start += frame_size; + // Decode in serial mode. + while (data_start < data_end) { + uint64_t frame_size; + if (ctx->is_annexb) { + // read the size of this frame unit + size_t length_of_size; + if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), + &frame_size, &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; } + data_start += length_of_size; + if (frame_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return AOM_CODEC_ERROR; - } - } - - res = decode_one(ctx, &data, data_sz, user_priv, deadline); - if (res != AOM_CODEC_OK) return res; + frame_size = (uint64_t)(data_end - data_start); } - } else { - // Decode in serial mode. 
- if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return AOM_CODEC_CORRUPT_FRAME; - } - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != AOM_CODEC_OK) return res; + res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); + if (res != AOM_CODEC_OK) return res; - data_start += frame_size; - } - } else { - while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != AOM_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. - while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } - } + // Allow extra zero bytes after the frame end + while (data_start < data_end) { + const uint8_t marker = data_start[0]; + if (marker) break; + ++data_start; } } return res; } -static void release_last_output_frame(aom_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. 
- if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); +aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf, + aom_film_grain_t *grain_params) { + if (!grain_params->apply_grain) return img; + + if (grain_img_buf && + (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h || + img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) { + aom_img_free(grain_img_buf); + grain_img_buf = NULL; } + if (!grain_img_buf) { + int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w; + int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h; + grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16); + grain_img_buf->bit_depth = img->bit_depth; + } + + av1_add_film_grain(grain_params, img, grain_img_buf); + + return grain_img_buf; } static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter) { aom_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { + if (!iter) { return NULL; } - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } + // To avoid having to allocate any extra storage, treat 'iter' as + // simply a pointer to an integer index + uintptr_t *index = (uintptr_t *)iter; - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. 
- if (*iter == NULL && ctx->frame_workers != NULL) { + if (ctx->frame_workers != NULL) { do { - YV12_BUFFER_CONFIG sd; + YV12_BUFFER_CONFIG *sd; + // NOTE(david.barker): This code does not support multiple worker threads + // yet. We should probably move the iteration over threads into *iter + // instead of using ctx->next_output_worker_id. const AVxWorkerInterface *const winterface = aom_get_worker_interface(); AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; FrameWorkerData *const frame_worker_data = @@ -812,50 +646,64 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, frame_worker_data->received_frame = 0; check_resync(ctx, frame_worker_data->pbi); } - if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) { - AV1_COMMON *const cm = &frame_worker_data->pbi->common; + aom_film_grain_t *grain_params; + if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, + &grain_params) == 0) { + *index += 1; // Advance the iterator to point to the next image + + AV1Decoder *const pbi = frame_worker_data->pbi; + AV1_COMMON *const cm = &pbi->common; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; + ctx->last_show_frame = cm->new_fb_idx; if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); + yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); + + if (!pbi->ext_tile_debug && cm->large_scale_tile) { + img = &ctx->img; + img->img_data = pbi->tile_list_output; + img->sz = pbi->tile_list_size; + return img; + } -#if CONFIG_EXT_TILE - if (cm->single_tile_decoding && - frame_worker_data->pbi->dec_tile_row >= 0) { - const int tile_row = - AOMMIN(frame_worker_data->pbi->dec_tile_row, cm->tile_rows - 1); + const int num_planes = av1_num_planes(cm); + if (pbi->ext_tile_debug && cm->single_tile_decoding && + pbi->dec_tile_row >= 0) { + const int tile_row = 
AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1); const int mi_row = tile_row * cm->tile_height; const int ssy = ctx->img.y_chroma_shift; int plane; ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += - mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += + mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; + } } ctx->img.d_h = AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE; } - if (cm->single_tile_decoding && - frame_worker_data->pbi->dec_tile_col >= 0) { - const int tile_col = - AOMMIN(frame_worker_data->pbi->dec_tile_col, cm->tile_cols - 1); + if (pbi->ext_tile_debug && cm->single_tile_decoding && + pbi->dec_tile_col >= 0) { + const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1); const int mi_col = tile_col * cm->tile_width; const int ssx = ctx->img.x_chroma_shift; int plane; ctx->img.planes[0] += mi_col * MI_SIZE; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx); + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx); + } } ctx->img.d_w = AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE; } -#endif // CONFIG_EXT_TILE ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; img = &ctx->img; - return img; + img->temporal_id = cm->temporal_layer_id; + img->spatial_id = cm->spatial_layer_id; + return add_grain_if_needed(img, ctx->image_with_grain, grain_params); } } else { // Decoding failed. Release the worker thread. @@ -890,12 +738,6 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, va_list args) { av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (data) { av1_ref_frame_t *const frame = data; YV12_BUFFER_CONFIG sd; @@ -903,7 +745,7 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx, - &sd); + frame->use_external_ref, &sd); } else { return AOM_CODEC_INVALID_PARAM; } @@ -912,13 +754,6 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, va_list args) { const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); - - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (frame) { YV12_BUFFER_CONFIG sd; AVxWorker *const worker = ctx->frame_workers; @@ -933,13 +768,6 @@ static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, va_list args) { av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *); - - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (data) { YV12_BUFFER_CONFIG *fb; AVxWorker *const worker = ctx->frame_workers; @@ -956,13 +784,6 @@ static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, va_list args) { aom_image_t *new_img = va_arg(args, aom_image_t *); - - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (new_img) { YV12_BUFFER_CONFIG new_frame; AVxWorker *const worker = ctx->frame_workers; @@ -979,6 +800,27 @@ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, } } +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *img = va_arg(args, aom_image_t *); + if (img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(img, &sd); + return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame, + &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx, va_list args) { (void)ctx; @@ -997,12 +839,6 @@ static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (update_info) { if (ctx->frame_workers) { AVxWorker *const worker = ctx->frame_workers; @@ -1036,9 +872,9 @@ static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, AVxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) + AV1Decoder *const pbi = frame_worker_data->pbi; + RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs; + if (pbi->seen_frame_header && pbi->num_output_frames == 0) return AOM_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; @@ -1055,12 +891,6 @@ static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; - } - if (frame_size) { if (ctx->frame_workers) { AVxWorker *const worker = ctx->frame_workers; @@ -1078,15 +908,69 @@ static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, return AOM_CODEC_INVALID_PARAM; } -static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, +static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *); + + if (frame_header_info) { + if (ctx->frame_workers) { + AVxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size; + frame_header_info->coded_tile_data = pbi->obu_size_hdr.data; + frame_header_info->extra_size = pbi->frame_header_size; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const tile_data = va_arg(args, aom_tile_data *); + + if (tile_data) { + if (ctx->frame_workers) { + AVxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + tile_data->coded_tile_data_size = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size; + tile_data->coded_tile_data = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx, va_list args) { - int *const render_size = va_arg(args, int *); + av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t 
*); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return AOM_CODEC_INCAPABLE; + if (data) { + av1_ext_ref_frame_t *const ext_frames = data; + ctx->ext_refs.num = ext_frames->num; + for (int i = 0; i < ctx->ext_refs.num; i++) { + image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]); + } + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; } +} + +static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); if (render_size) { if (ctx->frame_workers) { @@ -1131,14 +1015,6 @@ static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } -static aom_codec_err_t ctrl_set_decryptor(aom_codec_alg_priv_t *ctx, - va_list args) { - aom_decrypt_init *init = va_arg(args, aom_decrypt_init *); - ctx->decrypt_cb = init ? init->decrypt_cb : NULL; - ctx->decrypt_state = init ? 
init->decrypt_state : NULL; - return AOM_CODEC_OK; -} - static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx, va_list args) { const int legacy_byte_alignment = 0; @@ -1204,6 +1080,30 @@ static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->tile_mode = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->is_annexb = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->operating_point = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->output_all_layers = va_arg(args, int); + return AOM_CODEC_OK; +} + static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_INSPECTION @@ -1218,6 +1118,12 @@ static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, #endif } +static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->ext_tile_debug = va_arg(args, int); + return AOM_CODEC_OK; +} + static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, @@ -1229,12 +1135,17 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options }, { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options }, { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, - { AOMD_SET_DECRYPTOR, ctrl_set_decryptor }, { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row }, { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col }, + { AV1_SET_TILE_MODE, 
ctrl_set_tile_mode }, + { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb }, + { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point }, + { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, + { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, + { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, // Getters { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, @@ -1245,7 +1156,10 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, { AV1_GET_ACCOUNTING, ctrl_get_accounting }, { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, { AV1_GET_REFERENCE, ctrl_get_reference }, + { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info }, + { AV1D_GET_TILE_DATA, ctrl_get_tile_data }, { -1, NULL }, }; diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h index 6c9a2a6cb..c03892b73 100644 --- a/third_party/aom/av1/av1_iface_common.h +++ b/third_party/aom/av1/av1_iface_common.h @@ -15,10 +15,11 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { - /** aom_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + /* aom_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields. 
+ */ int bps; if (!yv12->subsampling_y) { if (!yv12->subsampling_x) { @@ -29,23 +30,18 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, bps = 16; } } else { - if (!yv12->subsampling_x) { - img->fmt = AOM_IMG_FMT_I440; - bps = 16; - } else { - img->fmt = AOM_IMG_FMT_I420; - bps = 12; - } + img->fmt = AOM_IMG_FMT_I420; + bps = 12; } - img->cs = yv12->color_space; -#if CONFIG_COLORSPACE_HEADERS - img->tf = yv12->transfer_function; + img->cp = yv12->color_primaries; + img->tc = yv12->transfer_characteristics; + img->mc = yv12->matrix_coefficients; + img->monochrome = yv12->monochrome; img->csp = yv12->chroma_sample_position; -#endif img->range = yv12->color_range; img->bit_depth = 8; - img->w = yv12->y_stride; - img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * AOM_BORDER_IN_PIXELS, 3); + img->w = yv12->y_width; + img->h = yv12->y_height; img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; img->r_w = yv12->render_width; @@ -60,7 +56,6 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, img->stride[AOM_PLANE_U] = yv12->uv_stride; img->stride[AOM_PLANE_V] = yv12->uv_stride; img->stride[AOM_PLANE_ALPHA] = yv12->y_stride; -#if CONFIG_HIGHBITDEPTH if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { // aom_image_t uses byte strides and a pointer to the first byte // of the image. @@ -75,7 +70,6 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride; img->stride[AOM_PLANE_ALPHA] = 2 * yv12->y_stride; } -#endif // CONFIG_HIGHBITDEPTH img->bps = bps; img->user_priv = user_priv; img->img_data = yv12->buffer_alloc; @@ -93,8 +87,8 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img, yv12->y_crop_height = img->d_h; yv12->render_width = img->r_w; yv12->render_height = img->r_h; - yv12->y_width = img->d_w; - yv12->y_height = img->d_h; + yv12->y_width = img->w; + yv12->y_height = img->h; yv12->uv_width = img->x_chroma_shift == 1 ? 
(1 + yv12->y_width) / 2 : yv12->y_width; @@ -105,14 +99,13 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img, yv12->y_stride = img->stride[AOM_PLANE_Y]; yv12->uv_stride = img->stride[AOM_PLANE_U]; - yv12->color_space = img->cs; -#if CONFIG_COLORSPACE_HEADERS - yv12->transfer_function = img->tf; + yv12->color_primaries = img->cp; + yv12->transfer_characteristics = img->tc; + yv12->matrix_coefficients = img->mc; + yv12->monochrome = img->monochrome; yv12->chroma_sample_position = img->csp; -#endif yv12->color_range = img->range; -#if CONFIG_HIGHBITDEPTH if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { // In aom_image_t // planes point to uint8 address of start of data @@ -134,9 +127,6 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img, yv12->flags = 0; } yv12->border = (yv12->y_stride - img->w) / 2; -#else - yv12->border = (img->stride[AOM_PLANE_Y] - img->w) / 2; -#endif // CONFIG_HIGHBITDEPTH yv12->subsampling_x = img->x_chroma_shift; yv12->subsampling_y = img->y_chroma_shift; return AOM_CODEC_OK; diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c index fd635686f..49902cc7d 100644 --- a/third_party/aom/av1/common/alloccommon.c +++ b/third_party/aom/av1/common/alloccommon.c @@ -10,7 +10,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_mem/aom_mem.h" #include "av1/common/alloccommon.h" @@ -25,16 +26,43 @@ int av1_get_MBs(int width, int height) { const int mi_cols = aligned_width >> MI_SIZE_LOG2; const int mi_rows = aligned_height >> MI_SIZE_LOG2; -#if CONFIG_CB4X4 const int mb_cols = (mi_cols + 2) >> 2; const int mb_rows = (mi_rows + 2) >> 2; -#else - const int mb_cols = (mi_cols + 1) >> 1; - const int mb_rows = (mi_rows + 1) >> 1; -#endif return mb_rows * mb_cols; } +#if LOOP_FILTER_BITMASK +static int alloc_loop_filter_mask(AV1_COMMON *cm) { + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + + // Each lfm holds bit masks for all the 4x4 blocks in a max + // 64x64 (128x128 for ext_partitions) region. The stride + // and rows are rounded up / truncated to a multiple of 16 + // (32 for ext_partition). + cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; + cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * + cm->lf.lfm_stride; + cm->lf.lfm = + (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + + unsigned int i; + for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); + + return 0; +} + +static void free_loop_filter_mask(AV1_COMMON *cm) { + if (cm->lf.lfm == NULL) return; + + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + cm->lf.lfm_num = 0; + cm->lf.lfm_stride = 0; +} +#endif + void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) { // Ensure that the decoded width and height are both multiples of // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if @@ -48,79 +76,13 @@ void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) { cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mi_stride = calc_mi_size(cm->mi_cols); -#if CONFIG_CB4X4 cm->mb_cols = (cm->mi_cols + 2) >> 2; cm->mb_rows = (cm->mi_rows + 2) >> 2; -#else - cm->mb_cols = (cm->mi_cols + 1) >> 1; - cm->mb_rows = (cm->mi_rows + 1) >> 1; 
-#endif cm->MBs = cm->mb_rows * cm->mb_cols; -} - -static int alloc_seg_map(AV1_COMMON *cm, int seg_map_size) { - int i; - - for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { - cm->seg_map_array[i] = (uint8_t *)aom_calloc(seg_map_size, 1); - if (cm->seg_map_array[i] == NULL) return 1; - } - cm->seg_map_alloc_size = seg_map_size; - - // Init the index. - cm->seg_map_idx = 0; - cm->prev_seg_map_idx = 1; - - cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; - - return 0; -} - -static void free_seg_map(AV1_COMMON *cm) { - int i; - for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { - aom_free(cm->seg_map_array[i]); - cm->seg_map_array[i] = NULL; - } - - cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } - cm->seg_map_alloc_size = 0; -} - -static void free_scratch_buffers(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT - for (int i = 0; i < 4; ++i) { - if (cm->ncobmcaw_buf[i]) { - aom_free(cm->ncobmcaw_buf[i]); - cm->ncobmcaw_buf[i] = NULL; - } - } -#endif // CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT -} - -static int alloc_scratch_buffers(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT - // If not allocated already, allocate - if (!cm->ncobmcaw_buf[0] && !cm->ncobmcaw_buf[1] && !cm->ncobmcaw_buf[2] && - !cm->ncobmcaw_buf[3]) { - for (int i = 0; i < 4; ++i) { - CHECK_MEM_ERROR( - cm, cm->ncobmcaw_buf[i], - (uint8_t *)aom_memalign( - 16, (1 + CONFIG_HIGHBITDEPTH) * MAX_MB_PLANE * MAX_SB_SQUARE)); - } - } -#endif // CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT - return 0; +#if LOOP_FILTER_BITMASK + alloc_loop_filter_mask(cm); +#endif } void av1_free_ref_frame_buffers(BufferPool *pool) { @@ -134,97 +96,179 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { } aom_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; -#if CONFIG_MFMV - 
aom_free(pool->frame_bufs[i].tpl_mvs); - pool->frame_bufs[i].tpl_mvs = NULL; -#endif + aom_free(pool->frame_bufs[i].seg_map); + pool->frame_bufs[i].seg_map = NULL; aom_free_frame_buffer(&pool->frame_bufs[i].buf); -#if CONFIG_HASH_ME - av1_hash_table_destroy(&pool->frame_bufs[i].hash_table); -#endif } } -#if CONFIG_LOOP_RESTORATION -// Assumes cm->rst_info[p].restoration_tilesize is already initialized +// Assumes cm->rst_info[p].restoration_unit_size is already initialized void av1_alloc_restoration_buffers(AV1_COMMON *cm) { - int p; -#if CONFIG_FRAME_SUPERRES - int width = cm->superres_upscaled_width; - int height = cm->superres_upscaled_height; -#else - int width = cm->width; - int height = cm->height; -#endif // CONFIG_FRAME_SUPERRES - av1_alloc_restoration_struct(cm, &cm->rst_info[0], width, height); - for (p = 1; p < MAX_MB_PLANE; ++p) - av1_alloc_restoration_struct(cm, &cm->rst_info[p], - ROUND_POWER_OF_TWO(width, cm->subsampling_x), - ROUND_POWER_OF_TWO(height, cm->subsampling_y)); - aom_free(cm->rst_internal.tmpbuf); - CHECK_MEM_ERROR(cm, cm->rst_internal.tmpbuf, - (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); - -#if CONFIG_STRIPED_LOOP_RESTORATION - // Allocate internal storage for the loop restoration stripe boundary lines - for (p = 0; p < MAX_MB_PLANE; ++p) { - int w = p == 0 ? 
width : ROUND_POWER_OF_TWO(width, cm->subsampling_x); - int align_bits = 5; // align for efficiency - int stride = ALIGN_POWER_OF_TWO(w, align_bits); - int num_stripes = (height + 63) / 64; - // for each processing stripe: 2 lines above, 2 below - int buf_size = num_stripes * 2 * stride; - uint8_t *above_buf, *below_buf; - - aom_free(cm->rst_internal.stripe_boundary_above[p]); - aom_free(cm->rst_internal.stripe_boundary_below[p]); - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) buf_size = buf_size * 2; -#endif - CHECK_MEM_ERROR(cm, above_buf, - (uint8_t *)aom_memalign(1 << align_bits, buf_size)); - CHECK_MEM_ERROR(cm, below_buf, - (uint8_t *)aom_memalign(1 << align_bits, buf_size)); - cm->rst_internal.stripe_boundary_above[p] = above_buf; - cm->rst_internal.stripe_boundary_below[p] = below_buf; - cm->rst_internal.stripe_boundary_stride[p] = stride; + const int num_planes = av1_num_planes(cm); + for (int p = 0; p < num_planes; ++p) + av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0); + + if (cm->rst_tmpbuf == NULL) { + CHECK_MEM_ERROR(cm, cm->rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + } + + if (cm->rlbs == NULL) { + CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers))); + } + + // For striped loop restoration, we divide each row of tiles into "stripes", + // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET + // luma pixels to match the output from CDEF. We will need to store 2 * + // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be + // able to quickly answer the question "Where is the 'th stripe for tile + // row ?" To make that efficient, we generate the rst_last_stripe array. 
+ int num_stripes = 0; + for (int i = 0; i < cm->tile_rows; ++i) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, i); + const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start; + const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); + const int tile_stripes = (ext_h + 63) / 64; + num_stripes += tile_stripes; + cm->rst_end_stripe[i] = num_stripes; + } + + // Now we need to allocate enough space to store the line buffers for the + // stripes + const int frame_w = cm->superres_upscaled_width; + const int use_highbd = cm->use_highbitdepth ? 1 : 0; + + for (int p = 0; p < num_planes; ++p) { + const int is_uv = p > 0; + const int ss_x = is_uv && cm->subsampling_x; + const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; + const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); + const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT + << use_highbd; + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + + if (buf_size != boundaries->stripe_boundary_size || + boundaries->stripe_boundary_above == NULL || + boundaries->stripe_boundary_below == NULL) { + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, + (uint8_t *)aom_memalign(32, buf_size)); + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, + (uint8_t *)aom_memalign(32, buf_size)); + + boundaries->stripe_boundary_size = buf_size; + } + boundaries->stripe_boundary_stride = stride; } -#endif // CONFIG_STRIPED_LOOP_RESTORATION } void av1_free_restoration_buffers(AV1_COMMON *cm) { int p; for (p = 0; p < MAX_MB_PLANE; ++p) av1_free_restoration_struct(&cm->rst_info[p]); - aom_free(cm->rst_internal.tmpbuf); - cm->rst_internal.tmpbuf = NULL; + aom_free(cm->rst_tmpbuf); + cm->rst_tmpbuf = NULL; + aom_free(cm->rlbs); + cm->rlbs = NULL; + for (p = 0; p < MAX_MB_PLANE; ++p) { + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + 
aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + boundaries->stripe_boundary_above = NULL; + boundaries->stripe_boundary_below = NULL; + } + + aom_free_frame_buffer(&cm->rst_frame); } -#endif // CONFIG_LOOP_RESTORATION -void av1_free_context_buffers(AV1_COMMON *cm) { +void av1_free_above_context_buffers(AV1_COMMON *cm, + int num_free_above_contexts) { int i; - cm->free_mi(cm); - free_seg_map(cm); - free_scratch_buffers(cm); - for (i = 0; i < MAX_MB_PLANE; i++) { + const int num_planes = cm->num_allocated_above_context_planes; + + for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) { + for (i = 0; i < num_planes; i++) { + aom_free(cm->above_context[i][tile_row]); + cm->above_context[i][tile_row] = NULL; + } + aom_free(cm->above_seg_context[tile_row]); + cm->above_seg_context[tile_row] = NULL; + + aom_free(cm->above_txfm_context[tile_row]); + cm->above_txfm_context[tile_row] = NULL; + } + for (i = 0; i < num_planes; i++) { aom_free(cm->above_context[i]); cm->above_context[i] = NULL; } aom_free(cm->above_seg_context); cm->above_seg_context = NULL; - cm->above_context_alloc_cols = 0; -#if CONFIG_VAR_TX + aom_free(cm->above_txfm_context); cm->above_txfm_context = NULL; - for (i = 0; i < MAX_MB_PLANE; ++i) { - aom_free(cm->top_txfm_context[i]); - cm->top_txfm_context[i] = NULL; - } + cm->num_allocated_above_contexts = 0; + cm->num_allocated_above_context_mi_col = 0; + cm->num_allocated_above_context_planes = 0; +} + +void av1_free_context_buffers(AV1_COMMON *cm) { + cm->free_mi(cm); + + av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + +#if LOOP_FILTER_BITMASK + free_loop_filter_mask(cm); #endif } +int av1_alloc_above_context_buffers(AV1_COMMON *cm, + int num_alloc_above_contexts) { + const int num_planes = av1_num_planes(cm); + int plane_idx; + const int aligned_mi_cols = + ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); + + // Allocate above context buffers + 
cm->num_allocated_above_contexts = num_alloc_above_contexts; + cm->num_allocated_above_context_mi_col = aligned_mi_cols; + cm->num_allocated_above_context_planes = num_planes; + for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { + cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_alloc_above_contexts, sizeof(cm->above_context[0])); + if (!cm->above_context[plane_idx]) return 1; + } + + cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc( + num_alloc_above_contexts, sizeof(cm->above_seg_context)); + if (!cm->above_seg_context) return 1; + + cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc( + num_alloc_above_contexts, sizeof(cm->above_txfm_context)); + if (!cm->above_txfm_context) return 1; + + for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) { + for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { + cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*cm->above_context[0][tile_row])); + if (!cm->above_context[plane_idx][tile_row]) return 1; + } + + cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row])); + if (!cm->above_seg_context[tile_row]) return 1; + + cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row])); + if (!cm->above_txfm_context[tile_row]) return 1; + } + + return 0; +} + int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { int new_mi_size; @@ -235,52 +279,6 @@ int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. 
- free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (alloc_scratch_buffers(cm)) goto fail; - - if (cm->above_context_alloc_cols < cm->mi_cols) { - // TODO(geza.lore): These are bigger than they need to be. - // cm->tile_width would be enough but it complicates indexing a - // little elsewhere. - const int aligned_mi_cols = - ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) { - aom_free(cm->above_context[i]); - cm->above_context[i] = (ENTROPY_CONTEXT *)aom_calloc( - aligned_mi_cols << (MI_SIZE_LOG2 - tx_size_wide_log2[0]), - sizeof(*cm->above_context[0])); - if (!cm->above_context[i]) goto fail; - } - - aom_free(cm->above_seg_context); - cm->above_seg_context = (PARTITION_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_seg_context)); - if (!cm->above_seg_context) goto fail; - -#if CONFIG_VAR_TX - aom_free(cm->above_txfm_context); - cm->above_txfm_context = (TXFM_CONTEXT *)aom_calloc( - aligned_mi_cols << TX_UNIT_WIDE_LOG2, sizeof(*cm->above_txfm_context)); - if (!cm->above_txfm_context) goto fail; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - aom_free(cm->top_txfm_context[i]); - cm->top_txfm_context[i] = - (TXFM_CONTEXT *)aom_calloc(aligned_mi_cols << TX_UNIT_WIDE_LOG2, - sizeof(*cm->top_txfm_context[0])); - if (!cm->top_txfm_context[i]) goto fail; - } -#endif - - cm->above_context_alloc_cols = aligned_mi_cols; - } - return 0; fail: @@ -299,18 +297,4 @@ void av1_remove_common(AV1_COMMON *cm) { cm->frame_contexts = NULL; } -void av1_init_context_buffers(AV1_COMMON *cm) { - cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) - memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); -} - -void av1_swap_current_and_last_seg_map(AV1_COMMON *cm) { - // Swap indices. 
- const int tmp = cm->seg_map_idx; - cm->seg_map_idx = cm->prev_seg_map_idx; - cm->prev_seg_map_idx = tmp; - - cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; -} +void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); } diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h index 0d420f825..dbcb5b947 100644 --- a/third_party/aom/av1/common/alloccommon.h +++ b/third_party/aom/av1/common/alloccommon.h @@ -23,15 +23,17 @@ struct BufferPool; void av1_remove_common(struct AV1Common *cm); +int av1_alloc_above_context_buffers(struct AV1Common *cm, + int num_alloc_above_contexts); +void av1_free_above_context_buffers(struct AV1Common *cm, + int num_free_above_contexts); int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height); void av1_init_context_buffers(struct AV1Common *cm); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); -#if CONFIG_LOOP_RESTORATION void av1_alloc_restoration_buffers(struct AV1Common *cm); void av1_free_restoration_buffers(struct AV1Common *cm); -#endif // CONFIG_LOOP_RESTORATION int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); void av1_free_state_buffers(struct AV1Common *cm); @@ -39,8 +41,6 @@ void av1_free_state_buffers(struct AV1Common *cm); void av1_set_mb_mi(struct AV1Common *cm, int width, int height); int av1_get_MBs(int width, int height); -void av1_swap_current_and_last_seg_map(struct AV1Common *cm); - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c new file mode 100644 index 000000000..de3c54724 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c @@ -0,0 +1,28 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" + +void av1_round_shift_array_neon(int32_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + for (int i = 0; i < size; i += 4) { + int32x4_t tmp_q_s32 = vld1q_s32(arr); + tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4); + vst1q_s32(arr, tmp_q_s32); + arr += 4; + } +} diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c new file mode 100644 index 000000000..0d8233744 --- /dev/null +++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c @@ -0,0 +1,134 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + uint8x8_t tmp0, tmp1; + uint8x16_t res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32, tmp1_32; + uint16x4_t tmp0_16, tmp1_16; + const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64); + + if (w >= 16) { + const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x16_t tmp0_q = vld1q_u8(src0); + const uint8x16_t tmp1_q = vld1q_u8(src1); + const uint8x16_t m_q = vld1q_u8(mask); + const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q); + res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q)); + res_low = + vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q)); + res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q), + vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + mask += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + mask -= w; + } + } else if (w == 8) { + const uint8x8_t m = vld1_u8(mask); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + 
__builtin_prefetch(src1); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 
0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c new file mode 100644 index 000000000..33b06b767 --- /dev/null +++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c @@ -0,0 +1,141 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8x8_t tmp0, tmp1; + uint8x16_t tmp0_q, tmp1_q, res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32, tmp1_32; + uint16x4_t tmp0_16, tmp1_16; + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w >= 16) { + for (int i = 0; i < h; ++i) { + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + tmp0_q = vld1q_u8(src0); + tmp1_q = vld1q_u8(src1); + res_low = vmull_u8(m, vget_low_u8(tmp0_q)); + res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q)); + res_high = vmull_u8(m, vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + } + } else if (w == 8) { + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, 
AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]); + const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]); + const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2)); + const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]); + const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]); + const uint8x8_t max_minus_m = + vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2)); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint8x8_t m1 = vdup_n_u8(mask[i]); + const uint8x8_t m2 = vdup_n_u8(mask[i + 1]); + const uint16x4x2_t m_trn = + vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2)); + const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]); + const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]); + const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]); + const uint16x4x2_t max_minus_m_trn = vtrn_u16( + 
vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2)); + const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c new file mode 100644 index 000000000..d731b6a66 --- /dev/null +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, + int16x8_t sub) { + vst1q_s16(dst + offset, + vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); +} + +static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { + return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); +} + +// Load half of a vector and duplicated in other half +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store half of a vector. +static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0]; +} + +// Store half of a vector. +static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0]; +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); + } else { + const uint8x8x4_t top = vld4_u8(input); + const uint8x8x4_t bot = vld4_u8(input + input_stride); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]); + // 
equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); + } else { + const uint8x8x4_t top = vld4_u8(input); + uint16x8x2_t sum; + // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves) + sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2); + sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); + vst2q_u16(pred_buf_q3, sum); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); + vst1_u16(pred_buf_q3, vget_low_u16(top)); + } else if (width == 8) { + const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); + 
vst1q_u16(pred_buf_q3, top); + } else { + const uint8x16_t top = vld1q_u8(input); + vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); + vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); + if (width == 32) { + const uint8x16_t next_top = vld1q_u8(input + 16); + vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3)); + vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); + } + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +#if __ARM_ARCH <= 7 +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t bot = vld1_u16(input + input_stride); + const uint16x4_t sum = vadd_u16(top, bot); + const uint16x4_t hsum = vpadd_u16(sum, sum); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else if (width < 32) { + const uint16x8_t top = vld1q_u16(input); + const uint16x8_t bot = vld1q_u16(input + input_stride); + const uint16x8_t sum = vaddq_u16(top, bot); + if (width == 8) { + const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else { + const uint16x8_t top_1 = vld1q_u16(input + 8); + const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); + const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); + const uint16x8_t hsum = vpaddq_u16(sum, sum_1); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); + } + } else { + const uint16x8x4_t top = vld4q_u16(input); + const uint16x8x4_t bot = vld4q_u16(input + input_stride); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const 
uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t hsum = vpadd_u16(top, top); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 8) { + const uint16x4x2_t top = vld2_u16(input); + // equivalent to a vpadd_u16 (because vld2 interleaves) + const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 16) { + const uint16x8x2_t top = vld2q_u16(input); + // equivalent to a vpaddq_u16 (because vld2q interleaves) + const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); + } else { + const uint16x8x4_t top = vld4q_u16(input); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); + uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), + vshlq_n_u16(hsum_1, 2) } }; + vst2q_u16(pred_buf_q3, result); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < 
end); +} + +static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); + } else if (width == 8) { + const uint16x8_t top = vld1q_u16(input); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); + } else if (width == 16) { + uint16x8x2_t top = vld2q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + vst2q_u16(pred_buf_q3, top); + } else { + uint16x8x4_t top = vld4q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + top.val[2] = vshlq_n_u16(top.val[2], 3); + top.val[3] = vshlq_n_u16(top.val[3], 3); + vst4q_u16(pred_buf_q3, top); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + +static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, + int width, int height, + int round_offset, + const int num_pel_log2) { + const uint16_t *const end = src + height * CFL_BUF_LINE; + + // Round offset is not needed, because NEON will handle the rounding. + (void)round_offset; + + // To optimize the use of the CPU pipeline, we process 4 rows per iteration + const int step = 4 * CFL_BUF_LINE; + + // At this stage, the prediction buffer contains scaled reconstructed luma + // pixels, which are positive integer and only require 15 bits. By using + // unsigned integer for the sum, we can do one addition operation inside 16 + // bits (8 lanes) before having to convert to 32 bits (4 lanes). + const uint16_t *sum_buf = src; + uint32x4_t sum_32x4 = { 0, 0, 0, 0 }; + do { + // For all widths, we load, add and combine the data so it fits in 4 lanes. 
+ if (width == 4) { + const uint16x4_t a0 = + vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); + const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), + vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); + sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); + } else if (width == 8) { + const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); + const uint16x8_t a1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); + sum_32x4 = vpadalq_u16(sum_32x4, a0); + sum_32x4 = vpadalq_u16(sum_32x4, a1); + } else { + const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); + const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); + const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); + const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); + sum_32x4 = vpadalq_u16(sum_32x4, row0); + sum_32x4 = vpadalq_u16(sum_32x4, row1); + sum_32x4 = vpadalq_u16(sum_32x4, row2); + sum_32x4 = vpadalq_u16(sum_32x4, row3); + + if (width == 32) { + const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); + const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); + const uint16x8_t row2_1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); + const uint16x8_t row3_1 = + vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); + + sum_32x4 = vpadalq_u16(sum_32x4, row0_1); + sum_32x4 = vpadalq_u16(sum_32x4, row1_1); + sum_32x4 = vpadalq_u16(sum_32x4, row2_1); + sum_32x4 = vpadalq_u16(sum_32x4, row3_1); + } + } + sum_buf += step; + } while (sum_buf < end); + + // Permute and add in such a way that each lane contains the block sum. 
+ // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] +#if __ARM_ARCH >= 8 + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); +#else + uint32x4_t flip = + vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); + sum_32x4 = vaddq_u32(sum_32x4, flip); + sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); +#endif + + // Computing the average could be done using scalars, but getting off the NEON + // engine introduces latency, so we use vqrshrn. + int16x4_t avg_16x4; + // Constant propagation makes for some ugly code. + switch (num_pel_log2) { + case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; + case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; + case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; + case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; + case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; + case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; + case 10: + avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); + break; + default: assert(0); + } + + if (width == 4) { + do { + vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } while (src < end); + } else { + const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); + do { + vldsubstq_s16(dst, src, 0, avg_16x8); + vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); + + if (width > 8) { + vldsubstq_s16(dst, src, 8, avg_16x8); + vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); + } + if (width == 32) { + vldsubstq_s16(dst, src, 16, avg_16x8); + vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 
16 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24, avg_16x8); + vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); + } + src += step; + dst += step; + } while (src < end); + } +} + +CFL_SUB_AVG_FN(neon) + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsign is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. +static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { + const int16x4_t mask = vshr_n_s16(b, 15); + return veor_s16(vadd_s16(a, mask), mask); +} + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsignq is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. 
+// Saturating negate of the 8 lanes of |a| wherever the corresponding lane of
+// |b| is negative.  Two's-complement identity: with m = b >> 15 (all-ones or
+// zero per lane), (a + m) ^ m == -a when m is all-ones, a otherwise.
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vshrq_n_s16(b, 15);
+  return veorq_s16(vaddq_s16(a, mask), mask);
+}
+
+// Predict 4 pixels: dc + sign(alpha * ac) * ((|ac_q3| * abs_alpha_q12)
+// through a saturating rounding doubling multiply-high).
+static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+                                   int16x4_t alpha_sign, int abs_alpha_q12,
+                                   int16x4_t dc) {
+  const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
+  // XOR of the two sign patterns: the sign bit of ac_sign is the sign of
+  // alpha * ac; only its sign bit is consumed (by vsign_s16 below).
+  const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3);
+  int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12);
+  return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc);
+}
+
+// 8-pixel variant of predict_w4.
+static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+                                   int16x8_t alpha_sign, int abs_alpha_q12,
+                                   int16x8_t dc) {
+  const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
+  const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3);
+  int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12);
+  return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc);
+}
+
+// 16-pixel variant.  The de-interleave done by vld2q_s16 here is undone by
+// the matching vst2_u8/vst2q_u16 in the callers, so lane order is preserved
+// end to end.
+static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
+  // does not interleave, but is not currently available in the compiler used
+  // by the AOM build system.
+  const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+  const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+  const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+  const int16x8_t scaled_luma_0 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+  const int16x8_t scaled_luma_1 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+  int16x8x2_t result;
+  result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+  result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+  return result;
+}
+
+// 32-pixel variant, same interleave-cancellation scheme as predict_w16
+// (vld4q here paired with vst4_u8/vst4q_u16 in the callers).
+static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
+  // does not interleave, but is not currently available in the compiler used
+  // by the AOM build system.
+  const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+  const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+  const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+  const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
+  const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]);
+  const int16x8_t scaled_luma_0 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+  const int16x8_t scaled_luma_1 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+  const int16x8_t scaled_luma_2 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12);
+  const int16x8_t scaled_luma_3 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12);
+  int16x8x4_t result;
+  result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+  result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+  result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc);
+  result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc);
+  return result;
+}
+
+// Low-bitdepth CfL prediction over a width x height block.  The DC value is
+// read from *dst (the first destination pixel), alpha_q3 supplies both the
+// scale magnitude (abs(alpha_q3) << 9 == q12) and, via its sign bits, the
+// direction of the correction.  Rows advance through the q3 prediction
+// buffer with stride CFL_BUF_LINE.
+static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+  if (width == 4) {
+    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+    const int16x4_t dc = vdup_n_s16(*dst);
+    do {
+      const int16x4_t pred =
+          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+      vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred)));
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  } else {
+    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+    const int16x8_t dc = vdupq_n_s16(*dst);
+    do {
+      if (width == 8) {
+        vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign,
+                                            abs_alpha_q12, dc)));
+      } else if (width == 16) {
+        const int16x8x2_t
pred =
+            predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        // vst2_u8 re-interleaves what vld2q_s16 (inside predict_w16)
+        // de-interleaved, so pixel order is preserved.
+        const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
+                                       vqmovun_s16(pred.val[1]) } };
+        vst2_u8(dst, predun);
+      } else {
+        const int16x8x4_t pred =
+            predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        const uint8x8x4_t predun = {
+          { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
+            vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
+        };
+        vst4_u8(dst, predun);
+      }
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  }
+}
+
+CFL_PREDICT_FN(neon, lbd)
+
+// Clamp 4 signed lanes into [0, max] and reinterpret as unsigned.
+static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
+  return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
+}
+
+// 8-lane variant of clamp_s16.
+static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
+  return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
+}
+
+// Clamp both halves of a 2-vector tuple (see clamp_s16).
+static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
+  uint16x8x2_t result;
+  result.val[0] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+  result.val[1] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+  return result;
+}
+
+// Clamp all four members of a 4-vector tuple (see clamp_s16).
+static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
+  uint16x8x4_t result;
+  result.val[0] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+  result.val[1] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+  result.val[2] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0)));
+  result.val[3] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0)));
+  return result;
+}
+
+// High-bitdepth CfL prediction: same structure as cfl_predict_lbd_neon but
+// results are clamped to [0, (1 << bd) - 1] instead of narrowed with
+// vqmovun, and stores are 16-bit.
+static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  const int max = (1 << bd) - 1;
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+  if (width == 4) {
+    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+    const int16x4_t dc = vdup_n_s16(*dst);
+    const int16x4_t max_16x4 = vdup_n_s16(max);
+    do {
+      const int16x4_t scaled_luma =
+          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+      vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  } else {
+    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+    const int16x8_t dc = vdupq_n_s16(*dst);
+    const int16x8_t max_16x8 = vdupq_n_s16(max);
+    do {
+      if (width == 8) {
+        const int16x8_t pred =
+            predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst1q_u16(dst, clampq_s16(pred, max_16x8));
+      } else if (width == 16) {
+        const int16x8x2_t pred =
+            predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+      } else {
+        const int16x8x4_t pred =
+            predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+      }
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  }
+}
+
+CFL_PREDICT_FN(neon, hbd)
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
new file mode 100644
index 000000000..86a25e109
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -0,0 +1,1134 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// NOTE(review): the two system include targets below were lost in transit
+// (the text read two bare "#include" lines -- angle-bracketed names were
+// stripped).  Restored to the headers this translation unit demonstrably
+// needs: assert() is called in the av1_convolve_* functions and every
+// vector type/intrinsic comes from arm_neon.h.  TODO confirm against
+// upstream aom v1.0.0 av1/common/arm/convolve_neon.c.
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Apply the 8-tap filter to 4 output positions held column-wise in s0..s7.
+// No rounding/shifting is done here; callers apply their own shifts.
+static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x4_t s6, const int16x4_t s7,
+                                      const int16_t *filter) {
+  int16x4_t sum;
+
+  sum = vmul_n_s16(s0, filter[0]);
+  sum = vmla_n_s16(sum, s1, filter[1]);
+  sum = vmla_n_s16(sum, s2, filter[2]);
+  sum = vmla_n_s16(sum, s5, filter[5]);
+  sum = vmla_n_s16(sum, s6, filter[6]);
+  sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+  return sum;
+}
+
+// 8-wide horizontal 8-tap filter including both rounding stages
+// (conv_params->round_0 then FILTER_BITS - round_0), narrowed to u8.
+static INLINE uint8x8_t convolve8_horiz_8x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+  int16x8_t sum;
+
+  sum = vmulq_n_s16(s0, filter[0]);
+  sum = vmlaq_n_s16(sum, s1, filter[1]);
+  sum = vmlaq_n_s16(sum, s2, filter[2]);
+  sum = vmlaq_n_s16(sum, s5, filter[5]);
+  sum = vmlaq_n_s16(sum, s6, filter[6]);
+  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+  sum = vqrshlq_s16(sum, shift_round_0);
+  sum = vqrshlq_s16(sum, shift_by_bits);
+
+  return vqmovun_s16(sum);
+}
+
+// 8-wide vertical 8-tap filter with the single FILTER_BITS rounding shift,
+// narrowed to u8.
+static INLINE uint8x8_t convolve8_vert_8x4(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+  int16x8_t sum;
+
+  sum = vmulq_n_s16(s0, filter[0]);
+  sum = vmlaq_n_s16(sum, s1, filter[1]);
+  sum = vmlaq_n_s16(sum, s2, filter[2]);
+  sum = vmlaq_n_s16(sum, s5, filter[5]);
+  sum = vmlaq_n_s16(sum, s6, filter[6]);
+  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+// 4-wide vertical 8-tap filter in 32-bit precision: add offset, apply the
+// round_1 shift, remove the offset and clamp at zero before narrowing.
+static INLINE uint16x4_t convolve8_vert_4x4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec) {
+  int32x4_t sum0;
+  uint16x4_t res;
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  sum0 = vmull_n_s16(s0, y_filter[0]);
+  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
+  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
+  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
+  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
+  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
+  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
+  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+
+  sum0 = vaddq_s32(sum0, offset_const);
+  sum0 = vqrshlq_s32(sum0, round_shift_vec);
+  sum0 = vsubq_s32(sum0, sub_const_vec);
+  sum0 = vmaxq_s32(sum0, zero);
+
+  // NOTE(review): this narrows with the truncating vmovn_u32 while the
+  // 8-lane sibling below uses the saturating vqmovn_u32 -- verify the
+  // asymmetry is intentional (left unchanged here).
+  res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+
+  return res;
+}
+
+// 8-wide counterpart of convolve8_vert_4x4_s32, with an additional final
+// vec_round_bits rounding shift before narrowing to u8.
+static INLINE uint8x8_t convolve8_vert_8x4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+  int32x4_t sum0, sum1;
+  uint16x8_t res;
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
+
+  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+
+  sum0 = vaddq_s32(sum0, offset_const);
+  sum1 = vaddq_s32(sum1, offset_const);
+  sum0 = vqrshlq_s32(sum0, round_shift_vec);
+  sum1 = vqrshlq_s32(sum1, round_shift_vec);
+  sum0 = vsubq_s32(sum0, sub_const_vec);
+  sum1 = vsubq_s32(sum1, sub_const_vec);
+  sum0 = vmaxq_s32(sum0, zero);
+  sum1 = vmaxq_s32(sum1, zero);
+  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
+                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));
+
+  res = vqrshlq_u16(res, vec_round_bits);
+
+  return vqmovn_u16(res);
+}
+
+void av1_convolve_x_sr_neon(const
uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  // Horizontal-only subpel convolution.  Input rows are transposed into
+  // registers so the 8-tap filter runs along a register rather than memory.
+  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+  const int8_t bits = FILTER_BITS - conv_params->round_0;
+
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)filter_params_y;
+
+  uint8x8_t t0, t1, t2, t3;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+  // Negative shift amounts: vqrshlq with a negative count performs a
+  // rounding right shift.
+  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
+  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+  src -= horiz_offset;
+
+  if (h == 4) {
+    // 4-row path: iterate across the row in 4-column tiles; note w itself is
+    // consumed as the loop counter here.
+    uint8x8_t d01, d23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x8_t d01_temp, d23_temp;
+
+    __builtin_prefetch(src + 0 * src_stride);
+    __builtin_prefetch(src + 1 * src_stride);
+    __builtin_prefetch(src + 2 * src_stride);
+    __builtin_prefetch(src + 3 * src_stride);
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    __builtin_prefetch(dst + 0 * dst_stride);
+    __builtin_prefetch(dst + 1 * dst_stride);
+    __builtin_prefetch(dst + 2 * dst_stride);
+    __builtin_prefetch(dst + 3 * dst_stride);
+    src += 7;
+
+    do {
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+
+      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+
+      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+
+      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+
+      d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
+      d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
+
+      d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
+      d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
+
+      d01 = vqmovun_s16(d01_temp);
+      d23 = vqmovun_s16(d23_temp);
+
+      transpose_u8_4x4(&d01, &d23);
+
+      if (w != 2) {
+        vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
+                      vreinterpret_u32_u8(d01), 0);
+        vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
+                      vreinterpret_u32_u8(d23), 0);
+        vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
+                      vreinterpret_u32_u8(d01), 1);
+        vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
+                      vreinterpret_u32_u8(d23), 1);
+      } else {
+        vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
+                      vreinterpret_u16_u8(d01), 0);
+        vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
+                      vreinterpret_u16_u8(d23), 0);
+        vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
+                      vreinterpret_u16_u8(d01), 2);
+        vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
+                      vreinterpret_u16_u8(d23), 2);
+      }
+
+      // Slide the sample window 4 columns to the right.
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src += 4;
+      dst += 4;
+      w -= 4;
+    } while (w > 0);
+  } else {
+    int width;
+    const uint8_t *s;
+    uint8x8_t t4, t5, t6, t7;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+    if (w <= 4) {
+      // Narrow-block path: process 8 rows per iteration.
+      do {
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                    &t7);
+        src += 8 * src_stride;
+        __builtin_prefetch(dst + 0 * dst_stride);
+        __builtin_prefetch(dst + 1 * dst_stride);
+        __builtin_prefetch(dst + 2 * dst_stride);
+        __builtin_prefetch(dst + 3 * dst_stride);
+        __builtin_prefetch(dst + 4 * dst_stride);
+        __builtin_prefetch(dst + 5 * dst_stride);
+        __builtin_prefetch(dst + 6 * dst_stride);
+        __builtin_prefetch(dst + 7 * dst_stride);
+
+        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        __builtin_prefetch(src + 0 * src_stride);
+        __builtin_prefetch(src + 1 * src_stride);
+        __builtin_prefetch(src + 2 * src_stride);
+        __builtin_prefetch(src + 3 * src_stride);
+        __builtin_prefetch(src + 4 * src_stride);
+        __builtin_prefetch(src + 5 * src_stride);
+        __builtin_prefetch(src + 6 * src_stride);
+        __builtin_prefetch(src + 7 * src_stride);
+        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                 shift_round_0, shift_by_bits);
+
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+        // After the transpose each t register carries two output rows; pick
+        // the lane per (w, h) shape.
+        if ((w == 4) && (h > 4)) {
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        0);  // 10 11 12 13
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+                        0);  // 20 21 22 23
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+                        0);  // 30 31 32 33
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        1);  // 40 41 42 43
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        1);  // 50 51 52 53
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+                        1);  // 60 61 62 63
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+                        1);  // 70 71 72 73
+          dst += dst_stride;
+        } else if ((w == 4) && (h == 2)) {
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        0);  // 10 11 12 13
+          dst += dst_stride;
+        } else if ((w == 2) && (h > 4)) {
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
+          dst += dst_stride;
+        } else if ((w == 2) && (h == 2)) {
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          dst += dst_stride;
+        }
+        h -= 8;
+      } while (h > 0);
+    } else {
+      // Wide-block path: 8x8 tiles, outer loop over row groups, inner loop
+      // over 8-column strips.
+      uint8_t *d;
+      int16x8_t
s11, s12, s13, s14;
+
+      do {
+        __builtin_prefetch(src + 0 * src_stride);
+        __builtin_prefetch(src + 1 * src_stride);
+        __builtin_prefetch(src + 2 * src_stride);
+        __builtin_prefetch(src + 3 * src_stride);
+        __builtin_prefetch(src + 4 * src_stride);
+        __builtin_prefetch(src + 5 * src_stride);
+        __builtin_prefetch(src + 6 * src_stride);
+        __builtin_prefetch(src + 7 * src_stride);
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        width = w;
+        s = src + 7;
+        d = dst;
+        __builtin_prefetch(dst + 0 * dst_stride);
+        __builtin_prefetch(dst + 1 * dst_stride);
+        __builtin_prefetch(dst + 2 * dst_stride);
+        __builtin_prefetch(dst + 3 * dst_stride);
+        __builtin_prefetch(dst + 4 * dst_stride);
+        __builtin_prefetch(dst + 5 * dst_stride);
+        __builtin_prefetch(dst + 6 * dst_stride);
+        __builtin_prefetch(dst + 7 * dst_stride);
+
+        do {
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
+                                   x_filter, shift_round_0, shift_by_bits);
+
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          if (h != 2) {
+            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+          } else {
+            store_row2_u8_8x8(d, dst_stride, t0, t1);
+          }
+          // Slide the sample window 8 columns.
+          s0 = s8;
+          s1 = s9;
+          s2 = s10;
+          s3 = s11;
+          s4 = s12;
+          s5 = s13;
+          s6 = s14;
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width > 0);
+        src += 8 * src_stride;
+        dst += 8 * dst_stride;
+        h -= 8;
+      } while (h > 0);
+    }
+  }
+}
+
+// Vertical-only subpel convolution; no intermediate rounding is needed so the
+// single FILTER_BITS rounding shift is applied at the end.
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+
+  src -= vert_offset * src_stride;
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+  if (w <= 4) {
+    uint8x8_t d01, d23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s1 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+
+    do {
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(dst + 1 * dst_stride);
+      __builtin_prefetch(dst + 2 * dst_stride);
+      __builtin_prefetch(dst + 3 * dst_stride);
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      if ((w == 4) && (h != 2)) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      1);  // 10 11 12 13
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+                      0);  // 20 21 22 23
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+                      1);  // 30 31 32 33
+        dst += dst_stride;
+      } else if ((w == 4) && (h == 2)) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      1);  // 10 11 12 13
+        dst += dst_stride;
+      } else if ((w == 2) && (h != 2)) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
+        dst += dst_stride;
+      } else if ((w == 2) && (h == 2)) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
+        dst += dst_stride;
+      }
+      // Slide the vertical sample window down 4 rows.
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    // Wide path: 8-column strips, 4 output rows per inner iteration.
+    int height;
+    const uint8_t *s;
+    uint8_t *d;
+    uint8x8_t t0, t1, t2, t3;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+    do {
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      __builtin_prefetch(src + 4 * src_stride);
+      __builtin_prefetch(src + 5 * src_stride);
+      __builtin_prefetch(src + 6 * src_stride);
+      s = src;
+      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      d = dst;
+      height = h;
+
+      do {
+        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+        __builtin_prefetch(s + 0 * src_stride);
+        __builtin_prefetch(s + 1 * src_stride);
+        __builtin_prefetch(s + 2 * src_stride);
+        __builtin_prefetch(s + 3 * src_stride);
+        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+        if (h != 2) {
+          vst1_u8(d, t0);
+          d += dst_stride;
+          vst1_u8(d, t1);
+          d += dst_stride;
+          vst1_u8(d, t2);
+          d += dst_stride;
+          vst1_u8(d, t3);
+          d += dst_stride;
+        } else {
+          vst1_u8(d, t0);
+          d += dst_stride;
+          vst1_u8(d, t1);
+          d += dst_stride;
+        }
+        // Slide the vertical sample window down 4 rows.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+// Separable 2-D subpel convolution: a horizontal pass into the 16-bit
+// intermediate im_block (stride MAX_SB_SIZE), then a vertical pass back to
+// 8-bit pixels.
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  int im_dst_stride;
+  int width, height;
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  const int bd = 8;
+  // The intermediate block needs taps - 1 extra rows for the vertical pass.
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const uint8_t *s;
+  int16_t *dst_ptr;
+
+  dst_ptr = im_block;
+  im_dst_stride = im_stride;
+  height = im_h;
+  width = w;
+
+  const int16_t round_bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+  int16_t x_filter_tmp[8];
+  int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+  assert(conv_params->round_0 > 0);
+
+  if (w <= 4) {
+    // Narrow horizontal pass: transposed 4x4 tiles into im_block.
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+    // round_0 - 1 compensates for the halved filter coefficients above.
+    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+
+    do {
+      s = src_ptr;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 2 *
im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + if (w == 4) { + vst1_s16((dst_ptr + 0 * im_dst_stride), d0); + vst1_s16((dst_ptr + 1 * im_dst_stride), d1); + vst1_s16((dst_ptr + 2 * im_dst_stride), d2); + vst1_s16((dst_ptr + 3 * im_dst_stride), d3); + } else if (w == 2) { + vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride), + vreinterpret_u32_s16(d0), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride), + vreinterpret_u32_s16(d1), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride), + vreinterpret_u32_s16(d2), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride), + vreinterpret_u32_s16(d3), 0); + } + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } while (height > 0); + } else { + int16_t *d_tmp; + int16x8_t s11, s12, s13, s14; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); + + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + 
__builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr + 0 * im_dst_stride); + __builtin_prefetch(dst_ptr + 1 * im_dst_stride); + __builtin_prefetch(dst_ptr + 2 * im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + __builtin_prefetch(dst_ptr + 4 * im_dst_stride); + __builtin_prefetch(dst_ptr + 5 * im_dst_stride); + __builtin_prefetch(dst_ptr + 6 * im_dst_stride); + __builtin_prefetch(dst_ptr + 7 * im_dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, 
shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * im_dst_stride; + height -= 8; + } while (height > 0); + } + + // vertical + { + uint8_t *dst_u8_ptr, *d_u8; + int16_t *v_src_ptr, *v_s; + + const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int32x4_t sub_const_vec = vdupq_n_s32(sub_const); + + src_stride = im_stride; + v_src_ptr = im_block; + dst_u8_ptr = dst; + + height = h; + width = w; + + if (width <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x4_t d0, d1, d2, d3; + uint16x8_t dd0, dd1; + uint8x8_t d01, d23; + + d_u8 = dst_u8_ptr; + v_s = v_src_ptr; + + __builtin_prefetch(v_s + 0 * im_stride); + __builtin_prefetch(v_s + 1 * im_stride); + __builtin_prefetch(v_s + 2 * im_stride); + __builtin_prefetch(v_s + 3 * im_stride); + __builtin_prefetch(v_s + 4 * 
im_stride); + __builtin_prefetch(v_s + 5 * im_stride); + __builtin_prefetch(v_s + 6 * im_stride); + __builtin_prefetch(v_s + 7 * im_stride); + + load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + do { + load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 0 * dst_stride); + __builtin_prefetch(d_u8 + 1 * dst_stride); + __builtin_prefetch(d_u8 + 2 * dst_stride); + __builtin_prefetch(d_u8 + 3 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits); + dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits); + + d01 = vqmovn_u16(dd0); + d23 = vqmovn_u16(dd1); + + if ((w == 4) && (h != 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 0); // 20 21 22 23 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 1); // 30 31 32 33 + d_u8 += dst_stride; + } else if ((w == 2) && (h != 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23), + 0); // 20 21 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, 
vreinterpret_u16_u8(d23), + 2); // 30 31 + d_u8 += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + } else { + // if width is a multiple of 8 & height is a multiple of 4 + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x8_t res0, res1, res2, res3; + + do { + __builtin_prefetch(v_src_ptr + 0 * im_stride); + __builtin_prefetch(v_src_ptr + 1 * im_stride); + __builtin_prefetch(v_src_ptr + 2 * im_stride); + __builtin_prefetch(v_src_ptr + 3 * im_stride); + __builtin_prefetch(v_src_ptr + 4 * im_stride); + __builtin_prefetch(v_src_ptr + 5 * im_stride); + __builtin_prefetch(v_src_ptr + 6 * im_stride); + __builtin_prefetch(v_src_ptr + 7 * im_stride); + + v_s = v_src_ptr; + load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + d_u8 = dst_u8_ptr; + height = h; + + do { + load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 4 * dst_stride); + __builtin_prefetch(d_u8 + 5 * dst_stride); + __builtin_prefetch(d_u8 + 6 * dst_stride); + __builtin_prefetch(d_u8 + 7 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, 
s9, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + if (h != 2) { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + vst1_u8(d_u8, res2); + d_u8 += dst_stride; + vst1_u8(d_u8, res3); + d_u8 += dst_stride; + } else { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + v_src_ptr += 8; + dst_u8_ptr += 8; + w -= 8; + } while (w > 0); + } + } +} +void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + + const uint8_t *src1; + uint8_t *dst1; + int y; + + if (!(w & 0x0F)) { + for (y = 0; y < h; ++y) { + src1 = src; + dst1 = dst; + for (int x = 0; x < (w >> 4); ++x) { + vst1q_u8(dst1, vld1q_u8(src1)); + src1 += 16; + dst1 += 16; + } + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x07)) { + for (y = 0; y < h; ++y) { + vst1_u8(dst, vld1_u8(src)); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x03)) { + for (y = 0; y < h; ++y) { + vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x01)) { + for (y = 0; y < h; ++y) { + vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h new file 
mode 100644 index 000000000..47c93d645 --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AV1_COMMON_ARM_CONVOLVE_NEON_H_ + +#include + +#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07) + +static INLINE uint8x8_t wiener_convolve8_vert_4x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, int16_t *filter_y, const int bd, + const int round1_bits) { + int16x8_t ss0, ss1, ss2; + int32x4_t sum0, sum1; + uint16x4_t tmp0, tmp1; + uint16x8_t tmp; + uint8x8_t res; + + const int32_t round_const = (1 << (bd + round1_bits - 1)); + const int32x4_t round_bits = vdupq_n_s32(-round1_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec = vdupq_n_s32(round_const); + + ss0 = vaddq_s16(s0, s6); + ss1 = vaddq_s16(s1, s5); + ss2 = vaddq_s16(s2, s4); + + sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]); + + sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]); + + sum0 = vsubq_s32(sum0, round_vec); + sum1 = vsubq_s32(sum1, round_vec); + + /* right shift & rounding */ + sum0 = vrshlq_s32(sum0, round_bits); + sum1 = 
vrshlq_s32(sum1, round_bits); + + sum0 = vmaxq_s32(sum0, zero); + sum1 = vmaxq_s32(sum1, zero); + + /* from int32x4_t to uint8x8_t */ + tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0)); + tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1)); + tmp = vcombine_u16(tmp0, tmp1); + res = vqmovn_u16(tmp); + + return res; +} + +static INLINE uint16x8_t wiener_convolve8_horiz_8x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, int16_t *filter_x, const int bd, + const int round0_bits) { + int16x8_t sum; + uint16x8_t res; + int32x4_t sum_0, sum_1; + int32x4_t s3_0, s3_1; + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + + /* for the purpose of right shift by { conv_params->round_0 } */ + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + + const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + sum = vmulq_n_s16(s0, filter_x[0]); + sum = vmlaq_n_s16(sum, s1, filter_x[1]); + sum = vmlaq_n_s16(sum, s2, filter_x[2]); + + /* sum from 16x8 to 2 32x4 registers */ + sum_0 = vmovl_s16(vget_low_s16(sum)); + sum_1 = vmovl_s16(vget_high_s16(sum)); + + /* s[3]*128 -- and filter coef max can be 128 + * then max value possible = 128*128*255 exceeding 16 bit + */ + + s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]); + s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + sum_1 = vaddq_s32(sum_1, s3_1); + + /* Add the constant value */ + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_1 = vaddq_s32(sum_1, round_vec_0); + + /* right shift & rounding & saturating */ + sum_0 = vqrshlq_s32(sum_0, round_bits); + sum_1 = vqrshlq_s32(sum_1, round_bits); + + /* Clipping to max value */ + sum_0 = vminq_s32(sum_0, round_vec_1); + sum_1 = vminq_s32(sum_1, round_vec_1); + + res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1)); + return res; +} + +static INLINE uint16x4_t 
wiener_convolve8_horiz_4x8( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, int16_t *filter_x, const int bd, + const int round0_bits) { + uint16x4_t res; + int32x4_t sum_0, s3_0; + int16x4_t sum, temp0, temp1, temp2; + + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + temp0 = vadd_s16(s0, s6); + temp1 = vadd_s16(s1, s5); + temp2 = vadd_s16(s2, s4); + + sum = vmul_n_s16(temp0, filter_x[0]); + sum = vmla_n_s16(sum, temp1, filter_x[1]); + sum = vmla_n_s16(sum, temp2, filter_x[2]); + sum_0 = vmovl_s16(sum); + + /* s[3]*128 -- and filter coff max can be 128. + * then max value possible = 128*128*255 Therefore, 32 bits are required to + * hold the result. 
+ */ + s3_0 = vmull_n_s16(s3, filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_0 = vrshlq_s32(sum_0, round_bits); + + sum_0 = vmaxq_s32(sum_0, zero); + sum_0 = vminq_s32(sum_0, round_vec_1); + res = vqmovun_s32(sum_0); + return res; +} + +static INLINE int16x8_t +convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t sum; + int16x8_t res; + + sum = horiz_const; + sum = vmlaq_n_s16(sum, s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s3, filter[3]); + sum = vmlaq_n_s16(sum, s4, filter[4]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + + res = vqrshlq_s16(sum, shift_round_0); + + return res; +} + +static INLINE int16x4_t +convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t sum; + sum = horiz_const; + sum = vmla_n_s16(sum, s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s3, filter[3]); + sum = vmla_n_s16(sum, s4, filter[4]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + + sum = vqrshl_s16(sum, shift_round_0); + + return sum; +} + +static INLINE uint16x4_t convolve8_4x4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter, + const int32x4_t 
round_shift_vec, const int32x4_t offset_const) { + int32x4_t sum0; + uint16x4_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(s0, y_filter[0]); + sum0 = vmlal_n_s16(sum0, s1, y_filter[1]); + sum0 = vmlal_n_s16(sum0, s2, y_filter[2]); + sum0 = vmlal_n_s16(sum0, s3, y_filter[3]); + sum0 = vmlal_n_s16(sum0, s4, y_filter[4]); + sum0 = vmlal_n_s16(sum0, s5, y_filter[5]); + sum0 = vmlal_n_s16(sum0, s6, y_filter[6]); + sum0 = vmlal_n_s16(sum0, s7, y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum0 = vmaxq_s32(sum0, zero); + res = vmovn_u32(vreinterpretq_u32_s32(sum0)); + + return res; +} + +#endif // AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c new file mode 100644 index 000000000..799355553 --- /dev/null +++ b/third_party/aom/av1/common/arm/intrapred_neon.c @@ -0,0 +1,79 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include + +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride, + int bw, const uint16_t *above, + const uint16_t *left) { + assert(bw >= 4); + assert(IS_POWER_OF_TWO(bw)); + int expected_dc, sum = 0; + const int count = bw * 2; + uint32x4_t sum_q = vdupq_n_u32(0); + uint32x2_t sum_d; + uint16_t *dst_1; + if (bw >= 8) { + for (int i = 0; i < bw; i += 8) { + sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); + sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); + above += 8; + left += 8; + } + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + dst_1 = dst; + for (int i = 0; i < bw; i += 8) { + vst1q_u16(dst_1, dc); + dst_1 += 8; + } + dst += stride; + } + } else { // 4x4 + sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + vst1_u16(dst, dc); + dst += stride; + } + } +} + +#define intra_pred_highbd_sized(type, width) \ + void aom_highbd_##type##_predictor_##width##x##width##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_##type##_predictor_neon(dst, stride, width, above, left); \ + } + +#define intra_pred_square(type) \ + intra_pred_highbd_sized(type, 4); \ + intra_pred_highbd_sized(type, 8); \ + intra_pred_highbd_sized(type, 16); \ + intra_pred_highbd_sized(type, 32); \ + intra_pred_highbd_sized(type, 64); + +intra_pred_square(dc); + +#undef 
intra_pred_square diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c new file mode 100644 index 000000000..992be4a9e --- /dev/null +++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c @@ -0,0 +1,1326 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/common.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE void compute_avg_4x4( + uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3, + uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x4_t sub_const_vec, const int16_t round_bits, + const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + int16x4_t tmp0, tmp1, tmp2, tmp3; + uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; + uint32x4_t sum0, sum1, sum2, sum3; + + int32x4_t dst0, dst1, dst2, dst3; + int16x8_t tmp4, tmp5; + const int16x8_t zero = vdupq_n_s16(0); + + if (use_jnt_comp_avg) { + const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); + const int32x4_t const_vec = vmovl_s16(sub_const_vec); + + sum0 = vmull_n_u16(res0, fwd_offset); + sum0 = vmlal_n_u16(sum0, d0, bck_offset); + sum1 = vmull_n_u16(res1, fwd_offset); + sum1 = 
vmlal_n_u16(sum1, d1, bck_offset); + sum2 = vmull_n_u16(res2, fwd_offset); + sum2 = vmlal_n_u16(sum2, d2, bck_offset); + sum3 = vmull_n_u16(res3, fwd_offset); + sum3 = vmlal_n_u16(sum3, d3, bck_offset); + + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } else { + const int16x4_t round_bits_vec = vdup_n_s16(-round_bits); + tmp_u0 = vhadd_u16(res0, d0); + tmp_u1 = vhadd_u16(res1, d1); + tmp_u2 = vhadd_u16(res2, d2); + tmp_u3 = vhadd_u16(res3, d3); + + tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec); + tmp1 = vsub_s16(vreinterpret_s16_u16(tmp_u1), sub_const_vec); + tmp2 = vsub_s16(vreinterpret_s16_u16(tmp_u2), sub_const_vec); + tmp3 = vsub_s16(vreinterpret_s16_u16(tmp_u3), sub_const_vec); + + tmp0 = vqrshl_s16(tmp0, round_bits_vec); + tmp1 = vqrshl_s16(tmp1, round_bits_vec); + tmp2 = vqrshl_s16(tmp2, round_bits_vec); + tmp3 = vqrshl_s16(tmp3, round_bits_vec); + + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = 
vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } +} + +static INLINE void compute_avg_8x4( + uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3, + uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x4_t sub_const, const int16_t round_bits, + const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, + uint8x8_t *t3) { + int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int16x8_t f0, f1, f2, f3; + uint32x4_t sum0, sum1, sum2, sum3; + uint32x4_t sum4, sum5, sum6, sum7; + int32x4_t dst0, dst1, dst2, dst3; + int32x4_t dst4, dst5, dst6, dst7; + uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; + const int16x8_t zero = vdupq_n_s16(0); + + if (use_jnt_comp_avg) { + const int32x4_t sub_const_vec = vmovl_s16(sub_const); + const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); + + sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset); + sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset); + sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset); + sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset); + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + + sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset); + sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset); + sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset); + sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset); + sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset); + sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset); + sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset); + sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS); + sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS); + + sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset); + sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset); + 
sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset); + sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset); + sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS); + sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec); + dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec); + dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec); + dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec); + dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + dst4 = vqrshlq_s32(dst4, round_bits_vec); + dst5 = vqrshlq_s32(dst5, round_bits_vec); + dst6 = vqrshlq_s32(dst6, round_bits_vec); + dst7 = vqrshlq_s32(dst7, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vqmovn_s32(dst4); + tmp5 = vqmovn_s32(dst5); + tmp6 = vqmovn_s32(dst6); + tmp7 = vqmovn_s32(dst7); + + f0 = vcombine_s16(tmp0, tmp2); + f1 = vcombine_s16(tmp1, tmp3); + f2 = vcombine_s16(tmp4, tmp6); + f3 = vcombine_s16(tmp5, tmp7); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + + } else { + const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const); + const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits); + + tmp_u0 = vhaddq_u16(res0, d0); + tmp_u1 = vhaddq_u16(res1, d1); + tmp_u2 = vhaddq_u16(res2, d2); + tmp_u3 = 
vhaddq_u16(res3, d3); + + f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec); + f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec); + f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec); + f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec); + + f0 = vqrshlq_s16(f0, round_bits_vec); + f1 = vqrshlq_s16(f1, round_bits_vec); + f2 = vqrshlq_s16(f2, round_bits_vec); + f3 = vqrshlq_s16(f3, round_bits_vec); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + } +} + +static INLINE void jnt_convolve_2d_horiz_neon( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { + const int bd = 8; + const uint8_t *s; + int16_t *dst_ptr; + int dst_stride; + int width, height; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + + dst_ptr = im_block; + dst_stride = im_stride; + height = im_h; + width = w; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; + + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x4_t shift_round_0 = vdup_n_s16(-(round_0)); + + do { + s = src; + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 
= vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + vst1_s16((dst_ptr + 0 * dst_stride), d0); + vst1_s16((dst_ptr + 1 * dst_stride), d1); + vst1_s16((dst_ptr + 2 * dst_stride), d2); + vst1_s16((dst_ptr + 3 * dst_stride), d3); + + src += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + int16_t *d_tmp; + int16x8_t s11, s12, s13, s14; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0)); + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 
* src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, 
s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } while (height > 0); + } +} + +static INLINE void jnt_convolve_2d_vert_neon( + int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, + ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { + uint8_t *dst_u8_ptr, *d_u8; + CONV_BUF_TYPE *dst_ptr, *dst; + int16_t *src_ptr, *s; + uint16_t *d; + + const int bd = 8; + int height; + int dst_stride = conv_params->dst_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + + const int16_t round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = bd + 2 * FILTER_BITS - conv_params->round_0; + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset); + const int16x4_t sub_const_vec = vdup_n_s16(sub_const); + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x4_t res4, res5, res6, res7; + uint16x4_t d0, d1, d2, d3; + uint8x8_t t0, t1; + + dst = 
conv_params->dst; + src_ptr = im_block; + dst_u8_ptr = dst8; + dst_ptr = dst; + height = h; + + do { + d = dst_ptr; + d_u8 = dst_u8_ptr; + s = src_ptr; + height = h; + + __builtin_prefetch(s + 0 * im_stride); + __builtin_prefetch(s + 1 * im_stride); + __builtin_prefetch(s + 2 * im_stride); + __builtin_prefetch(s + 3 * im_stride); + __builtin_prefetch(s + 4 * im_stride); + __builtin_prefetch(s + 5 * im_stride); + __builtin_prefetch(s + 6 * im_stride); + __builtin_prefetch(s + 7 * im_stride); + + load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + s += (7 * im_stride); + + do { + load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); + s += (im_stride << 2); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 4 * dst8_stride); + __builtin_prefetch(d_u8 + 5 * dst8_stride); + __builtin_prefetch(d_u8 + 6 * dst8_stride); + __builtin_prefetch(d_u8 + 7 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const); + d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const); + d3 = convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + d += (dst_stride << 2); + + compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, + bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, + &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 1); // 10 11 12 13 + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), + 0); // 20 21 22 23 + d_u8 += 
dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), + 1); // 30 31 32 33 + d_u8 += dst8_stride; + + } else { + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + d += (dst_stride << 2); + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + w -= 4; + } while (w > 0); +} + +void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int round_0 = conv_params->round_0 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); + + jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params, + y_filter, h, w); +} + +void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, + tmp_shift3; + uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; + uint16x4_t tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7; + const uint8_t *src1, *src2; + uint8_t *dst8_1; + CONV_BUF_TYPE *dst = conv_params->dst, *dst_1, *dst_2; + const int dst_stride = conv_params->dst_stride; + int x, y; + const int16_t bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16x4_t sub_const_vec = vdup_n_s16((int16_t)round_offset); + const uint16x8_t dup_round_offset16x8 = vdupq_n_u16((uint16_t)round_offset); + const int16x4_t dup_bits16x4 = vdup_n_s16(bits); + const int16x8_t dup_bits16x8 = vdupq_n_s16(bits); + + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + if (!(w & 0x07)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + for (x = 0; x < (w >> 3); ++x) { + src2 = src1; + load_u8_8x4(src2, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res_q0 = vaddq_u16(vshlq_u16(vmovl_u8(res0_8), dup_bits16x8), + dup_round_offset16x8); + res_q1 = vaddq_u16(vshlq_u16(vmovl_u8(res1_8), 
dup_bits16x8), + dup_round_offset16x8); + res_q2 = vaddq_u16(vshlq_u16(vmovl_u8(res2_8), dup_bits16x8), + dup_round_offset16x8); + res_q3 = vaddq_u16(vshlq_u16(vmovl_u8(res3_8), dup_bits16x8), + dup_round_offset16x8); + + if (conv_params->do_average) { + dst_2 = dst_1; + load_u16_8x4(dst_2, dst_stride, &tmp_q0, &tmp_q1, &tmp_q2, &tmp_q3); + + compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, + res_q2, res_q3, conv_params->fwd_offset, + conv_params->bck_offset, sub_const_vec, bits, + conv_params->use_jnt_comp_avg, &tmp_shift0, + &tmp_shift1, &tmp_shift2, &tmp_shift3); + + vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); + vst1_u8(dst8_1 + (1 * dst8_stride), tmp_shift1); + vst1_u8(dst8_1 + (2 * dst8_stride), tmp_shift2); + vst1_u8(dst8_1 + (3 * dst8_stride), tmp_shift3); + + } else { + vst1q_u16(dst_1 + (0 * dst_stride), res_q0); + vst1q_u16(dst_1 + (1 * dst_stride), res_q1); + vst1q_u16(dst_1 + (2 * dst_stride), res_q2); + vst1q_u16(dst_1 + (3 * dst_stride), res_q3); + } + src1 = src1 + 8; + dst_1 = dst_1 + 8; + dst8_1 = dst8_1 + 8; + } + src += src_stride * 4; + dst8 += dst8_stride * 4; + dst += dst_stride * 4; + } + } else if (!(w & 0x03)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + + load_u8_8x4(src1, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res4 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res0_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res5 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res1_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res6 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res2_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res7 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res3_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + if (conv_params->do_average) { + load_u16_4x4(dst_1, dst_stride, &tmp4, &tmp5, &tmp6, &tmp7); + + compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, + conv_params->fwd_offset, 
conv_params->bck_offset, + sub_const_vec, bits, conv_params->use_jnt_comp_avg, + &tmp_shift0, &tmp_shift1); + + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 1); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 1); + + } else { + vst1_u16(dst_1, res4); + dst_1 += dst_stride; + vst1_u16(dst_1, res5); + dst_1 += dst_stride; + vst1_u16(dst_1, res6); + dst_1 += dst_stride; + vst1_u16(dst_1, res7); + } + src += src_stride * 4; + dst += dst_stride * 4; + dst8 += dst8_stride * 4; + } + } +} + +void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + + (void)filter_params_y; + (void)subpel_y_q4; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + + const uint8_t *src_ptr = src - horiz_offset; + + int16_t 
x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. + filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + const uint8_t *s; + uint8_t *d_u8; + uint8_t *dst_u8_ptr; + CONV_BUF_TYPE *d, *dst_ptr; + int width, height; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + + s = src_ptr; + dst_ptr = dst; + dst_u8_ptr = dst8; + width = w; + height = h; + + if ((w == 4) || (h == 4)) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; + uint16x4_t res4, res5, res6, res7; + uint32x2_t tu0, tu1; + int16x8_t u0, u1; + const int16x4_t zero = vdup_n_s16(0); + const int16x4_t round_offset_vec = vdup_n_s16(round_offset); + const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); + const int16x4_t horiz_const = vdup_n_s16(bits); + do { + s = src_ptr; + d = dst_ptr; + d_u8 = dst_u8_ptr; + width = w; + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + s += 7; + do { + load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); + t0 = vreinterpret_u8_u32(tu0); + t1 = vreinterpret_u8_u32(tu1); + + transpose_u8_4x4(&t0, &t1); + u0 = 
vreinterpretq_s16_u16(vmovl_u8(t0)); + u1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + s7 = vget_low_s16(u0); + s8 = vget_low_s16(u1); + s9 = vget_high_s16(u0); + s10 = vget_high_s16(u1); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + d1 = vrshl_s16(d1, horiz_const); + d1 = vadd_s16(d1, round_offset_vec); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + d2 = vrshl_s16(d2, horiz_const); + d2 = vadd_s16(d2, round_offset_vec); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + d3 = vrshl_s16(d3, horiz_const); + d3 = vadd_s16(d3, round_offset_vec); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + if (conv_params->do_average) { + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + + compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + round_offset_vec, round_bits, use_jnt_comp_avg, &t0, + &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride), + vreinterpret_u32_u8(t0), + 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride), + vreinterpret_u32_u8(t1), + 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride), + vreinterpret_u32_u8(t1), + 1); // 30 31 32 33 + } else { + 
store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride << 2); + dst_ptr += (dst_stride << 2); + dst_u8_ptr += (dst8_stride << 2); + height -= 4; + } while (height > 0); + } else { + CONV_BUF_TYPE *d_tmp; + uint8_t *d_u8_tmp; + int16x8_t s11, s12, s13, s14; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res8, res9, res10, res11; + + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); + const int16x8_t horiz_const = vdupq_n_s16(bits); + const int16x8_t zero = vdupq_n_s16(0); + + d = dst_ptr = dst; + d_u8 = dst_u8_ptr = dst8; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + 
__builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + res1 = vrshlq_s16(res1, horiz_const); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + res2 = vrshlq_s16(res2, horiz_const); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + res3 = vrshlq_s16(res3, horiz_const); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + zero, shift_round_0); + res4 = vrshlq_s16(res4, horiz_const); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, zero, shift_round_0); + res5 = vrshlq_s16(res5, horiz_const); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, 
s10, s11, s12, s13, + x_filter_tmp, zero, shift_round_0); + res6 = vrshlq_s16(res6, horiz_const); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, zero, shift_round_0); + res7 = vrshlq_s16(res7, horiz_const); + res7 = vaddq_s16(res7, round_offset128); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + if (conv_params->do_average) { + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4( + res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4( + res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst_u8_ptr += 8 * dst8_stride; + height -= 8; + } while (height > 0); + } +} + +void av1_jnt_convolve_y_neon(const uint8_t *src, int 
src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int shift_value = (conv_params->round_1 - 1 - bits); + + (void)filter_params_x; + (void)subpel_x_q4; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + + const uint8_t *src_ptr = src - (vert_offset * src_stride); + + int16_t y_filter_tmp[8]; + int16x8_t filter_y_coef = vld1q_s16(y_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
+ filter_y_coef = vshrq_n_s16(filter_y_coef, 1); + vst1q_s16(&y_filter_tmp[0], filter_y_coef); + + const uint8_t *s; + uint8_t *d_u8; + uint8_t *dst_u8_ptr; + CONV_BUF_TYPE *d, *dst_ptr; + int width, height; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + + s = src_ptr; + dst_ptr = dst; + dst_u8_ptr = dst8; + width = w; + height = h; + + // used to get rid of multiplication = (vertical filter output sum) * + // (1<round_1 - 2) >= bits); + + if ((w == 4) || (h == 4)) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + uint16x4_t res4, res5, res6, res7; + uint32x2_t tu0, tu1, tu2, tu3; + int16x8_t u0, u1, u2, u3; + + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x4_t shift_vec = vdup_n_s16(-shift_value); + const int16x4_t zero = vdup_n_s16(0); + + do { + s = src_ptr; + d = dst_ptr; + d_u8 = dst_u8_ptr; + height = h; + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3); + + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2))); + u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3))); + + s0 = vget_low_s16(u0); + s1 = vget_high_s16(u0); + s2 = vget_low_s16(u1); + s3 = vget_high_s16(u1); + s4 = vget_low_s16(u2); + s5 = vget_high_s16(u2); + s6 = vget_low_s16(u3); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += (7 * src_stride); + do { + load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); + + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + + s7 = vget_low_s16(u0); + s8 = vget_high_s16(u0); + 
s9 = vget_low_s16(u1); + s10 = vget_high_s16(u1); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + d0 = vadd_s16(d0, round_offset64); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp, + zero, shift_vec); + d1 = vadd_s16(d1, round_offset64); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp, + zero, shift_vec); + d2 = vadd_s16(d2, round_offset64); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp, + zero, shift_vec); + d3 = vadd_s16(d3, round_offset64); + + if (conv_params->do_average) { + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + d += (dst_stride << 2); + + compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + round_offset64, round_bits, use_jnt_comp_avg, &t0, + &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 1); // 10 11 12 13 + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), + 0); // 20 21 22 23 + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), + 1); // 30 31 32 33 + d_u8 += dst8_stride; + } else { + store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + d += (dst_stride << 2); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += (src_stride << 2); + height -= 4; + } 
while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + width -= 4; + } while (width > 0); + } else { + CONV_BUF_TYPE *d_tmp; + int16x8_t s11, s12, s13, s14; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res8, res9, res10, res11; + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x8_t shift_vec = vdupq_n_s16(-shift_value); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t zero = vdupq_n_s16(0); + + dst_ptr = dst; + dst_u8_ptr = dst8; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + height = h; + s = src_ptr + (7 * src_stride); + d_tmp = dst_ptr; + d_u8 = dst_u8_ptr; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + 
__builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp, + zero, shift_vec); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp, + zero, shift_vec); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp, + zero, shift_vec); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp, + zero, shift_vec); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter_tmp, zero, shift_vec); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter_tmp, zero, shift_vec); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter_tmp, zero, shift_vec); + res7 = vaddq_s16(res7, round_offset128); + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp + 0 * dst8_stride); + __builtin_prefetch(d_tmp + 1 * dst8_stride); + __builtin_prefetch(d_tmp + 2 * dst8_stride); + __builtin_prefetch(d_tmp + 3 * dst8_stride); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4( + res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4( + res8, res9, 
res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += (8 * src_stride); + height -= 8; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + dst_u8_ptr += 8; + width -= 8; + } while (width > 0); + } +} diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h new file mode 100644 index 000000000..214b14bf7 --- /dev/null +++ b/third_party/aom/av1/common/arm/mem_neon.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AV1_COMMON_ARM_MEM_NEON_H_ +#define AV1_COMMON_ARM_MEM_NEON_H_ + +#include +#include + +static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0, + const uint8x8_t s1) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; +} + +static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; +} + +static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; +} + +static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + 
int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5, + int16x4_t *const s6, int16x4_t *const s7) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5, const uint8x8_t s6, + const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); + s += 
dst_stride; + vst1q_u16(s, s4); + s += dst_stride; + vst1q_u16(s, s5); + s += dst_stride; + vst1q_u16(s, s6); + s += dst_stride; + vst1q_u16(s, s7); +} + +static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); +} + +static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); +} + +static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); + s += dst_stride; + vst1q_s16(s, s4); + s += dst_stride; + vst1q_s16(s, s5); + s += dst_stride; + vst1q_s16(s, s6); + s += dst_stride; + vst1q_s16(s, s7); +} + +static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; 
+ *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1, + uint32x2_t *tu2, uint32x2_t *tu3) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu3 = vset_lane_u32(a, *tu3, 0); + memcpy(&a, buf, 4); + *tu3 = vset_lane_u32(a, *tu3, 1); +} + +static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + *tu1 = vset_lane_u32(a, *tu1, 1); +} + +static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); +} + +static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride, + uint16x4_t *tu0) { + uint16_t a; + + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 0); + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 1); +} + +static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t 
*const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, + uint64x2_t *tu0, uint64x2_t *tu1) { + uint64_t a; + + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 0); + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 1); + memcpy(&a, buf, 8); + buf += stride; + *tu1 = vsetq_lane_u64(a, *tu1, 0); + memcpy(&a, buf, 8); + *tu1 = vsetq_lane_u64(a, *tu1, 1); +} + +#endif // AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c b/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c deleted file mode 100644 index b29228e43..000000000 --- a/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include - -#include "./aom_config.h" -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "av1/common/common.h" - -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); - return; -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16((int16_t)cospi_8_64); - *d1s16 = vdup_n_s16((int16_t)cospi_16_64); - *d2s16 = vdup_n_s16((int16_t)cospi_24_64); - return; -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16((int16_t)sinpi_1_9); - *d4s16 = vdup_n_s16((int16_t)sinpi_2_9); - *q3s16 = vdupq_n_s16((int16_t)sinpi_3_9); - *d5s16 = vdup_n_s16((int16_t)sinpi_4_9); - return; -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = 
vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp - return; -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32((int32_t)sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vqrshrn_n_s32(q13s32, 14); - d17s16 = vqrshrn_n_s32(q14s32, 14); - d18s16 = vqrshrn_n_s32(q15s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); - return; -} - -void av1_iht4x4_16_add_neon(const tran_low_t *input, 
uint8_t *dest, - int dest_stride, const TxfmParam *txfm_param) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - TRANSPOSE4X4(&q8s16, &q9s16); - - const TX_TYPE tx_type = txfm_param->tx_type; - switch (tx_type) { - case DCT_DCT: // idct_idct is not supported. Fall back to C - av1_iht4x4_16_add_c(input, dest, dest_stride, txfm_param); - return; - break; - case ADST_DCT: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - break; - case DCT_ADST: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - break; - case ADST_ADST: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - break; - default: // iadst_idct - assert(0); - break; - } - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += dest_stride; - d26u32 = 
vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= dest_stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= dest_stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= dest_stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); - return; -} diff --git a/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c b/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c deleted file mode 100644 index 4cd43a99d..000000000 --- a/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include - -#include "./aom_config.h" -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "av1/common/common.h" - -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - 
vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - 
q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16((int16_t)cospi_24_64); - d1s16 = vdup_n_s16((int16_t)cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = 
vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; -} - -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t 
d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16((int16_t)cospi_2_64); - d15s16 = vdup_n_s16((int16_t)cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_18_64); - d31s16 = vdup_n_s16((int16_t)cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vqrshrn_n_s32(q11s32, 14); - d23s16 = 
vqrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vqrshrn_n_s32(q1s32, 14); - d3s16 = vqrshrn_n_s32(q2s32, 14); - d24s16 = vqrshrn_n_s32(q12s32, 14); - d25s16 = vqrshrn_n_s32(q15s32, 14); - d6s16 = vqrshrn_n_s32(q3s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16((int16_t)cospi_10_64); - d1s16 = vdup_n_s16((int16_t)cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16((int16_t)cospi_26_64); - d31s16 = vdup_n_s16((int16_t)cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - d18s16 = vqrshrn_n_s32(q9s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - d8s16 = vqrshrn_n_s32(q4s32, 
14); - d9s16 = vqrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vqrshrn_n_s32(q14s32, 14); - d19s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vqrshrn_n_s32(q1s32, 14); - d29s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q7s32, 14); - d15s16 = vqrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); - - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = 
vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q3s32, 14); - d24s16 = vqrshrn_n_s32(q13s32, 14); - d25s16 = vqrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); - - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); - - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - - d20s16 = vqrshrn_n_s32(q13s32, 14); - d21s16 = vqrshrn_n_s32(q1s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q5s16 = vdupq_n_s16(0); - - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); - return; -} - -void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, const TxfmParam *txfm_param) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - const TX_TYPE tx_type = txfm_param->tx_type; - switch (tx_type) { - case DCT_DCT: // idct_idct is not supported. 
Fall back to C - av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param); - return; - break; - case ADST_DCT: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - break; - case DCT_ADST: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - break; - case ADST_ADST: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - break; - default: // iadst_idct - assert(0); - break; - } - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - 
q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - } - return; -} diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c new file mode 100644 index 000000000..44e064195 --- /dev/null +++ b/third_party/aom/av1/common/arm/reconinter_neon.c @@ -0,0 +1,86 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/blockd.h" +#include "config/av1_rtcd.h" + +void av1_build_compound_diffwtd_mask_d16_neon( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + assert(h >= 4); + assert(w >= 4); + assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38)); + const int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + uint16x8_t diff_q, tmp0, tmp1; + uint8x8_t diff_d, diff_select; + const CONV_BUF_TYPE *src0_1, *src1_1; + const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round)); + const uint8x8_t dup_38 = vdup_n_u8(38); + const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA); + if (mask_type == DIFFWTD_38) { + diff_select = vdup_n_u8(255); + } else { + diff_select = vdup_n_u8(0); + } + if (w >= 8) { + for (int i = 0; i < h; ++i) { + src0_1 = src0; + src1_1 = src1; + for (int j = 0; j < w; j += 8) { + __builtin_prefetch(src0_1); + __builtin_prefetch(src1_1); + diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1)); + diff_q = vrshlq_u16(diff_q, dup_round); + diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2); + diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64); + diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d)); + vst1_u8(mask, diff_d); + src0_1 += 8; + src1_1 += 8; + mask += 8; + } + src0 += src0_stride; + src1 += src1_stride; + } + } else if (w == 4) { + for (int i = 0; i < h; i += 2) { + src0_1 = src0; + src1_1 = src1; + __builtin_prefetch(src0_1 + 0 * src0_stride); + __builtin_prefetch(src0_1 + 1 * src0_stride); + __builtin_prefetch(src1_1 + 0 * src1_stride); + __builtin_prefetch(src1_1 + 1 * src1_stride); + tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)), + vld1_u16(src0_1 + (1 * src0_stride))); + 
tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)), + vld1_u16(src1_1 + (1 * src1_stride))); + diff_q = vabdq_u16(tmp0, tmp1); + diff_q = vrshlq_u16(diff_q, dup_round); + diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2); + diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64); + diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d)); + vst1_u8(mask, diff_d); + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += w * 2; + } + } +} diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h new file mode 100644 index 000000000..53727bb43 --- /dev/null +++ b/third_party/aom/av1/common/arm/transpose_neon.h @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_ + +#include + +static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, uint8x8_t *a7) { + // Swap 8 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = + vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); + const uint8x16x2_t b1 = + vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *a4 = 
vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = 
vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1, + uint16x4_t *a2, uint16x4_t *a3, + uint16x4_t *a4, uint16x4_t *a5, + uint16x4_t *a6, uint16x4_t *a7, + uint16x8_t *o0, uint16x8_t *o1, + uint16x8_t *o2, uint16x8_t *o3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + uint16x4x2_t b0 = vtrn_u16(*a0, *a1); + uint16x4x2_t b1 = vtrn_u16(*a2, *a3); + uint16x4x2_t b2 = vtrn_u16(*a4, *a5); + uint16x4x2_t b3 = vtrn_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b1.val[0])); + uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), + vreinterpret_u32_u16(b1.val[1])); + uint32x2x2_t c2 = 
vtrn_u32(vreinterpret_u32_u16(b2.val[0]), + vreinterpret_u32_u16(b3.val[0])); + uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]), + vreinterpret_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]), + vreinterpret_u16_u32(c2.val[0])); + *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]), + vreinterpret_u16_u32(c3.val[0])); + *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]), + vreinterpret_u16_u32(c2.val[1])); + *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]), + vreinterpret_u16_u32(c3.val[1])); +} + +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // 
c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[0]))); + *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[0]))); + + *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[1]))); + *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[1]))); + + *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[0]))); + *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[0]))); + + *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[1]))); + *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[1]))); +} + +static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[0]))); + *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[0]))); + + *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[1]))); + *a6 = 
vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[1]))); + + *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[0]))); + *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[0]))); + + *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[1]))); + *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c new file mode 100644 index 000000000..72fbed4d4 --- /dev/null +++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/common.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +/* Wiener filter 2D + Apply horizontal filter and store in a temporary buffer. When applying + vertical filter, overwrite the original pixel values. + */ + +void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + uint16_t *d_tmp; + uint8_t *d; + const uint8_t *src_ptr, *s_tmp; + uint16_t *dst_ptr; + (void)x_step_q4; + (void)y_step_q4; + + int width, height; + const int bd = 8; + const int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + int16_t filter_x_tmp[7], filter_y_tmp[7]; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w % 8)); + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(filter_x[7] == 0); + assert(filter_y[7] == 0); + + /* assumption of horizontal filtering output will not exceed 15 bit. 
+ ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15 + 16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1 + */ + assert((conv_params->round_0) >= 1); + + memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS); + memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS); + + filter_x_tmp[3] += (1 << FILTER_BITS); + filter_y_tmp[3] += (1 << FILTER_BITS); + + s_tmp = src - center_tap * src_stride - center_tap; + dst_ptr = temp; + src_ptr = s_tmp; + height = intermediate_height; + + /* if height is a multiple of 8 */ + if (!(h & 7)) { + int16x8_t res0, res1, res2, res3; + uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, 
t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t5)); + res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); + res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t7)); + res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t8)); + res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t9)); + res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, 
conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t10)); + res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10, + &res11); + store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9, + res10, res11); + + t0 = t8; + t1 = t9; + t2 = t10; + t3 = t11; + t4 = t12; + t5 = t13; + t6 = t14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * MAX_SB_SIZE; + height -= 8; + } while (height > 0); + } else { + /*if height is a multiple of 4*/ + int16x8_t tt0, tt1, tt2, tt3; + const uint8_t *s; + uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7; + uint16x8_t d0, d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x4_t s11, s12, s13, s14; + uint8x8_t t0, t1, t2, t3; + + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, + &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/ + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s0 = vget_low_s16(tt0); /*pa0 pb0 pc0 pd0 -- pixel_a0*/ + s1 = vget_low_s16(tt1); /*pa1 pb1 pc1 pd1 */ + s2 = vget_low_s16(tt2); /*pa2 pb2 pc2 pd2 */ + s3 = vget_low_s16(tt3); /*pa3 pb3 pc3 pd3 */ + s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */ + s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */ + s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */ + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + 
__builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s7 = vget_low_s16(tt0); /*pa7 pb7 pc7 pd7 */ /*4x8*/ + s8 = vget_low_s16(tt1); /*pa8 pb8 pc8 pd8 */ + s9 = vget_low_s16(tt2); /*pa9 pb9 pc9 pd9 */ + s10 = vget_low_s16(tt3); /*pa10 pb10 pc10 pd10 */ + s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */ + s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */ + s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */ + s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */ + + res0 = wiener_convolve8_horiz_4x8( + s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0); + res1 = wiener_convolve8_horiz_4x8( + s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0); + res2 = wiener_convolve8_horiz_4x8( + s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0); + res3 = wiener_convolve8_horiz_4x8( + s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0); + res4 = + wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10, + filter_x_tmp, bd, conv_params->round_0); + res5 = + wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11, + filter_x_tmp, bd, conv_params->round_0); + res6 = + wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12, + filter_x_tmp, bd, conv_params->round_0); + res7 = + wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13, + filter_x_tmp, bd, conv_params->round_0); + + transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7, &d0, &d1, &d2, &d3); + + store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 
= s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * MAX_SB_SIZE; + height -= 4; + } while (height > 0); + } + + { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x8_t t0, t1, t2, t3; + int16_t *src_tmp_ptr, *s; + uint8_t *dst_tmp_ptr; + height = h; + width = w; + src_tmp_ptr = (int16_t *)temp; + dst_tmp_ptr = dst; + src_stride = MAX_SB_SIZE; + + do { + s = src_tmp_ptr; + s0 = vld1q_s16(s); + s += src_stride; + s1 = vld1q_s16(s); + s += src_stride; + s2 = vld1q_s16(s); + s += src_stride; + s3 = vld1q_s16(s); + s += src_stride; + s4 = vld1q_s16(s); + s += src_stride; + s5 = vld1q_s16(s); + s += src_stride; + s6 = vld1q_s16(s); + s += src_stride; + d = dst_tmp_ptr; + height = h; + + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + s8 = vld1q_s16(s); + s += src_stride; + s9 = vld1q_s16(s); + s += src_stride; + s10 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp, + bd, conv_params->round_1); + t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp, + bd, conv_params->round_1); + t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 3); + + if (height != 0) { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + + do { + s7 = 
vld1q_s16(s); + s += src_stride; + + t0 = + wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, + filter_y_tmp, bd, conv_params->round_1); + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + } + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); + } +} diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.c b/third_party/aom/av1/common/av1_fwd_txfm1d.c deleted file mode 100644 index c9c7f437e..000000000 --- a/third_party/aom/av1/common/av1_fwd_txfm1d.c +++ /dev/null @@ -1,2355 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include "aom_dsp/inv_txfm.h" -#include "av1/common/av1_fwd_txfm1d.h" -#if CONFIG_COEFFICIENT_RANGE_CHECKING - -void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, - int32_t size, int8_t bit); - -#define range_check(stage, input, buf, size, bit) \ - range_check_func(stage, input, buf, size, bit) -#else -#define range_check(stage, input, buf, size, bit) \ - { \ - (void)stage; \ - (void)input; \ - (void)buf; \ - (void)size; \ - (void)bit; \ - } -#endif - -// TODO(angiebird): Make 1-d txfm functions static -void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, - const int8_t *stage_range) { - const int32_t size = 4; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[4]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[0] + input[3]; - bf1[1] = input[1] + input[2]; - bf1[2] = -input[2] + input[1]; - bf1[3] = -input[3] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[2]; - bf1[2] = bf0[1]; - bf1[3] = bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, - const int8_t *stage_range) { - const int32_t size = 8; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[8]; - - // stage 0; - 
range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[0] + input[7]; - bf1[1] = input[1] + input[6]; - bf1[2] = input[2] + input[5]; - bf1[3] = input[3] + input[4]; - bf1[4] = -input[4] + input[3]; - bf1[5] = -input[5] + input[2]; - bf1[6] = -input[6] + input[1]; - bf1[7] = -input[7] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = -bf0[2] + bf0[1]; - bf1[3] = -bf0[3] + bf0[0]; - bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]); - bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = -bf0[5] + bf0[4]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[7] + bf0[6]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]); - bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - 
stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[4]; - bf1[2] = bf0[2]; - bf1[3] = bf0[6]; - bf1[4] = bf0[1]; - bf1[5] = bf0[5]; - bf1[6] = bf0[3]; - bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fdct16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 16; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[16]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[0] + input[15]; - bf1[1] = input[1] + input[14]; - bf1[2] = input[2] + input[13]; - bf1[3] = input[3] + input[12]; - bf1[4] = input[4] + input[11]; - bf1[5] = input[5] + input[10]; - bf1[6] = input[6] + input[9]; - bf1[7] = input[7] + input[8]; - bf1[8] = -input[8] + input[7]; - bf1[9] = -input[9] + input[6]; - bf1[10] = -input[10] + input[5]; - bf1[11] = -input[11] + input[4]; - bf1[12] = -input[12] + input[3]; - bf1[13] = -input[13] + input[2]; - bf1[14] = -input[14] + input[1]; - bf1[15] = -input[15] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = -bf0[4] + bf0[3]; - bf1[5] = -bf0[5] + bf0[2]; - bf1[6] = -bf0[6] + bf0[1]; - bf1[7] = -bf0[7] + bf0[0]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]); - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - 
stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = -bf0[2] + bf0[1]; - bf1[3] = -bf0[3] + bf0[0]; - bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]); - bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + bf0[10]; - bf1[10] = -bf0[10] + bf0[9]; - bf1[11] = -bf0[11] + bf0[8]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[14] + bf0[13]; - bf1[15] = bf0[15] + bf0[12]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = -bf0[5] + bf0[4]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[7] + bf0[6]; - bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]); - bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[24], 
bf0[6], -cospi[40], bf0[5], cos_bit[stage]); - bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] = -bf0[9] + bf0[8]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[11] + bf0[10]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = -bf0[13] + bf0[12]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[15] + bf0[14]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]); - bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[8]; - bf1[2] = bf0[4]; - bf1[3] = bf0[12]; - bf1[4] = bf0[2]; - bf1[5] = bf0[10]; - bf1[6] = bf0[6]; - bf1[7] = bf0[14]; - bf1[8] = bf0[1]; - bf1[9] = bf0[9]; - bf1[10] = bf0[5]; - bf1[11] = bf0[13]; - bf1[12] = bf0[3]; - bf1[13] = bf0[11]; - bf1[14] = bf0[7]; - bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fdct32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 32; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - 
int32_t step[32]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[0] + input[31]; - bf1[1] = input[1] + input[30]; - bf1[2] = input[2] + input[29]; - bf1[3] = input[3] + input[28]; - bf1[4] = input[4] + input[27]; - bf1[5] = input[5] + input[26]; - bf1[6] = input[6] + input[25]; - bf1[7] = input[7] + input[24]; - bf1[8] = input[8] + input[23]; - bf1[9] = input[9] + input[22]; - bf1[10] = input[10] + input[21]; - bf1[11] = input[11] + input[20]; - bf1[12] = input[12] + input[19]; - bf1[13] = input[13] + input[18]; - bf1[14] = input[14] + input[17]; - bf1[15] = input[15] + input[16]; - bf1[16] = -input[16] + input[15]; - bf1[17] = -input[17] + input[14]; - bf1[18] = -input[18] + input[13]; - bf1[19] = -input[19] + input[12]; - bf1[20] = -input[20] + input[11]; - bf1[21] = -input[21] + input[10]; - bf1[22] = -input[22] + input[9]; - bf1[23] = -input[23] + input[8]; - bf1[24] = -input[24] + input[7]; - bf1[25] = -input[25] + input[6]; - bf1[26] = -input[26] + input[5]; - bf1[27] = -input[27] + input[4]; - bf1[28] = -input[28] + input[3]; - bf1[29] = -input[29] + input[2]; - bf1[30] = -input[30] + input[1]; - bf1[31] = -input[31] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0] + bf0[15]; - bf1[1] = bf0[1] + bf0[14]; - bf1[2] = bf0[2] + bf0[13]; - bf1[3] = bf0[3] + bf0[12]; - bf1[4] = bf0[4] + bf0[11]; - bf1[5] = bf0[5] + bf0[10]; - bf1[6] = bf0[6] + bf0[9]; - bf1[7] = bf0[7] + bf0[8]; - bf1[8] = -bf0[8] + bf0[7]; - bf1[9] = -bf0[9] + bf0[6]; - bf1[10] = -bf0[10] + bf0[5]; - bf1[11] = -bf0[11] + bf0[4]; - bf1[12] = -bf0[12] + bf0[3]; - bf1[13] = -bf0[13] + bf0[2]; - bf1[14] = -bf0[14] + bf0[1]; - bf1[15] = -bf0[15] + bf0[0]; - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = half_btf(-cospi[32], bf0[20], 
cospi[32], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]); - bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]); - bf1[28] = bf0[28]; - bf1[29] = bf0[29]; - bf1[30] = bf0[30]; - bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = -bf0[4] + bf0[3]; - bf1[5] = -bf0[5] + bf0[2]; - bf1[6] = -bf0[6] + bf0[1]; - bf1[7] = -bf0[7] + bf0[0]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]); - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[23]; - bf1[17] = bf0[17] + bf0[22]; - bf1[18] = bf0[18] + bf0[21]; - bf1[19] = bf0[19] + bf0[20]; - bf1[20] = -bf0[20] + bf0[19]; - bf1[21] = -bf0[21] + bf0[18]; - bf1[22] = -bf0[22] + bf0[17]; - bf1[23] = -bf0[23] + bf0[16]; - bf1[24] = -bf0[24] + bf0[31]; - bf1[25] = -bf0[25] + bf0[30]; - bf1[26] = -bf0[26] + bf0[29]; - bf1[27] = -bf0[27] + bf0[28]; - bf1[28] = bf0[28] + bf0[27]; - bf1[29] = bf0[29] + bf0[26]; - bf1[30] = bf0[30] + bf0[25]; - bf1[31] = bf0[31] + bf0[24]; - range_check(stage, input, bf1, size, 
stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = -bf0[2] + bf0[1]; - bf1[3] = -bf0[3] + bf0[0]; - bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]); - bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + bf0[10]; - bf1[10] = -bf0[10] + bf0[9]; - bf1[11] = -bf0[11] + bf0[8]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[14] + bf0[13]; - bf1[15] = bf0[15] + bf0[12]; - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]); - bf1[22] = bf0[22]; - bf1[23] = bf0[23]; - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]); - bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]); - bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]); - bf1[30] = bf0[30]; - bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = -bf0[5] + bf0[4]; - bf1[6] = -bf0[6] + 
bf0[7]; - bf1[7] = bf0[7] + bf0[6]; - bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]); - bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[19]; - bf1[17] = bf0[17] + bf0[18]; - bf1[18] = -bf0[18] + bf0[17]; - bf1[19] = -bf0[19] + bf0[16]; - bf1[20] = -bf0[20] + bf0[23]; - bf1[21] = -bf0[21] + bf0[22]; - bf1[22] = bf0[22] + bf0[21]; - bf1[23] = bf0[23] + bf0[20]; - bf1[24] = bf0[24] + bf0[27]; - bf1[25] = bf0[25] + bf0[26]; - bf1[26] = -bf0[26] + bf0[25]; - bf1[27] = -bf0[27] + bf0[24]; - bf1[28] = -bf0[28] + bf0[31]; - bf1[29] = -bf0[29] + bf0[30]; - bf1[30] = bf0[30] + bf0[29]; - bf1[31] = bf0[31] + bf0[28]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]); - bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] = -bf0[9] + bf0[8]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[11] + bf0[10]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = -bf0[13] + bf0[12]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[15] + bf0[14]; - bf1[16] = bf0[16]; - bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]); - bf1[19] = bf0[19]; - bf1[20] = bf0[20]; - bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], 
cos_bit[stage]); - bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]); - bf1[23] = bf0[23]; - bf1[24] = bf0[24]; - bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]); - bf1[27] = bf0[27]; - bf1[28] = bf0[28]; - bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]); - bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]); - bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]); - bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]); - bf1[16] = bf0[16] + bf0[17]; - bf1[17] = -bf0[17] + bf0[16]; - bf1[18] = -bf0[18] + bf0[19]; - bf1[19] = bf0[19] + bf0[18]; - bf1[20] = bf0[20] + bf0[21]; - bf1[21] = -bf0[21] + bf0[20]; - bf1[22] = -bf0[22] + bf0[23]; - bf1[23] = bf0[23] + bf0[22]; - bf1[24] = bf0[24] + bf0[25]; - bf1[25] = -bf0[25] + bf0[24]; - bf1[26] = -bf0[26] + bf0[27]; - bf1[27] = bf0[27] + bf0[26]; - bf1[28] = bf0[28] + bf0[29]; - bf1[29] = -bf0[29] + bf0[28]; - bf1[30] = -bf0[30] + bf0[31]; - bf1[31] = bf0[31] + bf0[30]; - range_check(stage, input, bf1, size, 
stage_range[stage]); - - // stage 8 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]); - bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]); - bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]); - bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]); - bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]); - bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]); - bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 9 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[16]; - bf1[2] = bf0[8]; - bf1[3] = bf0[24]; - bf1[4] = bf0[4]; - bf1[5] = bf0[20]; - bf1[6] = bf0[12]; - bf1[7] = bf0[28]; - bf1[8] = bf0[2]; - bf1[9] = bf0[18]; - 
bf1[10] = bf0[10]; - bf1[11] = bf0[26]; - bf1[12] = bf0[6]; - bf1[13] = bf0[22]; - bf1[14] = bf0[14]; - bf1[15] = bf0[30]; - bf1[16] = bf0[1]; - bf1[17] = bf0[17]; - bf1[18] = bf0[9]; - bf1[19] = bf0[25]; - bf1[20] = bf0[5]; - bf1[21] = bf0[21]; - bf1[22] = bf0[13]; - bf1[23] = bf0[29]; - bf1[24] = bf0[3]; - bf1[25] = bf0[19]; - bf1[26] = bf0[11]; - bf1[27] = bf0[27]; - bf1[28] = bf0[7]; - bf1[29] = bf0[23]; - bf1[30] = bf0[15]; - bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fadst4_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 4; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[4]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[3]; - bf1[1] = input[0]; - bf1[2] = input[1]; - bf1[3] = input[2]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = -bf0[2] + bf0[0]; - bf1[3] = -bf0[3] + bf0[1]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]); - range_check(stage, 
input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = -bf0[2]; - bf1[2] = bf0[3]; - bf1[3] = -bf0[1]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fadst8_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 8; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[8]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[7]; - bf1[1] = input[0]; - bf1[2] = input[5]; - bf1[3] = input[2]; - bf1[4] = input[3]; - bf1[5] = input[4]; - bf1[6] = input[1]; - bf1[7] = input[6]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]); - bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = -bf0[4] + bf0[0]; - bf1[5] = -bf0[5] + bf0[1]; - bf1[6] = -bf0[6] + bf0[2]; - bf1[7] = -bf0[7] + bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 
= step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = -bf0[2] + bf0[0]; - bf1[3] = -bf0[3] + bf0[1]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = -bf0[6] + bf0[4]; - bf1[7] = -bf0[7] + bf0[5]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = -bf0[4]; - bf1[2] = bf0[6]; - bf1[3] = -bf0[2]; - bf1[4] = bf0[3]; - bf1[5] = -bf0[7]; - bf1[6] = bf0[5]; - bf1[7] = -bf0[1]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fadst16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 16; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[16]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[15]; - bf1[1] = input[0]; - 
bf1[2] = input[13]; - bf1[3] = input[2]; - bf1[4] = input[11]; - bf1[5] = input[4]; - bf1[6] = input[9]; - bf1[7] = input[6]; - bf1[8] = input[7]; - bf1[9] = input[8]; - bf1[10] = input[5]; - bf1[11] = input[10]; - bf1[12] = input[3]; - bf1[13] = input[12]; - bf1[14] = input[1]; - bf1[15] = input[14]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]); - bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]); - bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]); - bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]); - bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[8]; - bf1[1] = bf0[1] + bf0[9]; - bf1[2] = bf0[2] + bf0[10]; - bf1[3] = bf0[3] + bf0[11]; - bf1[4] = bf0[4] + bf0[12]; - bf1[5] = bf0[5] + bf0[13]; - bf1[6] = bf0[6] + bf0[14]; - 
bf1[7] = bf0[7] + bf0[15]; - bf1[8] = -bf0[8] + bf0[0]; - bf1[9] = -bf0[9] + bf0[1]; - bf1[10] = -bf0[10] + bf0[2]; - bf1[11] = -bf0[11] + bf0[3]; - bf1[12] = -bf0[12] + bf0[4]; - bf1[13] = -bf0[13] + bf0[5]; - bf1[14] = -bf0[14] + bf0[6]; - bf1[15] = -bf0[15] + bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]); - bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]); - bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = -bf0[4] + bf0[0]; - bf1[5] = -bf0[5] + bf0[1]; - bf1[6] = -bf0[6] + bf0[2]; - bf1[7] = -bf0[7] + bf0[3]; - bf1[8] = bf0[8] + bf0[12]; - bf1[9] = bf0[9] + bf0[13]; - bf1[10] = bf0[10] + bf0[14]; - bf1[11] = bf0[11] + bf0[15]; - bf1[12] = -bf0[12] + bf0[8]; - bf1[13] = -bf0[13] + bf0[9]; - bf1[14] = -bf0[14] + bf0[10]; - bf1[15] = -bf0[15] + bf0[11]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - 
bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = -bf0[2] + bf0[0]; - bf1[3] = -bf0[3] + bf0[1]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = -bf0[6] + bf0[4]; - bf1[7] = -bf0[7] + bf0[5]; - bf1[8] = bf0[8] + bf0[10]; - bf1[9] = bf0[9] + bf0[11]; - bf1[10] = -bf0[10] + bf0[8]; - bf1[11] = -bf0[11] + bf0[9]; - bf1[12] = bf0[12] + bf0[14]; - bf1[13] = bf0[13] + bf0[15]; - bf1[14] = -bf0[14] + bf0[12]; - bf1[15] = -bf0[15] + bf0[13]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 8 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], 
cospi[32], bf0[10], cos_bit[stage]); - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 9 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = -bf0[8]; - bf1[2] = bf0[12]; - bf1[3] = -bf0[4]; - bf1[4] = bf0[6]; - bf1[5] = -bf0[14]; - bf1[6] = bf0[10]; - bf1[7] = -bf0[2]; - bf1[8] = bf0[3]; - bf1[9] = -bf0[11]; - bf1[10] = bf0[15]; - bf1[11] = -bf0[7]; - bf1[12] = bf0[5]; - bf1[13] = -bf0[13]; - bf1[14] = bf0[9]; - bf1[15] = -bf0[1]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_fadst32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 32; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[32]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - bf1 = output; - bf1[0] = input[31]; - bf1[1] = input[0]; - bf1[2] = input[29]; - bf1[3] = input[2]; - bf1[4] = input[27]; - bf1[5] = input[4]; - bf1[6] = input[25]; - bf1[7] = input[6]; - bf1[8] = input[23]; - bf1[9] = input[8]; - bf1[10] = input[21]; - bf1[11] = input[10]; - bf1[12] = input[19]; - bf1[13] = input[12]; - bf1[14] = input[17]; - bf1[15] = input[14]; - bf1[16] = input[15]; - bf1[17] = input[16]; - bf1[18] = input[13]; - bf1[19] = input[18]; - bf1[20] = input[11]; - bf1[21] = input[20]; - bf1[22] = input[9]; - bf1[23] = input[22]; - bf1[24] = input[7]; - bf1[25] = input[24]; - bf1[26] = input[5]; - bf1[27] = input[26]; - bf1[28] = input[3]; - bf1[29] = input[28]; - bf1[30] = input[1]; - bf1[31] = input[30]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[1], bf0[0], 
cospi[63], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]); - bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]); - bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]); - bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]); - bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]); - bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]); - bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]); - bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]); - bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]); - bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]); - bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]); - bf1[26] = half_btf(cospi[53], 
bf0[26], cospi[11], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]); - bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]); - bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[16]; - bf1[1] = bf0[1] + bf0[17]; - bf1[2] = bf0[2] + bf0[18]; - bf1[3] = bf0[3] + bf0[19]; - bf1[4] = bf0[4] + bf0[20]; - bf1[5] = bf0[5] + bf0[21]; - bf1[6] = bf0[6] + bf0[22]; - bf1[7] = bf0[7] + bf0[23]; - bf1[8] = bf0[8] + bf0[24]; - bf1[9] = bf0[9] + bf0[25]; - bf1[10] = bf0[10] + bf0[26]; - bf1[11] = bf0[11] + bf0[27]; - bf1[12] = bf0[12] + bf0[28]; - bf1[13] = bf0[13] + bf0[29]; - bf1[14] = bf0[14] + bf0[30]; - bf1[15] = bf0[15] + bf0[31]; - bf1[16] = -bf0[16] + bf0[0]; - bf1[17] = -bf0[17] + bf0[1]; - bf1[18] = -bf0[18] + bf0[2]; - bf1[19] = -bf0[19] + bf0[3]; - bf1[20] = -bf0[20] + bf0[4]; - bf1[21] = -bf0[21] + bf0[5]; - bf1[22] = -bf0[22] + bf0[6]; - bf1[23] = -bf0[23] + bf0[7]; - bf1[24] = -bf0[24] + bf0[8]; - bf1[25] = -bf0[25] + bf0[9]; - bf1[26] = -bf0[26] + bf0[10]; - bf1[27] = -bf0[27] + bf0[11]; - bf1[28] = -bf0[28] + bf0[12]; - bf1[29] = -bf0[29] + bf0[13]; - bf1[30] = -bf0[30] + bf0[14]; - bf1[31] = -bf0[31] + bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - 
bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]); - bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]); - bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]); - bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]); - bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]); - bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]); - bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]); - bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]); - bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[8]; - bf1[1] = bf0[1] + bf0[9]; - bf1[2] = bf0[2] + bf0[10]; - bf1[3] = bf0[3] + bf0[11]; - bf1[4] = bf0[4] + bf0[12]; - bf1[5] = bf0[5] + bf0[13]; - bf1[6] = bf0[6] + bf0[14]; - bf1[7] = bf0[7] + bf0[15]; - bf1[8] = -bf0[8] + bf0[0]; - bf1[9] = -bf0[9] + bf0[1]; - bf1[10] = -bf0[10] + bf0[2]; - bf1[11] = -bf0[11] + bf0[3]; - bf1[12] = -bf0[12] + bf0[4]; - bf1[13] = -bf0[13] + bf0[5]; - bf1[14] = -bf0[14] + bf0[6]; - bf1[15] = -bf0[15] + bf0[7]; - bf1[16] = bf0[16] + bf0[24]; - bf1[17] = bf0[17] + bf0[25]; - bf1[18] = bf0[18] + bf0[26]; - bf1[19] = bf0[19] + bf0[27]; - bf1[20] = bf0[20] + bf0[28]; - 
bf1[21] = bf0[21] + bf0[29]; - bf1[22] = bf0[22] + bf0[30]; - bf1[23] = bf0[23] + bf0[31]; - bf1[24] = -bf0[24] + bf0[16]; - bf1[25] = -bf0[25] + bf0[17]; - bf1[26] = -bf0[26] + bf0[18]; - bf1[27] = -bf0[27] + bf0[19]; - bf1[28] = -bf0[28] + bf0[20]; - bf1[29] = -bf0[29] + bf0[21]; - bf1[30] = -bf0[30] + bf0[22]; - bf1[31] = -bf0[31] + bf0[23]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]); - bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]); - bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = bf0[20]; - bf1[21] = bf0[21]; - bf1[22] = bf0[22]; - bf1[23] = bf0[23]; - bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]); - bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]); - bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]); - bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]); - bf1[31] = 
half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = -bf0[4] + bf0[0]; - bf1[5] = -bf0[5] + bf0[1]; - bf1[6] = -bf0[6] + bf0[2]; - bf1[7] = -bf0[7] + bf0[3]; - bf1[8] = bf0[8] + bf0[12]; - bf1[9] = bf0[9] + bf0[13]; - bf1[10] = bf0[10] + bf0[14]; - bf1[11] = bf0[11] + bf0[15]; - bf1[12] = -bf0[12] + bf0[8]; - bf1[13] = -bf0[13] + bf0[9]; - bf1[14] = -bf0[14] + bf0[10]; - bf1[15] = -bf0[15] + bf0[11]; - bf1[16] = bf0[16] + bf0[20]; - bf1[17] = bf0[17] + bf0[21]; - bf1[18] = bf0[18] + bf0[22]; - bf1[19] = bf0[19] + bf0[23]; - bf1[20] = -bf0[20] + bf0[16]; - bf1[21] = -bf0[21] + bf0[17]; - bf1[22] = -bf0[22] + bf0[18]; - bf1[23] = -bf0[23] + bf0[19]; - bf1[24] = bf0[24] + bf0[28]; - bf1[25] = bf0[25] + bf0[29]; - bf1[26] = bf0[26] + bf0[30]; - bf1[27] = bf0[27] + bf0[31]; - bf1[28] = -bf0[28] + bf0[24]; - bf1[29] = -bf0[29] + bf0[25]; - bf1[30] = -bf0[30] + bf0[26]; - bf1[31] = -bf0[31] + bf0[27]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 8 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]); - bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]); - bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], 
cos_bit[stage]); - bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]); - bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]); - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = bf0[26]; - bf1[27] = bf0[27]; - bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]); - bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 9 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = -bf0[2] + bf0[0]; - bf1[3] = -bf0[3] + bf0[1]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = -bf0[6] + bf0[4]; - bf1[7] = -bf0[7] + bf0[5]; - bf1[8] = bf0[8] + bf0[10]; - bf1[9] = bf0[9] + bf0[11]; - bf1[10] = -bf0[10] + bf0[8]; - bf1[11] = -bf0[11] + bf0[9]; - bf1[12] = bf0[12] + bf0[14]; - bf1[13] = bf0[13] + bf0[15]; - bf1[14] = -bf0[14] + bf0[12]; - bf1[15] = -bf0[15] + bf0[13]; - bf1[16] = bf0[16] + bf0[18]; - bf1[17] = bf0[17] + bf0[19]; - bf1[18] = -bf0[18] + bf0[16]; - bf1[19] = -bf0[19] + bf0[17]; - bf1[20] = bf0[20] + bf0[22]; - bf1[21] = bf0[21] + bf0[23]; - bf1[22] = -bf0[22] + bf0[20]; - bf1[23] = -bf0[23] + bf0[21]; - bf1[24] = bf0[24] + bf0[26]; - bf1[25] = bf0[25] + bf0[27]; - bf1[26] = -bf0[26] + bf0[24]; - bf1[27] = -bf0[27] + bf0[25]; - bf1[28] = bf0[28] + bf0[30]; - bf1[29] = bf0[29] + bf0[31]; - bf1[30] = -bf0[30] + bf0[28]; - bf1[31] = -bf0[31] + bf0[29]; - 
range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 10 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]); - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]); - bf1[20] = bf0[20]; - bf1[21] = bf0[21]; - bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]); - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]); - bf1[28] = bf0[28]; - bf1[29] = bf0[29]; - bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 11 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = -bf0[16]; - bf1[2] = bf0[24]; - bf1[3] = -bf0[8]; - bf1[4] = bf0[12]; - bf1[5] = -bf0[28]; - bf1[6] = bf0[20]; - bf1[7] = -bf0[4]; - 
bf1[8] = bf0[6]; - bf1[9] = -bf0[22]; - bf1[10] = bf0[30]; - bf1[11] = -bf0[14]; - bf1[12] = bf0[10]; - bf1[13] = -bf0[26]; - bf1[14] = bf0[18]; - bf1[15] = -bf0[2]; - bf1[16] = bf0[3]; - bf1[17] = -bf0[19]; - bf1[18] = bf0[27]; - bf1[19] = -bf0[11]; - bf1[20] = bf0[15]; - bf1[21] = -bf0[31]; - bf1[22] = bf0[23]; - bf1[23] = -bf0[7]; - bf1[24] = bf0[5]; - bf1[25] = -bf0[21]; - bf1[26] = bf0[29]; - bf1[27] = -bf0[13]; - bf1[28] = bf0[9]; - bf1[29] = -bf0[25]; - bf1[30] = bf0[17]; - bf1[31] = -bf0[1]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -#if CONFIG_EXT_TX -void av1_fidentity4_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 4; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2); - range_check(0, input, output, 4, stage_range[0]); -} - -void av1_fidentity8_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; - range_check(0, input, output, 8, stage_range[0]); -} - -void av1_fidentity16_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 16; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2); - range_check(0, input, output, 16, stage_range[0]); -} - -void av1_fidentity32_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; - range_check(0, input, output, 32, stage_range[0]); -} - -#if CONFIG_TX64X64 -void av1_fidentity64_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 64; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2); - range_check(0, input, output, 64, stage_range[0]); -} -#endif // CONFIG_TX64X64 
-#endif // CONFIG_EXT_TX - -#if CONFIG_TX64X64 -void av1_fdct64_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 64; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[64]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf1 = output; - bf1[0] = input[0] + input[63]; - bf1[1] = input[1] + input[62]; - bf1[2] = input[2] + input[61]; - bf1[3] = input[3] + input[60]; - bf1[4] = input[4] + input[59]; - bf1[5] = input[5] + input[58]; - bf1[6] = input[6] + input[57]; - bf1[7] = input[7] + input[56]; - bf1[8] = input[8] + input[55]; - bf1[9] = input[9] + input[54]; - bf1[10] = input[10] + input[53]; - bf1[11] = input[11] + input[52]; - bf1[12] = input[12] + input[51]; - bf1[13] = input[13] + input[50]; - bf1[14] = input[14] + input[49]; - bf1[15] = input[15] + input[48]; - bf1[16] = input[16] + input[47]; - bf1[17] = input[17] + input[46]; - bf1[18] = input[18] + input[45]; - bf1[19] = input[19] + input[44]; - bf1[20] = input[20] + input[43]; - bf1[21] = input[21] + input[42]; - bf1[22] = input[22] + input[41]; - bf1[23] = input[23] + input[40]; - bf1[24] = input[24] + input[39]; - bf1[25] = input[25] + input[38]; - bf1[26] = input[26] + input[37]; - bf1[27] = input[27] + input[36]; - bf1[28] = input[28] + input[35]; - bf1[29] = input[29] + input[34]; - bf1[30] = input[30] + input[33]; - bf1[31] = input[31] + input[32]; - bf1[32] = -input[32] + input[31]; - bf1[33] = -input[33] + input[30]; - bf1[34] = -input[34] + input[29]; - bf1[35] = -input[35] + input[28]; - bf1[36] = -input[36] + input[27]; - bf1[37] = -input[37] + input[26]; - bf1[38] = -input[38] + input[25]; - bf1[39] = -input[39] + input[24]; - bf1[40] = -input[40] + input[23]; - bf1[41] = -input[41] + input[22]; - bf1[42] = -input[42] + input[21]; - bf1[43] = -input[43] + input[20]; - bf1[44] = -input[44] + 
input[19]; - bf1[45] = -input[45] + input[18]; - bf1[46] = -input[46] + input[17]; - bf1[47] = -input[47] + input[16]; - bf1[48] = -input[48] + input[15]; - bf1[49] = -input[49] + input[14]; - bf1[50] = -input[50] + input[13]; - bf1[51] = -input[51] + input[12]; - bf1[52] = -input[52] + input[11]; - bf1[53] = -input[53] + input[10]; - bf1[54] = -input[54] + input[9]; - bf1[55] = -input[55] + input[8]; - bf1[56] = -input[56] + input[7]; - bf1[57] = -input[57] + input[6]; - bf1[58] = -input[58] + input[5]; - bf1[59] = -input[59] + input[4]; - bf1[60] = -input[60] + input[3]; - bf1[61] = -input[61] + input[2]; - bf1[62] = -input[62] + input[1]; - bf1[63] = -input[63] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0] + bf0[31]; - bf1[1] = bf0[1] + bf0[30]; - bf1[2] = bf0[2] + bf0[29]; - bf1[3] = bf0[3] + bf0[28]; - bf1[4] = bf0[4] + bf0[27]; - bf1[5] = bf0[5] + bf0[26]; - bf1[6] = bf0[6] + bf0[25]; - bf1[7] = bf0[7] + bf0[24]; - bf1[8] = bf0[8] + bf0[23]; - bf1[9] = bf0[9] + bf0[22]; - bf1[10] = bf0[10] + bf0[21]; - bf1[11] = bf0[11] + bf0[20]; - bf1[12] = bf0[12] + bf0[19]; - bf1[13] = bf0[13] + bf0[18]; - bf1[14] = bf0[14] + bf0[17]; - bf1[15] = bf0[15] + bf0[16]; - bf1[16] = -bf0[16] + bf0[15]; - bf1[17] = -bf0[17] + bf0[14]; - bf1[18] = -bf0[18] + bf0[13]; - bf1[19] = -bf0[19] + bf0[12]; - bf1[20] = -bf0[20] + bf0[11]; - bf1[21] = -bf0[21] + bf0[10]; - bf1[22] = -bf0[22] + bf0[9]; - bf1[23] = -bf0[23] + bf0[8]; - bf1[24] = -bf0[24] + bf0[7]; - bf1[25] = -bf0[25] + bf0[6]; - bf1[26] = -bf0[26] + bf0[5]; - bf1[27] = -bf0[27] + bf0[4]; - bf1[28] = -bf0[28] + bf0[3]; - bf1[29] = -bf0[29] + bf0[2]; - bf1[30] = -bf0[30] + bf0[1]; - bf1[31] = -bf0[31] + bf0[0]; - bf1[32] = bf0[32]; - bf1[33] = bf0[33]; - bf1[34] = bf0[34]; - bf1[35] = bf0[35]; - bf1[36] = bf0[36]; - bf1[37] = bf0[37]; - bf1[38] = bf0[38]; - bf1[39] = bf0[39]; - bf1[40] = 
half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]); - bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]); - bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]); - bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]); - bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]); - bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]); - bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]); - bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]); - bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]); - bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]); - bf1[56] = bf0[56]; - bf1[57] = bf0[57]; - bf1[58] = bf0[58]; - bf1[59] = bf0[59]; - bf1[60] = bf0[60]; - bf1[61] = bf0[61]; - bf1[62] = bf0[62]; - bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[15]; - bf1[1] = bf0[1] + bf0[14]; - bf1[2] = bf0[2] + bf0[13]; - bf1[3] = bf0[3] + bf0[12]; - bf1[4] = bf0[4] + bf0[11]; - bf1[5] = bf0[5] + bf0[10]; - bf1[6] = bf0[6] + bf0[9]; - bf1[7] = bf0[7] + bf0[8]; - bf1[8] = -bf0[8] + bf0[7]; - bf1[9] = -bf0[9] + bf0[6]; - bf1[10] = -bf0[10] + bf0[5]; - bf1[11] = -bf0[11] + bf0[4]; - bf1[12] = -bf0[12] + bf0[3]; - bf1[13] = -bf0[13] + bf0[2]; - bf1[14] = -bf0[14] + 
bf0[1]; - bf1[15] = -bf0[15] + bf0[0]; - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]); - bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]); - bf1[28] = bf0[28]; - bf1[29] = bf0[29]; - bf1[30] = bf0[30]; - bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[47]; - bf1[33] = bf0[33] + bf0[46]; - bf1[34] = bf0[34] + bf0[45]; - bf1[35] = bf0[35] + bf0[44]; - bf1[36] = bf0[36] + bf0[43]; - bf1[37] = bf0[37] + bf0[42]; - bf1[38] = bf0[38] + bf0[41]; - bf1[39] = bf0[39] + bf0[40]; - bf1[40] = -bf0[40] + bf0[39]; - bf1[41] = -bf0[41] + bf0[38]; - bf1[42] = -bf0[42] + bf0[37]; - bf1[43] = -bf0[43] + bf0[36]; - bf1[44] = -bf0[44] + bf0[35]; - bf1[45] = -bf0[45] + bf0[34]; - bf1[46] = -bf0[46] + bf0[33]; - bf1[47] = -bf0[47] + bf0[32]; - bf1[48] = -bf0[48] + bf0[63]; - bf1[49] = -bf0[49] + bf0[62]; - bf1[50] = -bf0[50] + bf0[61]; - bf1[51] = -bf0[51] + bf0[60]; - bf1[52] = -bf0[52] + bf0[59]; - bf1[53] = -bf0[53] + bf0[58]; - bf1[54] = -bf0[54] + bf0[57]; - bf1[55] = -bf0[55] + bf0[56]; - bf1[56] = bf0[56] + bf0[55]; - bf1[57] = bf0[57] + bf0[54]; - bf1[58] = bf0[58] + bf0[53]; - bf1[59] = bf0[59] + bf0[52]; - bf1[60] = bf0[60] + bf0[51]; - bf1[61] = bf0[61] + bf0[50]; - bf1[62] = bf0[62] + bf0[49]; - bf1[63] = bf0[63] + bf0[48]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - 
bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = -bf0[4] + bf0[3]; - bf1[5] = -bf0[5] + bf0[2]; - bf1[6] = -bf0[6] + bf0[1]; - bf1[7] = -bf0[7] + bf0[0]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]); - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[23]; - bf1[17] = bf0[17] + bf0[22]; - bf1[18] = bf0[18] + bf0[21]; - bf1[19] = bf0[19] + bf0[20]; - bf1[20] = -bf0[20] + bf0[19]; - bf1[21] = -bf0[21] + bf0[18]; - bf1[22] = -bf0[22] + bf0[17]; - bf1[23] = -bf0[23] + bf0[16]; - bf1[24] = -bf0[24] + bf0[31]; - bf1[25] = -bf0[25] + bf0[30]; - bf1[26] = -bf0[26] + bf0[29]; - bf1[27] = -bf0[27] + bf0[28]; - bf1[28] = bf0[28] + bf0[27]; - bf1[29] = bf0[29] + bf0[26]; - bf1[30] = bf0[30] + bf0[25]; - bf1[31] = bf0[31] + bf0[24]; - bf1[32] = bf0[32]; - bf1[33] = bf0[33]; - bf1[34] = bf0[34]; - bf1[35] = bf0[35]; - bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]); - bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]); - bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]); - bf1[44] = bf0[44]; - bf1[45] = bf0[45]; - bf1[46] = bf0[46]; - bf1[47] = bf0[47]; - bf1[48] = bf0[48]; - bf1[49] = bf0[49]; - bf1[50] = bf0[50]; - 
bf1[51] = bf0[51]; - bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]); - bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]); - bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]); - bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]); - bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]); - bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]); - bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]); - bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]); - bf1[60] = bf0[60]; - bf1[61] = bf0[61]; - bf1[62] = bf0[62]; - bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = -bf0[2] + bf0[1]; - bf1[3] = -bf0[3] + bf0[0]; - bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]); - bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + bf0[10]; - bf1[10] = -bf0[10] + bf0[9]; - bf1[11] = -bf0[11] + bf0[8]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[14] + bf0[13]; - bf1[15] = bf0[15] + bf0[12]; - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]); - bf1[22] = bf0[22]; - bf1[23] = bf0[23]; - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[48], bf0[27], 
-cospi[16], bf0[20], cos_bit[stage]); - bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]); - bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]); - bf1[30] = bf0[30]; - bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[39]; - bf1[33] = bf0[33] + bf0[38]; - bf1[34] = bf0[34] + bf0[37]; - bf1[35] = bf0[35] + bf0[36]; - bf1[36] = -bf0[36] + bf0[35]; - bf1[37] = -bf0[37] + bf0[34]; - bf1[38] = -bf0[38] + bf0[33]; - bf1[39] = -bf0[39] + bf0[32]; - bf1[40] = -bf0[40] + bf0[47]; - bf1[41] = -bf0[41] + bf0[46]; - bf1[42] = -bf0[42] + bf0[45]; - bf1[43] = -bf0[43] + bf0[44]; - bf1[44] = bf0[44] + bf0[43]; - bf1[45] = bf0[45] + bf0[42]; - bf1[46] = bf0[46] + bf0[41]; - bf1[47] = bf0[47] + bf0[40]; - bf1[48] = bf0[48] + bf0[55]; - bf1[49] = bf0[49] + bf0[54]; - bf1[50] = bf0[50] + bf0[53]; - bf1[51] = bf0[51] + bf0[52]; - bf1[52] = -bf0[52] + bf0[51]; - bf1[53] = -bf0[53] + bf0[50]; - bf1[54] = -bf0[54] + bf0[49]; - bf1[55] = -bf0[55] + bf0[48]; - bf1[56] = -bf0[56] + bf0[63]; - bf1[57] = -bf0[57] + bf0[62]; - bf1[58] = -bf0[58] + bf0[61]; - bf1[59] = -bf0[59] + bf0[60]; - bf1[60] = bf0[60] + bf0[59]; - bf1[61] = bf0[61] + bf0[58]; - bf1[62] = bf0[62] + bf0[57]; - bf1[63] = bf0[63] + bf0[56]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = -bf0[5] + bf0[4]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[7] + bf0[6]; - bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); - bf1[11] 
= bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]); - bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[19]; - bf1[17] = bf0[17] + bf0[18]; - bf1[18] = -bf0[18] + bf0[17]; - bf1[19] = -bf0[19] + bf0[16]; - bf1[20] = -bf0[20] + bf0[23]; - bf1[21] = -bf0[21] + bf0[22]; - bf1[22] = bf0[22] + bf0[21]; - bf1[23] = bf0[23] + bf0[20]; - bf1[24] = bf0[24] + bf0[27]; - bf1[25] = bf0[25] + bf0[26]; - bf1[26] = -bf0[26] + bf0[25]; - bf1[27] = -bf0[27] + bf0[24]; - bf1[28] = -bf0[28] + bf0[31]; - bf1[29] = -bf0[29] + bf0[30]; - bf1[30] = bf0[30] + bf0[29]; - bf1[31] = bf0[31] + bf0[28]; - bf1[32] = bf0[32]; - bf1[33] = bf0[33]; - bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]); - bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]); - bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]); - bf1[38] = bf0[38]; - bf1[39] = bf0[39]; - bf1[40] = bf0[40]; - bf1[41] = bf0[41]; - bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]); - bf1[46] = bf0[46]; - bf1[47] = bf0[47]; - bf1[48] = bf0[48]; - bf1[49] = bf0[49]; - bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]); - bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]); - bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]); - bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]); - bf1[54] = bf0[54]; - bf1[55] = bf0[55]; - bf1[56] = bf0[56]; - bf1[57] = bf0[57]; - bf1[58] = half_btf(cospi[56], bf0[58], 
-cospi[8], bf0[37], cos_bit[stage]); - bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]); - bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]); - bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]); - bf1[62] = bf0[62]; - bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]); - bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] = -bf0[9] + bf0[8]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[11] + bf0[10]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = -bf0[13] + bf0[12]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[15] + bf0[14]; - bf1[16] = bf0[16]; - bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]); - bf1[19] = bf0[19]; - bf1[20] = bf0[20]; - bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]); - bf1[23] = bf0[23]; - bf1[24] = bf0[24]; - bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]); - bf1[27] = bf0[27]; - bf1[28] = bf0[28]; - bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]); - bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]); - bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[35]; - bf1[33] = bf0[33] + bf0[34]; - bf1[34] = -bf0[34] + bf0[33]; - bf1[35] = -bf0[35] + 
bf0[32]; - bf1[36] = -bf0[36] + bf0[39]; - bf1[37] = -bf0[37] + bf0[38]; - bf1[38] = bf0[38] + bf0[37]; - bf1[39] = bf0[39] + bf0[36]; - bf1[40] = bf0[40] + bf0[43]; - bf1[41] = bf0[41] + bf0[42]; - bf1[42] = -bf0[42] + bf0[41]; - bf1[43] = -bf0[43] + bf0[40]; - bf1[44] = -bf0[44] + bf0[47]; - bf1[45] = -bf0[45] + bf0[46]; - bf1[46] = bf0[46] + bf0[45]; - bf1[47] = bf0[47] + bf0[44]; - bf1[48] = bf0[48] + bf0[51]; - bf1[49] = bf0[49] + bf0[50]; - bf1[50] = -bf0[50] + bf0[49]; - bf1[51] = -bf0[51] + bf0[48]; - bf1[52] = -bf0[52] + bf0[55]; - bf1[53] = -bf0[53] + bf0[54]; - bf1[54] = bf0[54] + bf0[53]; - bf1[55] = bf0[55] + bf0[52]; - bf1[56] = bf0[56] + bf0[59]; - bf1[57] = bf0[57] + bf0[58]; - bf1[58] = -bf0[58] + bf0[57]; - bf1[59] = -bf0[59] + bf0[56]; - bf1[60] = -bf0[60] + bf0[63]; - bf1[61] = -bf0[61] + bf0[62]; - bf1[62] = bf0[62] + bf0[61]; - bf1[63] = bf0[63] + bf0[60]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 8 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]); - bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]); - bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]); - bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]); - bf1[16] = bf0[16] + bf0[17]; - bf1[17] = -bf0[17] + bf0[16]; - bf1[18] = -bf0[18] + bf0[19]; - bf1[19] = bf0[19] + bf0[18]; - bf1[20] = bf0[20] + bf0[21]; - bf1[21] = -bf0[21] + 
bf0[20]; - bf1[22] = -bf0[22] + bf0[23]; - bf1[23] = bf0[23] + bf0[22]; - bf1[24] = bf0[24] + bf0[25]; - bf1[25] = -bf0[25] + bf0[24]; - bf1[26] = -bf0[26] + bf0[27]; - bf1[27] = bf0[27] + bf0[26]; - bf1[28] = bf0[28] + bf0[29]; - bf1[29] = -bf0[29] + bf0[28]; - bf1[30] = -bf0[30] + bf0[31]; - bf1[31] = bf0[31] + bf0[30]; - bf1[32] = bf0[32]; - bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]); - bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]); - bf1[35] = bf0[35]; - bf1[36] = bf0[36]; - bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]); - bf1[39] = bf0[39]; - bf1[40] = bf0[40]; - bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]); - bf1[43] = bf0[43]; - bf1[44] = bf0[44]; - bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]); - bf1[47] = bf0[47]; - bf1[48] = bf0[48]; - bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]); - bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]); - bf1[51] = bf0[51]; - bf1[52] = bf0[52]; - bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]); - bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]); - bf1[55] = bf0[55]; - bf1[56] = bf0[56]; - bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]); - bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]); - bf1[59] = bf0[59]; - bf1[60] = bf0[60]; - bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]); - bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]); - bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 9 - 
stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]); - bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]); - bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]); - bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]); - bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]); - bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]); - bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]); - bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]); - bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]); - bf1[32] = bf0[32] + bf0[33]; - bf1[33] = -bf0[33] + bf0[32]; - bf1[34] = -bf0[34] + bf0[35]; - bf1[35] = bf0[35] + bf0[34]; - bf1[36] = bf0[36] + bf0[37]; - bf1[37] = -bf0[37] + bf0[36]; - bf1[38] = -bf0[38] + bf0[39]; - bf1[39] = bf0[39] + bf0[38]; - bf1[40] = bf0[40] + bf0[41]; - bf1[41] = -bf0[41] + bf0[40]; - bf1[42] = -bf0[42] + bf0[43]; - 
bf1[43] = bf0[43] + bf0[42]; - bf1[44] = bf0[44] + bf0[45]; - bf1[45] = -bf0[45] + bf0[44]; - bf1[46] = -bf0[46] + bf0[47]; - bf1[47] = bf0[47] + bf0[46]; - bf1[48] = bf0[48] + bf0[49]; - bf1[49] = -bf0[49] + bf0[48]; - bf1[50] = -bf0[50] + bf0[51]; - bf1[51] = bf0[51] + bf0[50]; - bf1[52] = bf0[52] + bf0[53]; - bf1[53] = -bf0[53] + bf0[52]; - bf1[54] = -bf0[54] + bf0[55]; - bf1[55] = bf0[55] + bf0[54]; - bf1[56] = bf0[56] + bf0[57]; - bf1[57] = -bf0[57] + bf0[56]; - bf1[58] = -bf0[58] + bf0[59]; - bf1[59] = bf0[59] + bf0[58]; - bf1[60] = bf0[60] + bf0[61]; - bf1[61] = -bf0[61] + bf0[60]; - bf1[62] = -bf0[62] + bf0[63]; - bf1[63] = bf0[63] + bf0[62]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 10 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = bf0[20]; - bf1[21] = bf0[21]; - bf1[22] = bf0[22]; - bf1[23] = bf0[23]; - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = bf0[26]; - bf1[27] = bf0[27]; - bf1[28] = bf0[28]; - bf1[29] = bf0[29]; - bf1[30] = bf0[30]; - bf1[31] = bf0[31]; - bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]); - bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]); - bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]); - bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]); - bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], 
cos_bit[stage]); - bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]); - bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]); - bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]); - bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]); - bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]); - bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]); - bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]); - bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]); - bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]); - bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]); - bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]); - bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]); - bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]); - bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]); - bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]); - bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]); - bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]); - bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]); - bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]); - range_check(stage, 
input, bf1, size, stage_range[stage]); - - // stage 11 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = step; - bf1 = output; - bf1[0] = bf0[0]; - bf1[1] = bf0[32]; - bf1[2] = bf0[16]; - bf1[3] = bf0[48]; - bf1[4] = bf0[8]; - bf1[5] = bf0[40]; - bf1[6] = bf0[24]; - bf1[7] = bf0[56]; - bf1[8] = bf0[4]; - bf1[9] = bf0[36]; - bf1[10] = bf0[20]; - bf1[11] = bf0[52]; - bf1[12] = bf0[12]; - bf1[13] = bf0[44]; - bf1[14] = bf0[28]; - bf1[15] = bf0[60]; - bf1[16] = bf0[2]; - bf1[17] = bf0[34]; - bf1[18] = bf0[18]; - bf1[19] = bf0[50]; - bf1[20] = bf0[10]; - bf1[21] = bf0[42]; - bf1[22] = bf0[26]; - bf1[23] = bf0[58]; - bf1[24] = bf0[6]; - bf1[25] = bf0[38]; - bf1[26] = bf0[22]; - bf1[27] = bf0[54]; - bf1[28] = bf0[14]; - bf1[29] = bf0[46]; - bf1[30] = bf0[30]; - bf1[31] = bf0[62]; - bf1[32] = bf0[1]; - bf1[33] = bf0[33]; - bf1[34] = bf0[17]; - bf1[35] = bf0[49]; - bf1[36] = bf0[9]; - bf1[37] = bf0[41]; - bf1[38] = bf0[25]; - bf1[39] = bf0[57]; - bf1[40] = bf0[5]; - bf1[41] = bf0[37]; - bf1[42] = bf0[21]; - bf1[43] = bf0[53]; - bf1[44] = bf0[13]; - bf1[45] = bf0[45]; - bf1[46] = bf0[29]; - bf1[47] = bf0[61]; - bf1[48] = bf0[3]; - bf1[49] = bf0[35]; - bf1[50] = bf0[19]; - bf1[51] = bf0[51]; - bf1[52] = bf0[11]; - bf1[53] = bf0[43]; - bf1[54] = bf0[27]; - bf1[55] = bf0[59]; - bf1[56] = bf0[7]; - bf1[57] = bf0[39]; - bf1[58] = bf0[23]; - bf1[59] = bf0[55]; - bf1[60] = bf0[15]; - bf1[61] = bf0[47]; - bf1[62] = bf0[31]; - bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); -} -#endif // CONFIG_TX64X64 diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.h b/third_party/aom/av1/common/av1_fwd_txfm1d.h deleted file mode 100644 index f880239f7..000000000 --- a/third_party/aom/av1/common/av1_fwd_txfm1d.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_FWD_TXFM1D_H_ -#define AV1_FWD_TXFM1D_H_ - -#include "av1/common/av1_txfm.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, - const int8_t *stage_range); -void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, - const int8_t *stage_range); -void av1_fdct16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fdct32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#if CONFIG_TX64X64 -void av1_fdct64_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#endif // CONFIG_TX64X64 - -void av1_fadst4_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst8_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#if CONFIG_EXT_TX -void av1_fidentity4_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fidentity8_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fidentity16_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fidentity32_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const 
int8_t *stage_range); -#if CONFIG_TX64X64 -void av1_fidentity64_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX - -#ifdef __cplusplus -} -#endif - -#endif // AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h deleted file mode 100644 index f2ed93151..000000000 --- a/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AV1_FWD_TXFM2D_CFG_H_ -#define AV1_FWD_TXFM2D_CFG_H_ -#include "av1/common/enums.h" -#include "av1/common/av1_fwd_txfm1d.h" - -// ---------------- 4x4 1D constants ----------------------- -// shift -static const int8_t fwd_shift_4[3] = { 2, 0, 0 }; - -// stage range -static const int8_t fwd_stage_range_col_dct_4[4] = { 0, 1, 2, 2 }; -static const int8_t fwd_stage_range_row_dct_4[4] = { 2, 3, 3, 3 }; -static const int8_t fwd_stage_range_col_adst_4[6] = { 0, 0, 1, 2, 2, 2 }; -static const int8_t fwd_stage_range_row_adst_4[6] = { 2, 2, 2, 3, 3, 3 }; -static const int8_t fwd_stage_range_idx_4[1] = { 0 }; - -// cos bit -static const int8_t fwd_cos_bit_col_dct_4[4] = { 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_row_dct_4[4] = { 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_col_adst_4[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_row_adst_4[6] = { 13, 13, 13, 13, 13, 13 }; - -// ---------------- 8x8 1D constants ----------------------- -// shift -static const int8_t fwd_shift_8[3] = { 2, -1, 0 }; - -// stage range -static const int8_t fwd_stage_range_col_dct_8[6] = { 0, 1, 2, 3, 3, 3 }; -static const int8_t fwd_stage_range_row_dct_8[6] = { 3, 4, 5, 5, 5, 5 }; -static const int8_t fwd_stage_range_col_adst_8[8] = { 0, 0, 1, 2, 2, 3, 3, 3 }; -static const int8_t fwd_stage_range_row_adst_8[8] = { 3, 3, 3, 4, 4, 5, 5, 5 }; -static const int8_t fwd_stage_range_idx_8[1] = { 0 }; - -// cos bit -static const int8_t fwd_cos_bit_col_dct_8[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_row_dct_8[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_col_adst_8[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; -static const int8_t fwd_cos_bit_row_adst_8[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; - -// ---------------- 16x16 1D constants ----------------------- -// shift -static const int8_t fwd_shift_16[3] = { 2, -2, 0 }; - -// stage range -static const int8_t fwd_stage_range_col_dct_16[8] = { 0, 1, 2, 3, 4, 4, 
4, 4 }; -static const int8_t fwd_stage_range_row_dct_16[8] = { 4, 5, 6, 7, 7, 7, 7, 7 }; -static const int8_t fwd_stage_range_col_adst_16[10] = { 0, 0, 1, 2, 2, - 3, 3, 4, 4, 4 }; -static const int8_t fwd_stage_range_row_adst_16[10] = { - 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, -}; -static const int8_t fwd_stage_range_idx_16[1] = { 0 }; - -// cos bit -static const int8_t fwd_cos_bit_col_dct_16[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; -static const int8_t fwd_cos_bit_row_dct_16[8] = { - 12, 12, 12, 12, 12, 12, 12, 12 -}; -static const int8_t fwd_cos_bit_col_adst_16[10] = { 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_row_adst_16[10] = { 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12 }; - -// ---------------- 32x32 1D constants ----------------------- -// shift -static const int8_t fwd_shift_32[3] = { 2, -4, 0 }; - -// stage range -static const int8_t fwd_stage_range_col_dct_32[10] = { 0, 1, 2, 3, 4, - 5, 5, 5, 5, 5 }; -static const int8_t fwd_stage_range_row_dct_32[10] = { 5, 6, 7, 8, 9, - 9, 9, 9, 9, 9 }; -static const int8_t fwd_stage_range_col_adst_32[12] = { 0, 0, 1, 2, 2, 3, - 3, 4, 4, 5, 5, 5 }; -static const int8_t fwd_stage_range_row_adst_32[12] = { 5, 5, 5, 6, 6, 7, - 7, 8, 8, 9, 9, 9 }; -static const int8_t fwd_stage_range_idx_32[1] = { 0 }; - -// cos bit -static const int8_t fwd_cos_bit_col_dct_32[10] = { 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12 }; -static const int8_t fwd_cos_bit_row_dct_32[10] = { 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12 }; -static const int8_t fwd_cos_bit_col_adst_32[12] = { 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12 }; -static const int8_t fwd_cos_bit_row_adst_32[12] = { 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12 }; - -// ---------------- 64x64 1D constants ----------------------- -// shift -static const int8_t fwd_shift_64[3] = { 0, -2, -2 }; - -// stage range -static const int8_t fwd_stage_range_col_dct_64[12] = { 0, 1, 2, 3, 4, 5, - 6, 6, 6, 6, 6, 6 }; -static const int8_t 
fwd_stage_range_row_dct_64[12] = { 6, 7, 8, 9, 10, 11, - 11, 11, 11, 11, 11, 11 }; -static const int8_t fwd_stage_range_idx_64[1] = { 0 }; - -// cos bit -static const int8_t fwd_cos_bit_col_dct_64[12] = { 15, 15, 15, 15, 15, 14, - 13, 13, 13, 13, 13, 13 }; -static const int8_t fwd_cos_bit_row_dct_64[12] = { 15, 14, 13, 12, 11, 10, - 10, 10, 10, 10, 10, 10 }; - -// ---------------- row config fwd_dct_4 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_4 = { - 4, // .txfm_size - 4, // .stage_num - // 0, // .log_scale - fwd_shift_4, // .shift - fwd_stage_range_row_dct_4, // .stage_range - fwd_cos_bit_row_dct_4, // .cos_bit - TXFM_TYPE_DCT4 // .txfm_type -}; - -// ---------------- row config fwd_dct_8 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_8 = { - 8, // .txfm_size - 6, // .stage_num - // 0, // .log_scale - fwd_shift_8, // .shift - fwd_stage_range_row_dct_8, // .stage_range - fwd_cos_bit_row_dct_8, // .cos_bit_ - TXFM_TYPE_DCT8 // .txfm_type -}; -// ---------------- row config fwd_dct_16 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_16 = { - 16, // .txfm_size - 8, // .stage_num - // 0, // .log_scale - fwd_shift_16, // .shift - fwd_stage_range_row_dct_16, // .stage_range - fwd_cos_bit_row_dct_16, // .cos_bit - TXFM_TYPE_DCT16 // .txfm_type -}; - -// ---------------- row config fwd_dct_32 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_32 = { - 32, // .txfm_size - 10, // .stage_num - // 1, // .log_scale - fwd_shift_32, // .shift - fwd_stage_range_row_dct_32, // .stage_range - fwd_cos_bit_row_dct_32, // .cos_bit_row - TXFM_TYPE_DCT32 // .txfm_type -}; - -// ---------------- row config fwd_dct_64 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_64 = { - 64, // .txfm_size - 12, // .stage_num - fwd_shift_64, // .shift - fwd_stage_range_row_dct_64, // .stage_range - fwd_cos_bit_row_dct_64, // .cos_bit - TXFM_TYPE_DCT64, // .txfm_type_col -}; - -// ---------------- row config 
fwd_adst_4 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_4 = { - 4, // .txfm_size - 6, // .stage_num - // 0, // .log_scale - fwd_shift_4, // .shift - fwd_stage_range_row_adst_4, // .stage_range - fwd_cos_bit_row_adst_4, // .cos_bit - TXFM_TYPE_ADST4, // .txfm_type -}; - -// ---------------- row config fwd_adst_8 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_8 = { - 8, // .txfm_size - 8, // .stage_num - // 0, // .log_scale - fwd_shift_8, // .shift - fwd_stage_range_row_adst_8, // .stage_range - fwd_cos_bit_row_adst_8, // .cos_bit - TXFM_TYPE_ADST8, // .txfm_type_col -}; - -// ---------------- row config fwd_adst_16 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_16 = { - 16, // .txfm_size - 10, // .stage_num - // 0, // .log_scale - fwd_shift_16, // .shift - fwd_stage_range_row_adst_16, // .stage_range - fwd_cos_bit_row_adst_16, // .cos_bit - TXFM_TYPE_ADST16, // .txfm_type -}; - -// ---------------- row config fwd_adst_32 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_32 = { - 32, // .txfm_size - 12, // .stage_num - // 1, // .log_scale - fwd_shift_32, // .shift - fwd_stage_range_row_adst_32, // .stage_range - fwd_cos_bit_row_adst_32, // .cos_bit - TXFM_TYPE_ADST32, // .txfm_type -}; - -// ---------------- col config fwd_dct_4 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_4 = { - 4, // .txfm_size - 4, // .stage_num - // 0, // .log_scale - fwd_shift_4, // .shift - fwd_stage_range_col_dct_4, // .stage_range - fwd_cos_bit_col_dct_4, // .cos_bit - TXFM_TYPE_DCT4 // .txfm_type -}; - -// ---------------- col config fwd_dct_8 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_8 = { - 8, // .txfm_size - 6, // .stage_num - // 0, // .log_scale - fwd_shift_8, // .shift - fwd_stage_range_col_dct_8, // .stage_range - fwd_cos_bit_col_dct_8, // .cos_bit_ - TXFM_TYPE_DCT8 // .txfm_type -}; -// ---------------- col config fwd_dct_16 ---------------- -static const 
TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_16 = { - 16, // .txfm_size - 8, // .stage_num - // 0, // .log_scale - fwd_shift_16, // .shift - fwd_stage_range_col_dct_16, // .stage_range - fwd_cos_bit_col_dct_16, // .cos_bit - TXFM_TYPE_DCT16 // .txfm_type -}; - -// ---------------- col config fwd_dct_32 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_32 = { - 32, // .txfm_size - 10, // .stage_num - // 1, // .log_scale - fwd_shift_32, // .shift - fwd_stage_range_col_dct_32, // .stage_range - fwd_cos_bit_col_dct_32, // .cos_bit_col - TXFM_TYPE_DCT32 // .txfm_type -}; - -// ---------------- col config fwd_dct_64 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_64 = { - 64, // .txfm_size - 12, // .stage_num - fwd_shift_64, // .shift - fwd_stage_range_col_dct_64, // .stage_range - fwd_cos_bit_col_dct_64, // .cos_bit - TXFM_TYPE_DCT64, // .txfm_type_col -}; - -// ---------------- col config fwd_adst_4 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_4 = { - 4, // .txfm_size - 6, // .stage_num - // 0, // .log_scale - fwd_shift_4, // .shift - fwd_stage_range_col_adst_4, // .stage_range - fwd_cos_bit_col_adst_4, // .cos_bit - TXFM_TYPE_ADST4, // .txfm_type -}; - -// ---------------- col config fwd_adst_8 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_8 = { - 8, // .txfm_size - 8, // .stage_num - // 0, // .log_scale - fwd_shift_8, // .shift - fwd_stage_range_col_adst_8, // .stage_range - fwd_cos_bit_col_adst_8, // .cos_bit - TXFM_TYPE_ADST8, // .txfm_type_col -}; - -// ---------------- col config fwd_adst_16 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_16 = { - 16, // .txfm_size - 10, // .stage_num - // 0, // .log_scale - fwd_shift_16, // .shift - fwd_stage_range_col_adst_16, // .stage_range - fwd_cos_bit_col_adst_16, // .cos_bit - TXFM_TYPE_ADST16, // .txfm_type -}; - -// ---------------- col config fwd_adst_32 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_32 = { 
- 32, // .txfm_size - 12, // .stage_num - // 1, // .log_scale - fwd_shift_32, // .shift - fwd_stage_range_col_adst_32, // .stage_range - fwd_cos_bit_col_adst_32, // .cos_bit - TXFM_TYPE_ADST32, // .txfm_type -}; - -#if CONFIG_EXT_TX -// identity does not need to differentiate between row and col -// ---------------- row/col config fwd_identity_4 ---------- -static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_4 = { - 4, // .txfm_size - 1, // .stage_num - // 0, // .log_scale - fwd_shift_4, // .shift - fwd_stage_range_idx_4, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY4, // .txfm_type -}; - -// ---------------- row/col config fwd_identity_8 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_8 = { - 8, // .txfm_size - 1, // .stage_num - // 0, // .log_scale - fwd_shift_8, // .shift - fwd_stage_range_idx_8, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY8, // .txfm_type -}; - -// ---------------- row/col config fwd_identity_16 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_16 = { - 16, // .txfm_size - 1, // .stage_num - // 0, // .log_scale - fwd_shift_16, // .shift - fwd_stage_range_idx_16, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY16, // .txfm_type -}; - -// ---------------- row/col config fwd_identity_32 ---------------- -static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_32 = { - 32, // .txfm_size - 1, // .stage_num - // 1, // .log_scale - fwd_shift_32, // .shift - fwd_stage_range_idx_32, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY32, // .txfm_type -}; -#endif // CONFIG_EXT_TX -#endif // AV1_FWD_TXFM2D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_fwd_txfm2d.c b/third_party/aom/av1/common/av1_fwd_txfm2d.c deleted file mode 100644 index 740c63322..000000000 --- a/third_party/aom/av1/common/av1_fwd_txfm2d.c +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "av1/common/enums.h" -#include "av1/common/av1_fwd_txfm1d.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" -#include "av1/common/av1_txfm.h" - -static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { - switch (txfm_type) { - case TXFM_TYPE_DCT4: return av1_fdct4_new; - case TXFM_TYPE_DCT8: return av1_fdct8_new; - case TXFM_TYPE_DCT16: return av1_fdct16_new; - case TXFM_TYPE_DCT32: return av1_fdct32_new; -#if CONFIG_TX64X64 - case TXFM_TYPE_DCT64: return av1_fdct64_new; -#endif // CONFIG_TX64X64 - case TXFM_TYPE_ADST4: return av1_fadst4_new; - case TXFM_TYPE_ADST8: return av1_fadst8_new; - case TXFM_TYPE_ADST16: return av1_fadst16_new; - case TXFM_TYPE_ADST32: return av1_fadst32_new; -#if CONFIG_EXT_TX - case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; - case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; - case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; - case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; -#if CONFIG_TX64X64 - case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c; -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX - default: assert(0); return NULL; - } -} - -void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, - const TXFM_2D_FLIP_CFG *cfg, int bd) { - // Note when assigning txfm_size_col, we use the txfm_size from the - // row configuration and vice versa. This is intentionally done to - // accurately perform rectangular transforms. 
When the transform is - // rectangular, the number of columns will be the same as the - // txfm_size stored in the row cfg struct. It will make no difference - // for square transforms. - const int txfm_size_col = cfg->row_cfg->txfm_size; - const int txfm_size_row = cfg->col_cfg->txfm_size; - // Take the shift from the larger dimension in the rectangular case. - const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift - : cfg->col_cfg->shift; - // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning - for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) { - stage_range_col[i] = cfg->col_cfg->stage_range[i] + shift[0] + bd + 1; - } - - // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning - for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) { - stage_range_row[i] = - cfg->row_cfg->stage_range[i] + shift[0] + shift[1] + bd + 1; - } -} - -static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, - const int stride, const TXFM_2D_FLIP_CFG *cfg, - int32_t *buf, int bd) { - int c, r; - // Note when assigning txfm_size_col, we use the txfm_size from the - // row configuration and vice versa. This is intentionally done to - // accurately perform rectangular transforms. When the transform is - // rectangular, the number of columns will be the same as the - // txfm_size stored in the row cfg struct. It will make no difference - // for square transforms. - const int txfm_size_col = cfg->row_cfg->txfm_size; - const int txfm_size_row = cfg->col_cfg->txfm_size; - // Take the shift from the larger dimension in the rectangular case. - const int8_t *shift = (txfm_size_col > txfm_size_row) ? 
cfg->row_cfg->shift - : cfg->col_cfg->shift; - int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; - int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; - assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM); - assert(cfg->row_cfg->stage_num <= MAX_TXFM_STAGE_NUM); - av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); - - const int8_t *cos_bit_col = cfg->col_cfg->cos_bit; - const int8_t *cos_bit_row = cfg->row_cfg->cos_bit; - const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->col_cfg->txfm_type); - const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->row_cfg->txfm_type); - - // use output buffer as temp buffer - int32_t *temp_in = output; - int32_t *temp_out = output + txfm_size_row; - - // Columns - for (c = 0; c < txfm_size_col; ++c) { - if (cfg->ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; - } else { - for (r = 0; r < txfm_size_row; ++r) - // flip upside down - temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; - } - round_shift_array(temp_in, txfm_size_row, -shift[0]); - // Multiply everything by Sqrt2 on the larger dimension if the - // transform is rectangular - if (txfm_size_col > txfm_size_row) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2); - } - txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); - round_shift_array(temp_out, txfm_size_row, -shift[1]); - if (cfg->lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - buf[r * txfm_size_col + c] = temp_out[r]; - } else { - for (r = 0; r < txfm_size_row; ++r) - // flip from left to right - buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; - } - } - - // Rows - for (r = 0; r < txfm_size_row; ++r) { - // Multiply everything by Sqrt2 on the larger dimension if the - // transform is rectangular - if (txfm_size_row > txfm_size_col) { - for (c = 0; c < txfm_size_col; ++c) - buf[r * txfm_size_col + c] = - (int32_t)fdct_round_shift(buf[r * txfm_size_col + c] * Sqrt2); - } 
- txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, - cos_bit_row, stage_range_row); - round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]); - } -} - -void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int32_t txfm_buf[4 * 8]; - int16_t rinput[4 * 8]; - TX_SIZE tx_size = TX_4X8; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int16(rinput, rw, input, stride, w, h); - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size); - fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd); - transpose_int32(output, w, txfm_buf, rw, rw, rh); -#else - int32_t txfm_buf[4 * 8]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X8); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -#endif -} - -void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[8 * 4]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X4); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int32_t txfm_buf[8 * 16]; - int16_t rinput[8 * 16]; - TX_SIZE tx_size = TX_8X16; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int16(rinput, rw, input, stride, w, h); - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size); - fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd); - transpose_int32(output, w, txfm_buf, rw, rw, rh); -#else - int32_t txfm_buf[8 * 16]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X16); - fwd_txfm2d_c(input, 
output, stride, &cfg, txfm_buf, bd); -#endif -} - -void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[16 * 8]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X8); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int32_t txfm_buf[16 * 32]; - int16_t rinput[16 * 32]; - TX_SIZE tx_size = TX_16X32; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int16(rinput, rw, input, stride, w, h); - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size); - fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd); - transpose_int32(output, w, txfm_buf, rw, rw, rh); -#else - int32_t txfm_buf[16 * 32]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X32); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -#endif -} - -void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[32 * 16]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X16); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[4 * 4]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X4); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[8 * 8]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X8); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, - 
TX_TYPE tx_type, int bd) { - int32_t txfm_buf[16 * 16]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X16); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[32 * 32]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -#if CONFIG_TX64X64 -void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[64 * 64]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[32 * 64]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_32x64_cfg(tx_type); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} - -void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, - TX_TYPE tx_type, int bd) { - int32_t txfm_buf[64 * 32]; - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x32_cfg(tx_type); - fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); -} -#endif // CONFIG_TX64X64 - -static const TXFM_1D_CFG *fwd_txfm_col_cfg_ls[TX_TYPES_1D][TX_SIZES] = { - // DCT - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_col_cfg_dct_4, &fwd_txfm_1d_col_cfg_dct_8, - &fwd_txfm_1d_col_cfg_dct_16, &fwd_txfm_1d_col_cfg_dct_32 }, - // ADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_col_cfg_adst_4, &fwd_txfm_1d_col_cfg_adst_8, - &fwd_txfm_1d_col_cfg_adst_16, &fwd_txfm_1d_col_cfg_adst_32 }, -#if CONFIG_EXT_TX - // FLIPADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_col_cfg_adst_4, &fwd_txfm_1d_col_cfg_adst_8, - &fwd_txfm_1d_col_cfg_adst_16, &fwd_txfm_1d_col_cfg_adst_32 }, - // IDENTITY - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - 
&fwd_txfm_1d_cfg_identity_4, &fwd_txfm_1d_cfg_identity_8, - &fwd_txfm_1d_cfg_identity_16, &fwd_txfm_1d_cfg_identity_32 }, -#endif // CONFIG_EXT_TX -}; - -static const TXFM_1D_CFG *fwd_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES] = { - // DCT - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_row_cfg_dct_4, &fwd_txfm_1d_row_cfg_dct_8, - &fwd_txfm_1d_row_cfg_dct_16, &fwd_txfm_1d_row_cfg_dct_32 }, - // ADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_row_cfg_adst_4, &fwd_txfm_1d_row_cfg_adst_8, - &fwd_txfm_1d_row_cfg_adst_16, &fwd_txfm_1d_row_cfg_adst_32 }, -#if CONFIG_EXT_TX - // FLIPADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_row_cfg_adst_4, &fwd_txfm_1d_row_cfg_adst_8, - &fwd_txfm_1d_row_cfg_adst_16, &fwd_txfm_1d_row_cfg_adst_32 }, - // IDENTITY - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &fwd_txfm_1d_cfg_identity_4, &fwd_txfm_1d_cfg_identity_8, - &fwd_txfm_1d_cfg_identity_16, &fwd_txfm_1d_cfg_identity_32 }, -#endif // CONFIG_EXT_TX -}; - -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size) { - TXFM_2D_FLIP_CFG cfg; - set_flip_cfg(tx_type, &cfg); - const TX_TYPE_1D tx_type_col = vtx_tab[tx_type]; - const TX_TYPE_1D tx_type_row = htx_tab[tx_type]; - const TX_SIZE tx_size_col = txsize_vert_map[tx_size]; - const TX_SIZE tx_size_row = txsize_horz_map[tx_size]; - cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size_col]; - cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size_row]; - return cfg; -} - -#if CONFIG_TX64X64 -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_32x64_cfg(TX_TYPE tx_type) { - TXFM_2D_FLIP_CFG cfg; - const TX_TYPE_1D tx_type_row = htx_tab[tx_type]; - const TX_SIZE tx_size_row = txsize_horz_map[TX_32X64]; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = &fwd_txfm_1d_col_cfg_dct_64; - cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size_row]; - cfg.ud_flip = 0; - cfg.lr_flip = 0; - break; - default: assert(0); - } - return cfg; -} - -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x32_cfg(TX_TYPE tx_type) { - 
TXFM_2D_FLIP_CFG cfg; - const TX_TYPE_1D tx_type_col = vtx_tab[tx_type]; - const TX_SIZE tx_size_col = txsize_vert_map[TX_64X32]; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size_col]; - cfg.row_cfg = &fwd_txfm_1d_row_cfg_dct_64; - cfg.ud_flip = 0; - cfg.lr_flip = 0; - break; - default: assert(0); - } - return cfg; -} - -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(TX_TYPE tx_type) { - TXFM_2D_FLIP_CFG cfg; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = &fwd_txfm_1d_col_cfg_dct_64; - cfg.row_cfg = &fwd_txfm_1d_row_cfg_dct_64; - cfg.ud_flip = 0; - cfg.lr_flip = 0; - break; - default: - cfg.ud_flip = 0; - cfg.lr_flip = 0; - assert(0); - } - return cfg; -} -#endif // CONFIG_TX64X64 diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c index 51f4b6362..8514dc64c 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.c +++ b/third_party/aom/av1/common/av1_inv_txfm1d.c @@ -10,28 +10,28 @@ */ #include -#include "aom_dsp/inv_txfm.h" #include "av1/common/av1_inv_txfm1d.h" -#if CONFIG_COEFFICIENT_RANGE_CHECKING -void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, - int32_t size, int8_t bit) { - const int64_t maxValue = (1LL << (bit - 1)) - 1; - const int64_t minValue = -(1LL << (bit - 1)); +static void range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); int in_range = 1; for (int i = 0; i < size; ++i) { - if (buf[i] < minValue || buf[i] > maxValue) { + if (buf[i] < min_value || buf[i] > max_value) { in_range = 0; } } if (!in_range) { fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); fprintf(stderr, "stage: %d\n", stage); - fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", minValue, - maxValue); + 
fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); fprintf(stderr, "coeffs: "); @@ -53,81 +53,73 @@ void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, } assert(in_range); -} - -#define range_check(stage, input, buf, size, bit) \ - range_check_func(stage, input, buf, size, bit) #else -#define range_check(stage, input, buf, size, bit) \ - { \ - (void)stage; \ - (void)input; \ - (void)buf; \ - (void)size; \ - (void)bit; \ - } + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; #endif +} // TODO(angiebird): Make 1-d txfm functions static -void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, +// + +void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { + assert(output != input); const int32_t size = 4; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[4]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = input[2]; bf1[2] = input[1]; bf1[3] = input[3]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = 
half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = bf0[1] - bf0[2]; - bf1[3] = bf0[0] - bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); } -void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, +void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { + assert(output != input); const int32_t size = 8; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = input[4]; @@ -137,83 +129,78 @@ void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, bf1[5] = input[5]; bf1[6] = input[3]; bf1[7] = input[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]); - bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = 
half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = bf0[4] - bf0[5]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[6] + bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = bf0[1] - bf0[2]; - bf1[3] = bf0[0] - bf0[3]; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], 
cos_bit[stage]); + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = bf0[3] - bf0[4]; - bf1[5] = bf0[2] - bf0[5]; - bf1[6] = bf0[1] - bf0[6]; - bf1[7] = bf0[0] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); } -void av1_idct16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); const int32_t size = 16; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = input[8]; @@ -231,11 +218,10 @@ void av1_idct16_new(const int32_t *input, int32_t *output, bf1[13] = input[11]; bf1[14] = input[7]; bf1[15] = input[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = 
cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; @@ -246,146 +232,140 @@ void av1_idct16_new(const int32_t *input, int32_t *output, bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]); - bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]); - bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]); - bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] 
= bf0[8] - bf0[9]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[10] + bf0[11]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = bf0[12] - bf0[13]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[14] + bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = bf0[4] - bf0[5]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[6] + bf0[7]; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], 
stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; - bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]); + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = bf0[1] - bf0[2]; - bf1[3] = bf0[0] - bf0[3]; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + bf0[10]; - bf1[10] = bf0[9] - bf0[10]; - bf1[11] = bf0[8] - bf0[11]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[13] + bf0[14]; - bf1[15] = bf0[12] + bf0[15]; - range_check(stage, input, bf1, size, 
stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = bf0[3] - bf0[4]; - bf1[5] = bf0[2] - bf0[5]; - bf1[6] = bf0[1] - bf0[6]; - bf1[7] = bf0[0] - bf0[7]; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], 
cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[15]; - bf1[1] = bf0[1] + bf0[14]; - bf1[2] = bf0[2] + bf0[13]; - bf1[3] = bf0[3] + bf0[12]; - bf1[4] = bf0[4] + bf0[11]; - bf1[5] = bf0[5] + bf0[10]; - bf1[6] = bf0[6] + bf0[9]; - bf1[7] = bf0[7] + bf0[8]; - bf1[8] = bf0[7] - bf0[8]; - bf1[9] = bf0[6] - bf0[9]; - bf1[10] = bf0[5] - bf0[10]; - bf1[11] = bf0[4] - bf0[11]; - bf1[12] = bf0[3] - bf0[12]; - bf1[13] = bf0[2] - bf0[13]; - bf1[14] = bf0[1] - bf0[14]; - bf1[15] = bf0[0] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); } -void av1_idct32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != 
input); const int32_t size = 32; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[32]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = input[16]; @@ -419,11 +399,10 @@ void av1_idct32_new(const int32_t *input, int32_t *output, bf1[29] = input[23]; bf1[30] = input[15]; bf1[31] = input[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; @@ -442,27 +421,26 @@ void av1_idct32_new(const int32_t *input, int32_t *output, bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; - bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]); - bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]); - bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]); - bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], 
cos_bit[stage]); - bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -473,572 +451,506 @@ void av1_idct32_new(const int32_t *input, int32_t *output, bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]); - bf1[13] = 
half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]); - bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]); - bf1[16] = bf0[16] + bf0[17]; - bf1[17] = bf0[16] - bf0[17]; - bf1[18] = -bf0[18] + bf0[19]; - bf1[19] = bf0[18] + bf0[19]; - bf1[20] = bf0[20] + bf0[21]; - bf1[21] = bf0[20] - bf0[21]; - bf1[22] = -bf0[22] + bf0[23]; - bf1[23] = bf0[22] + bf0[23]; - bf1[24] = bf0[24] + bf0[25]; - bf1[25] = bf0[24] - bf0[25]; - bf1[26] = -bf0[26] + bf0[27]; - bf1[27] = bf0[26] + bf0[27]; - bf1[28] = bf0[28] + bf0[29]; - bf1[29] = bf0[28] - bf0[29]; - bf1[30] = -bf0[30] + bf0[31]; - bf1[31] = bf0[30] + bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = 
clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]); - bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] = bf0[8] - bf0[9]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[10] + bf0[11]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = bf0[12] - bf0[13]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[14] + bf0[15]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; - bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], 
cos_bit[stage]); - bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]); + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; - bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]); + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; - bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]); + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; - bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]); + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = bf0[4] - bf0[5]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[6] + bf0[7]; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], 
bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; - bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]); + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[19]; - bf1[17] = bf0[17] + bf0[18]; - bf1[18] = bf0[17] - bf0[18]; - bf1[19] = bf0[16] - bf0[19]; - bf1[20] = -bf0[20] + bf0[23]; - bf1[21] = -bf0[21] + bf0[22]; - bf1[22] = bf0[21] + bf0[22]; - bf1[23] = bf0[20] + bf0[23]; - bf1[24] = bf0[24] + bf0[27]; - bf1[25] = bf0[25] + bf0[26]; - bf1[26] = bf0[25] - bf0[26]; - bf1[27] = bf0[24] - bf0[27]; - bf1[28] = -bf0[28] + bf0[31]; - bf1[29] = -bf0[29] + bf0[30]; - bf1[30] = bf0[29] + bf0[30]; - bf1[31] = bf0[28] + bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = 
clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = bf0[1] - bf0[2]; - bf1[3] = bf0[0] - bf0[3]; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + bf0[10]; - bf1[10] = bf0[9] - bf0[10]; - bf1[11] = bf0[8] - bf0[11]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[13] + bf0[14]; - bf1[15] = bf0[12] + bf0[15]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], 
stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; - bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]); + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; - bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]); - bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]); + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = bf0[3] - bf0[4]; - bf1[5] = 
bf0[2] - bf0[5]; - bf1[6] = bf0[1] - bf0[6]; - bf1[7] = bf0[0] - bf0[7]; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[23]; - bf1[17] = bf0[17] + bf0[22]; - bf1[18] = bf0[18] + bf0[21]; - bf1[19] = bf0[19] + bf0[20]; - bf1[20] = bf0[19] - bf0[20]; - bf1[21] = bf0[18] - bf0[21]; - bf1[22] = bf0[17] - bf0[22]; - bf1[23] = bf0[16] - bf0[23]; - bf1[24] = -bf0[24] + bf0[31]; - bf1[25] = -bf0[25] + bf0[30]; - bf1[26] = -bf0[26] + bf0[29]; - bf1[27] = -bf0[27] + bf0[28]; - bf1[28] = bf0[27] + bf0[28]; - bf1[29] = bf0[26] + bf0[29]; - bf1[30] = bf0[25] + bf0[30]; - bf1[31] = bf0[24] + bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = 
clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[15]; - bf1[1] = bf0[1] + bf0[14]; - bf1[2] = bf0[2] + bf0[13]; - bf1[3] = bf0[3] + bf0[12]; - bf1[4] = bf0[4] + bf0[11]; - bf1[5] = bf0[5] + bf0[10]; - bf1[6] = bf0[6] + bf0[9]; - bf1[7] = bf0[7] + bf0[8]; - bf1[8] = bf0[7] - bf0[8]; - bf1[9] = bf0[6] - bf0[9]; - bf1[10] = bf0[5] - bf0[10]; - bf1[11] = bf0[4] - bf0[11]; - bf1[12] = bf0[3] - bf0[12]; - bf1[13] = bf0[2] - bf0[13]; - bf1[14] = bf0[1] - bf0[14]; - bf1[15] = bf0[0] - bf0[15]; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + 
bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; - bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]); + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[31]; - bf1[1] = 
bf0[1] + bf0[30]; - bf1[2] = bf0[2] + bf0[29]; - bf1[3] = bf0[3] + bf0[28]; - bf1[4] = bf0[4] + bf0[27]; - bf1[5] = bf0[5] + bf0[26]; - bf1[6] = bf0[6] + bf0[25]; - bf1[7] = bf0[7] + bf0[24]; - bf1[8] = bf0[8] + bf0[23]; - bf1[9] = bf0[9] + bf0[22]; - bf1[10] = bf0[10] + bf0[21]; - bf1[11] = bf0[11] + bf0[20]; - bf1[12] = bf0[12] + bf0[19]; - bf1[13] = bf0[13] + bf0[18]; - bf1[14] = bf0[14] + bf0[17]; - bf1[15] = bf0[15] + bf0[16]; - bf1[16] = bf0[15] - bf0[16]; - bf1[17] = bf0[14] - bf0[17]; - bf1[18] = bf0[13] - bf0[18]; - bf1[19] = bf0[12] - bf0[19]; - bf1[20] = bf0[11] - bf0[20]; - bf1[21] = bf0[10] - bf0[21]; - bf1[22] = bf0[9] - bf0[22]; - bf1[23] = bf0[8] - bf0[23]; - bf1[24] = bf0[7] - bf0[24]; - bf1[25] = bf0[6] - bf0[25]; - bf1[26] = bf0[5] - bf0[26]; - bf1[27] = bf0[4] - bf0[27]; - bf1[28] = bf0[3] - bf0[28]; - bf1[29] = bf0[2] - bf0[29]; - bf1[30] = bf0[1] - bf0[30]; - bf1[31] = bf0[0] - bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = 
clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); } -void av1_iadst4_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 4; - const int32_t *cospi; +void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t s0, s1, s2, s3, s4, s5, s6, s7; - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[4]; + int32_t x0 = input[0]; + int32_t x1 = input[1]; + int32_t x2 = input[2]; + int32_t x3 = input[3]; - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } - // stage 1; - stage++; - assert(output != input); - bf1 = output; - bf1[0] = input[0]; - bf1[1] = -input[3]; - bf1[2] = -input[1]; - bf1[3] = input[2]; - range_check(stage, input, bf1, size, stage_range[stage]); + assert(sinpi[1] + sinpi[2] == sinpi[4]); + + // stage 1 + s0 = 
range_check_value(sinpi[1] * x0, stage_range[1] + bit); + s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit); + s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit); + s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit); + s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit); + s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit); + s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit); // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + // NOTICE: (x0 - x2) here may use one extra bit compared to the + // opt_range_row/col specified in av1_gen_inv_stage_range() + s7 = range_check_value((x0 - x2) + x3, stage_range[2]); // stage 3 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = bf0[0] - bf0[2]; - bf1[3] = bf0[1] - bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); + s0 = range_check_value(s0 + s3, stage_range[3] + bit); + s1 = range_check_value(s1 - s4, stage_range[3] + bit); + s3 = range_check_value(s2, stage_range[3] + bit); + s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit); // stage 4 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + s0 = range_check_value(s0 + s5, stage_range[4] + bit); + s1 = range_check_value(s1 - s6, stage_range[4] + bit); // stage 5 - stage++; - 
bf0 = step; - bf1 = output; - bf1[0] = bf0[1]; - bf1[1] = bf0[2]; - bf1[2] = bf0[3]; - bf1[3] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + x0 = range_check_value(s0 + s3, stage_range[5] + bit); + x1 = range_check_value(s1 + s3, stage_range[5] + bit); + x2 = range_check_value(s2, stage_range[5] + bit); + x3 = range_check_value(s0 + s1, stage_range[5] + bit); + + // stage 6 + x3 = range_check_value(x3 - s3, stage_range[6] + bit); + + output[0] = round_shift(x0, bit); + output[1] = round_shift(x1, bit); + output[2] = round_shift(x2, bit); + output[3] = round_shift(x3, bit); + range_check_buf(6, input, output, 4, stage_range[6]); } -void av1_iadst8_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); const int32_t size = 8; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; - bf1[0] = input[0]; - bf1[1] = -input[7]; - bf1[2] = -input[3]; - bf1[3] = input[4]; - bf1[4] = -input[1]; - bf1[5] = input[6]; - bf1[6] = input[2]; - bf1[7] = -input[5]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = input[7]; + bf1[1] = input[0]; + bf1[2] = input[5]; + bf1[3] = input[2]; + bf1[4] = input[3]; + bf1[5] = input[4]; + bf1[6] = input[1]; + bf1[7] = input[6]; + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = 
half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = bf0[0] - bf0[2]; - bf1[3] = bf0[1] - bf0[3]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = bf0[4] - bf0[6]; - bf1[7] = bf0[5] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]); - 
bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = bf0[0] - bf0[4]; - bf1[5] = bf0[1] - bf0[5]; - bf1[6] = bf0[2] - bf0[6]; - bf1[7] = bf0[3] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]); - bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]); - bf1[6] = half_btf(cospi[52], 
bf0[6], cospi[12], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[1]; - bf1[1] = bf0[6]; - bf1[2] = bf0[3]; - bf1[3] = bf0[4]; - bf1[4] = bf0[5]; - bf1[5] = bf0[2]; - bf1[6] = bf0[7]; - bf1[7] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = bf0[0]; + bf1[1] = -bf0[4]; + bf1[2] = bf0[6]; + bf1[3] = -bf0[2]; + bf1[4] = bf0[3]; + bf1[5] = -bf0[7]; + bf1[6] = bf0[5]; + bf1[7] = -bf0[1]; } -void av1_iadst16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); const int32_t size = 16; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - assert(output != input); bf1 = output; - bf1[0] = input[0]; - bf1[1] = -input[15]; - bf1[2] = -input[7]; - bf1[3] = input[8]; - bf1[4] = -input[3]; - bf1[5] = input[12]; - bf1[6] = input[4]; - bf1[7] = -input[11]; - bf1[8] = -input[1]; - bf1[9] = input[14]; - bf1[10] = input[6]; - bf1[11] = -input[9]; - bf1[12] = input[2]; - bf1[13] = -input[13]; - bf1[14] = -input[5]; - bf1[15] = input[10]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = input[15]; + bf1[1] = 
input[0]; + bf1[2] = input[13]; + bf1[3] = input[2]; + bf1[4] = input[11]; + bf1[5] = input[4]; + bf1[6] = input[9]; + bf1[7] = input[6]; + bf1[8] = input[7]; + bf1[9] = input[8]; + bf1[10] = input[5]; + bf1[11] = input[10]; + bf1[12] = input[3]; + bf1[13] = input[12]; + bf1[14] = input[1]; + bf1[15] = input[14]; + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]); - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = 
half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = bf0[0] - bf0[2]; - bf1[3] = bf0[1] - bf0[3]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = bf0[4] - bf0[6]; - bf1[7] = bf0[5] - bf0[7]; - bf1[8] = bf0[8] + bf0[10]; - bf1[9] = bf0[9] + bf0[11]; - bf1[10] = bf0[8] - bf0[10]; - bf1[11] = bf0[9] - bf0[11]; - bf1[12] = bf0[12] + bf0[14]; - bf1[13] = bf0[13] + bf0[15]; - bf1[14] = bf0[12] - bf0[14]; - bf1[15] = bf0[13] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]); + bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); + bf1[15] = 
clamp_value(bf0[7] - bf0[15], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]); - bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; - bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = bf0[0] - bf0[4]; - bf1[5] = bf0[1] - bf0[5]; - bf1[6] = bf0[2] - bf0[6]; - bf1[7] = bf0[3] - bf0[7]; - bf1[8] = bf0[8] + bf0[12]; - bf1[9] = bf0[9] + bf0[13]; - bf1[10] = bf0[10] + bf0[14]; - bf1[11] = bf0[11] + bf0[15]; - bf1[12] = bf0[8] - bf0[12]; - bf1[13] = bf0[9] - bf0[13]; - bf1[14] = bf0[10] - bf0[14]; - bf1[15] = bf0[11] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; @@ -1049,579 +961,173 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], 
cos_bit[stage]); - bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]); - bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 7 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[8]; - bf1[1] = bf0[1] + bf0[9]; - bf1[2] = bf0[2] + bf0[10]; - bf1[3] = bf0[3] + bf0[11]; - bf1[4] = bf0[4] + bf0[12]; - bf1[5] = bf0[5] + bf0[13]; - bf1[6] = bf0[6] + bf0[14]; - bf1[7] = bf0[7] + bf0[15]; - bf1[8] = bf0[0] - bf0[8]; - bf1[9] = bf0[1] - bf0[9]; - bf1[10] = bf0[2] - bf0[10]; - bf1[11] = bf0[3] - bf0[11]; - bf1[12] = bf0[4] - bf0[12]; - bf1[13] = bf0[5] - bf0[13]; - bf1[14] = bf0[6] - bf0[14]; - bf1[15] = bf0[7] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 8 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]); - bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]); - bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]); - bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]); - bf1[10] = 
half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]); - bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 9 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[1]; - bf1[1] = bf0[14]; - bf1[2] = bf0[3]; - bf1[3] = bf0[12]; - bf1[4] = bf0[5]; - bf1[5] = bf0[10]; - bf1[6] = bf0[7]; - bf1[7] = bf0[8]; - bf1[8] = bf0[9]; - bf1[9] = bf0[6]; - bf1[10] = bf0[11]; - bf1[11] = bf0[4]; - bf1[12] = bf0[13]; - bf1[13] = bf0[2]; - bf1[14] = bf0[15]; - bf1[15] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); -} - -void av1_iadst32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int32_t size = 32; - const int32_t *cospi; - - int32_t stage = 0; - int32_t *bf0, *bf1; - int32_t step[32]; - - // stage 0; - range_check(stage, input, input, size, stage_range[stage]); - - // stage 1; - stage++; - assert(output != input); - bf1 = output; - bf1[0] = input[0]; - bf1[1] = -input[31]; - bf1[2] = -input[15]; - bf1[3] = input[16]; - bf1[4] = -input[7]; - bf1[5] = input[24]; - bf1[6] = input[8]; - bf1[7] = -input[23]; - bf1[8] = -input[3]; - bf1[9] = input[28]; - bf1[10] = input[12]; - bf1[11] = -input[19]; - bf1[12] = input[4]; - bf1[13] = -input[27]; - bf1[14] = -input[11]; - bf1[15] = input[20]; - bf1[16] = -input[1]; - bf1[17] = input[30]; - bf1[18] = input[14]; - bf1[19] = -input[17]; - bf1[20] = input[6]; - bf1[21] = -input[25]; - bf1[22] = -input[9]; - bf1[23] = input[22]; - bf1[24] = input[2]; - bf1[25] = -input[29]; - bf1[26] = -input[13]; - bf1[27] = input[18]; - bf1[28] = -input[5]; - 
bf1[29] = input[26]; - bf1[30] = input[10]; - bf1[31] = -input[21]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); - // stage 2 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]); - bf1[12] = bf0[12]; - bf1[13] = bf0[13]; - bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]); - bf1[20] = bf0[20]; - bf1[21] = bf0[21]; - bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], 
cos_bit[stage]); - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]); - bf1[28] = bf0[28]; - bf1[29] = bf0[29]; - bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 3 + // stage 5 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[2]; - bf1[1] = bf0[1] + bf0[3]; - bf1[2] = bf0[0] - bf0[2]; - bf1[3] = bf0[1] - bf0[3]; - bf1[4] = bf0[4] + bf0[6]; - bf1[5] = bf0[5] + bf0[7]; - bf1[6] = bf0[4] - bf0[6]; - bf1[7] = bf0[5] - bf0[7]; - bf1[8] = bf0[8] + bf0[10]; - bf1[9] = bf0[9] + bf0[11]; - bf1[10] = bf0[8] - bf0[10]; - bf1[11] = bf0[9] - bf0[11]; - bf1[12] = bf0[12] + bf0[14]; - bf1[13] = bf0[13] + bf0[15]; - bf1[14] = bf0[12] - bf0[14]; - bf1[15] = bf0[13] - bf0[15]; - bf1[16] = bf0[16] + bf0[18]; - bf1[17] = bf0[17] + bf0[19]; - bf1[18] = bf0[16] - bf0[18]; - bf1[19] = bf0[17] - bf0[19]; - bf1[20] = bf0[20] + bf0[22]; - bf1[21] = bf0[21] + bf0[23]; - bf1[22] = bf0[20] - bf0[22]; - bf1[23] = bf0[21] - bf0[23]; - bf1[24] = bf0[24] + bf0[26]; - bf1[25] = bf0[25] + bf0[27]; - bf1[26] = bf0[24] - bf0[26]; - bf1[27] = bf0[25] - bf0[27]; - bf1[28] = bf0[28] + bf0[30]; - bf1[29] = bf0[29] + bf0[31]; - bf1[30] = bf0[28] - bf0[30]; - bf1[31] = bf0[29] - bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = 
clamp_value(bf0[3] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]); + bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); - // stage 4 + // stage 6 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]); - bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]); + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; - bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]); - bf1[22] 
= half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]); - bf1[24] = bf0[24]; - bf1[25] = bf0[25]; - bf1[26] = bf0[26]; - bf1[27] = bf0[27]; - bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 5 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[0] + bf0[4]; - bf1[1] = bf0[1] + bf0[5]; - bf1[2] = bf0[2] + bf0[6]; - bf1[3] = bf0[3] + bf0[7]; - bf1[4] = bf0[0] - bf0[4]; - bf1[5] = bf0[1] - bf0[5]; - bf1[6] = bf0[2] - bf0[6]; - bf1[7] = bf0[3] - bf0[7]; - bf1[8] = bf0[8] + bf0[12]; - bf1[9] = bf0[9] + bf0[13]; - bf1[10] = bf0[10] + bf0[14]; - bf1[11] = bf0[11] + bf0[15]; - bf1[12] = bf0[8] - bf0[12]; - bf1[13] = bf0[9] - bf0[13]; - bf1[14] = bf0[10] - bf0[14]; - bf1[15] = bf0[11] - bf0[15]; - bf1[16] = bf0[16] + bf0[20]; - bf1[17] = bf0[17] + bf0[21]; - bf1[18] = bf0[18] + bf0[22]; - bf1[19] = bf0[19] + bf0[23]; - bf1[20] = bf0[16] - bf0[20]; - bf1[21] = bf0[17] - bf0[21]; - bf1[22] = bf0[18] - bf0[22]; - bf1[23] = bf0[19] - bf0[23]; - bf1[24] = bf0[24] + bf0[28]; - bf1[25] = bf0[25] + bf0[29]; - bf1[26] = bf0[26] + bf0[30]; - bf1[27] = bf0[27] + bf0[31]; - bf1[28] = bf0[24] - bf0[28]; - bf1[29] = bf0[25] - bf0[29]; - bf1[30] = bf0[26] - bf0[30]; - bf1[31] = bf0[27] - bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 6 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], 
cos_bit[stage]); - bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]); - bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]); - bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]); - bf1[16] = bf0[16]; - bf1[17] = bf0[17]; - bf1[18] = bf0[18]; - bf1[19] = bf0[19]; - bf1[20] = bf0[20]; - bf1[21] = bf0[21]; - bf1[22] = bf0[22]; - bf1[23] = bf0[23]; - bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[8]; - bf1[1] = bf0[1] + bf0[9]; - bf1[2] = bf0[2] + bf0[10]; - bf1[3] = bf0[3] + bf0[11]; - bf1[4] = bf0[4] + bf0[12]; - bf1[5] = bf0[5] + bf0[13]; - bf1[6] = bf0[6] 
+ bf0[14]; - bf1[7] = bf0[7] + bf0[15]; - bf1[8] = bf0[0] - bf0[8]; - bf1[9] = bf0[1] - bf0[9]; - bf1[10] = bf0[2] - bf0[10]; - bf1[11] = bf0[3] - bf0[11]; - bf1[12] = bf0[4] - bf0[12]; - bf1[13] = bf0[5] - bf0[13]; - bf1[14] = bf0[6] - bf0[14]; - bf1[15] = bf0[7] - bf0[15]; - bf1[16] = bf0[16] + bf0[24]; - bf1[17] = bf0[17] + bf0[25]; - bf1[18] = bf0[18] + bf0[26]; - bf1[19] = bf0[19] + bf0[27]; - bf1[20] = bf0[20] + bf0[28]; - bf1[21] = bf0[21] + bf0[29]; - bf1[22] = bf0[22] + bf0[30]; - bf1[23] = bf0[23] + bf0[31]; - bf1[24] = bf0[16] - bf0[24]; - bf1[25] = bf0[17] - bf0[25]; - bf1[26] = bf0[18] - bf0[26]; - bf1[27] = bf0[19] - bf0[27]; - bf1[28] = bf0[20] - bf0[28]; - bf1[29] = bf0[21] - bf0[29]; - bf1[30] = bf0[22] - bf0[30]; - bf1[31] = bf0[23] - bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]); + bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); + bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = 
bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); bf1[4] = bf0[4]; bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = bf0[10]; - bf1[11] = bf0[11]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); bf1[12] = bf0[12]; bf1[13] = bf0[13]; - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]); - bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]); - bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]); - bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]); - bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]); - bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, 
stage_range[stage]); + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[16]; - bf1[1] = bf0[1] + bf0[17]; - bf1[2] = bf0[2] + bf0[18]; - bf1[3] = bf0[3] + bf0[19]; - bf1[4] = bf0[4] + bf0[20]; - bf1[5] = bf0[5] + bf0[21]; - bf1[6] = bf0[6] + bf0[22]; - bf1[7] = bf0[7] + bf0[23]; - bf1[8] = bf0[8] + bf0[24]; - bf1[9] = bf0[9] + bf0[25]; - bf1[10] = bf0[10] + bf0[26]; - bf1[11] = bf0[11] + bf0[27]; - bf1[12] = bf0[12] + bf0[28]; - bf1[13] = bf0[13] + bf0[29]; - bf1[14] = bf0[14] + bf0[30]; - bf1[15] = bf0[15] + bf0[31]; - bf1[16] = bf0[0] - bf0[16]; - bf1[17] = bf0[1] - bf0[17]; - bf1[18] = bf0[2] - bf0[18]; - bf1[19] = bf0[3] - bf0[19]; - bf1[20] = bf0[4] - bf0[20]; - bf1[21] = bf0[5] - bf0[21]; - bf1[22] = bf0[6] - bf0[22]; - bf1[23] = bf0[7] - bf0[23]; - bf1[24] = bf0[8] - bf0[24]; - bf1[25] = bf0[9] - bf0[25]; - bf1[26] = bf0[10] - bf0[26]; - bf1[27] = bf0[11] - bf0[27]; - bf1[28] = bf0[12] - bf0[28]; - bf1[29] = bf0[13] - bf0[29]; - bf1[30] = bf0[14] - bf0[30]; - bf1[31] = bf0[15] - bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 10 - stage++; - cospi = cospi_arr(cos_bit[stage]); - bf0 = output; - bf1 = step; - bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]); - bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]); - bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]); - bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]); - bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]); - bf1[8] = 
half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]); - bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]); - bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]); - bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]); - bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]); - bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]); - bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]); - bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]); - bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]); - bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]); - bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]); - bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]); - bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]); - bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]); - bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]); - bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]); - bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]); - bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]); - bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]); - bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); - - // stage 11 - stage++; - bf0 = step; - bf1 = output; - bf1[0] = bf0[1]; 
- bf1[1] = bf0[30]; - bf1[2] = bf0[3]; - bf1[3] = bf0[28]; - bf1[4] = bf0[5]; - bf1[5] = bf0[26]; - bf1[6] = bf0[7]; - bf1[7] = bf0[24]; - bf1[8] = bf0[9]; - bf1[9] = bf0[22]; - bf1[10] = bf0[11]; - bf1[11] = bf0[20]; - bf1[12] = bf0[13]; - bf1[13] = bf0[18]; - bf1[14] = bf0[15]; - bf1[15] = bf0[16]; - bf1[16] = bf0[17]; - bf1[17] = bf0[14]; - bf1[18] = bf0[19]; - bf1[19] = bf0[12]; - bf1[20] = bf0[21]; - bf1[21] = bf0[10]; - bf1[22] = bf0[23]; - bf1[23] = bf0[8]; - bf1[24] = bf0[25]; - bf1[25] = bf0[6]; - bf1[26] = bf0[27]; - bf1[27] = bf0[4]; - bf1[28] = bf0[29]; - bf1[29] = bf0[2]; - bf1[30] = bf0[31]; - bf1[31] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = bf0[0]; + bf1[1] = -bf0[8]; + bf1[2] = bf0[12]; + bf1[3] = -bf0[4]; + bf1[4] = bf0[6]; + bf1[5] = -bf0[14]; + bf1[6] = bf0[10]; + bf1[7] = -bf0[2]; + bf1[8] = bf0[3]; + bf1[9] = -bf0[11]; + bf1[10] = bf0[15]; + bf1[11] = -bf0[7]; + bf1[12] = bf0[5]; + bf1[13] = -bf0[13]; + bf1[14] = bf0[9]; + bf1[15] = -bf0[1]; } -#if CONFIG_EXT_TX -void av1_iidentity4_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { (void)cos_bit; - for (int i = 0; i < 4; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2); - range_check(0, input, output, 4, stage_range[0]); + (void)stage_range; + for (int i = 0; i < 4; ++i) { + output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits); + } + assert(stage_range[0] + NewSqrt2Bits <= 32); } -void av1_iidentity8_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { (void)cos_bit; - for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; - range_check(0, input, output, 8, stage_range[0]); + (void)stage_range; + for (int i = 0; i < 8; ++i) 
output[i] = (int32_t)((int64_t)input[i] * 2); } -void av1_iidentity16_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { (void)cos_bit; + (void)stage_range; for (int i = 0; i < 16; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2); - range_check(0, input, output, 16, stage_range[0]); -} - -void av1_iidentity32_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { - (void)cos_bit; - for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; - range_check(0, input, output, 32, stage_range[0]); + output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); } -#if CONFIG_TX64X64 -void av1_iidentity64_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { (void)cos_bit; - for (int i = 0; i < 64; ++i) - output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2); - range_check(0, input, output, 64, stage_range[0]); + (void)stage_range; + for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); } -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX -#if CONFIG_TX64X64 -void av1_idct64_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range) { +void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); const int32_t size = 64; - const int32_t *cospi; + const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[64]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); // stage 1; stage++; - cospi = cospi_arr(cos_bit[stage]); - assert(output != input); bf1 = output; bf1[0] = 
input[0]; bf1[1] = input[32]; @@ -1687,11 +1193,10 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[61] = input[47]; bf1[62] = input[31]; bf1[63] = input[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; @@ -1726,43 +1231,42 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]); - bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]); - bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]); - bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]); - bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]); - bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]); - bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]); - bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]); - bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]); - bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]); - bf1[50] = half_btf(cospi[45], bf0[45], 
cospi[19], bf0[50], cos_bit[stage]); - bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]); - bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]); - bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]); - bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]); - bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]); - bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]); - bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]); - bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]); - bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]); - bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]); - bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]); - bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]); - bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]); - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], 
bf0[44], -cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); + bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -1781,59 +1285,58 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; - bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]); - bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], 
cos_bit[stage]); - bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]); - bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]); - bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]); - bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]); - bf1[32] = bf0[32] + bf0[33]; - bf1[33] = bf0[32] - bf0[33]; - bf1[34] = -bf0[34] + bf0[35]; - bf1[35] = bf0[34] + bf0[35]; - bf1[36] = bf0[36] + bf0[37]; - bf1[37] = bf0[36] - bf0[37]; - bf1[38] = -bf0[38] + bf0[39]; - bf1[39] = bf0[38] + bf0[39]; - bf1[40] = bf0[40] + bf0[41]; - bf1[41] = bf0[40] - bf0[41]; - bf1[42] = -bf0[42] + bf0[43]; - bf1[43] = bf0[42] + bf0[43]; - bf1[44] = bf0[44] + bf0[45]; - bf1[45] = bf0[44] - bf0[45]; - bf1[46] = -bf0[46] + bf0[47]; - bf1[47] = bf0[46] + bf0[47]; - bf1[48] = bf0[48] + bf0[49]; - bf1[49] = bf0[48] - bf0[49]; - bf1[50] = -bf0[50] + bf0[51]; - bf1[51] = bf0[50] + bf0[51]; - bf1[52] = bf0[52] + bf0[53]; - bf1[53] = bf0[52] - bf0[53]; - bf1[54] = -bf0[54] + bf0[55]; - bf1[55] = bf0[54] + bf0[55]; - bf1[56] = bf0[56] + bf0[57]; - bf1[57] = bf0[56] - bf0[57]; - bf1[58] = -bf0[58] + bf0[59]; - bf1[59] = bf0[58] + bf0[59]; - bf1[60] = bf0[60] + bf0[61]; - bf1[61] = bf0[60] - bf0[61]; - bf1[62] = -bf0[62] + bf0[63]; - bf1[63] = bf0[62] + bf0[63]; - range_check(stage, input, bf1, size, 
stage_range[stage]); + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); + bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); + bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); + bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]); + bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); + bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); + bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); + 
bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); + bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); + bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); + bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); + bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); + bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); + bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]); + bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); + bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); + bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; bf1[0] = bf0[0]; @@ -1844,326 +1347,322 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]); - bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]); - bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]); - bf1[14] = 
half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]); - bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]); - bf1[16] = bf0[16] + bf0[17]; - bf1[17] = bf0[16] - bf0[17]; - bf1[18] = -bf0[18] + bf0[19]; - bf1[19] = bf0[18] + bf0[19]; - bf1[20] = bf0[20] + bf0[21]; - bf1[21] = bf0[20] - bf0[21]; - bf1[22] = -bf0[22] + bf0[23]; - bf1[23] = bf0[22] + bf0[23]; - bf1[24] = bf0[24] + bf0[25]; - bf1[25] = bf0[24] - bf0[25]; - bf1[26] = -bf0[26] + bf0[27]; - bf1[27] = bf0[26] + bf0[27]; - bf1[28] = bf0[28] + bf0[29]; - bf1[29] = bf0[28] - bf0[29]; - bf1[30] = -bf0[30] + bf0[31]; - bf1[31] = bf0[30] + bf0[31]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = 
clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; - bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]); - bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]); + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); bf1[35] = bf0[35]; bf1[36] = bf0[36]; - bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]); + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); bf1[39] = bf0[39]; bf1[40] = bf0[40]; - bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]); + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); bf1[43] = bf0[43]; bf1[44] = bf0[44]; - bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]); + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); bf1[47] = bf0[47]; bf1[48] = bf0[48]; - bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]); - bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]); + bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); bf1[51] = bf0[51]; bf1[52] = bf0[52]; - bf1[53] = half_btf(-cospi[20], 
bf0[42], cospi[44], bf0[53], cos_bit[stage]); - bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]); + bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); bf1[55] = bf0[55]; bf1[56] = bf0[56]; - bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]); - bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]); + bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); bf1[59] = bf0[59]; bf1[60] = bf0[60]; - bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]); - bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]); + bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; - bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]); - bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]); - bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]); - bf1[8] = bf0[8] + bf0[9]; - bf1[9] = bf0[8] - bf0[9]; - bf1[10] = -bf0[10] + bf0[11]; - bf1[11] = bf0[10] + bf0[11]; - bf1[12] = bf0[12] + bf0[13]; - bf1[13] = bf0[12] - bf0[13]; - bf1[14] = -bf0[14] + bf0[15]; - bf1[15] = bf0[14] + bf0[15]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = 
half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; - bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]); - bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]); + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; - bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]); + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; - bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]); + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; - bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]); - bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]); + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[35]; - bf1[33] = bf0[33] + bf0[34]; - bf1[34] = bf0[33] - 
bf0[34]; - bf1[35] = bf0[32] - bf0[35]; - bf1[36] = -bf0[36] + bf0[39]; - bf1[37] = -bf0[37] + bf0[38]; - bf1[38] = bf0[37] + bf0[38]; - bf1[39] = bf0[36] + bf0[39]; - bf1[40] = bf0[40] + bf0[43]; - bf1[41] = bf0[41] + bf0[42]; - bf1[42] = bf0[41] - bf0[42]; - bf1[43] = bf0[40] - bf0[43]; - bf1[44] = -bf0[44] + bf0[47]; - bf1[45] = -bf0[45] + bf0[46]; - bf1[46] = bf0[45] + bf0[46]; - bf1[47] = bf0[44] + bf0[47]; - bf1[48] = bf0[48] + bf0[51]; - bf1[49] = bf0[49] + bf0[50]; - bf1[50] = bf0[49] - bf0[50]; - bf1[51] = bf0[48] - bf0[51]; - bf1[52] = -bf0[52] + bf0[55]; - bf1[53] = -bf0[53] + bf0[54]; - bf1[54] = bf0[53] + bf0[54]; - bf1[55] = bf0[52] + bf0[55]; - bf1[56] = bf0[56] + bf0[59]; - bf1[57] = bf0[57] + bf0[58]; - bf1[58] = bf0[57] - bf0[58]; - bf1[59] = bf0[56] - bf0[59]; - bf1[60] = -bf0[60] + bf0[63]; - bf1[61] = -bf0[61] + bf0[62]; - bf1[62] = bf0[61] + bf0[62]; - bf1[63] = bf0[60] + bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); + bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); + bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); + bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]); + bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); + bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); + bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); + bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[44] + bf0[47], 
stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); + bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); + bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); + bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); + bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); + bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); + bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]); - bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]); - bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]); - bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]); - bf1[4] = bf0[4] + bf0[5]; - bf1[5] = bf0[4] - bf0[5]; - bf1[6] = -bf0[6] + bf0[7]; - bf1[7] = bf0[6] + bf0[7]; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], 
stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; - bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]); - bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]); + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; - bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]); - bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]); + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[19]; - bf1[17] = bf0[17] + bf0[18]; - bf1[18] = bf0[17] - bf0[18]; - bf1[19] = bf0[16] - bf0[19]; - bf1[20] = -bf0[20] + bf0[23]; - bf1[21] = -bf0[21] + bf0[22]; - bf1[22] = bf0[21] + bf0[22]; - bf1[23] = bf0[20] + bf0[23]; - bf1[24] = bf0[24] + bf0[27]; - bf1[25] = bf0[25] + bf0[26]; - bf1[26] = bf0[25] - bf0[26]; - bf1[27] = bf0[24] - bf0[27]; - bf1[28] = -bf0[28] + bf0[31]; - bf1[29] = -bf0[29] + bf0[30]; - bf1[30] = bf0[29] + bf0[30]; - bf1[31] = bf0[28] + bf0[31]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - 
bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; - bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]); - bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]); - bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]); + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); bf1[38] = bf0[38]; bf1[39] = bf0[39]; bf1[40] = bf0[40]; bf1[41] = bf0[41]; - bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]); + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); bf1[46] = bf0[46]; bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = bf0[49]; - bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]); - bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]); - bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]); - bf1[53] = 
half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]); + bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); + bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); bf1[54] = bf0[54]; bf1[55] = bf0[55]; bf1[56] = bf0[56]; bf1[57] = bf0[57]; - bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]); - bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]); - bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]); - bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]); + bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); + bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[3]; - bf1[1] = bf0[1] + bf0[2]; - bf1[2] = bf0[1] - bf0[2]; - bf1[3] = bf0[0] - bf0[3]; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; - bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); - bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]); + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - bf1[8] = bf0[8] + bf0[11]; - bf1[9] = bf0[9] + 
bf0[10]; - bf1[10] = bf0[9] - bf0[10]; - bf1[11] = bf0[8] - bf0[11]; - bf1[12] = -bf0[12] + bf0[15]; - bf1[13] = -bf0[13] + bf0[14]; - bf1[14] = bf0[13] + bf0[14]; - bf1[15] = bf0[12] + bf0[15]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; - bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]); - bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]); - bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]); + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; - bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]); - bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]); - bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]); + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], 
cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[39]; - bf1[33] = bf0[33] + bf0[38]; - bf1[34] = bf0[34] + bf0[37]; - bf1[35] = bf0[35] + bf0[36]; - bf1[36] = bf0[35] - bf0[36]; - bf1[37] = bf0[34] - bf0[37]; - bf1[38] = bf0[33] - bf0[38]; - bf1[39] = bf0[32] - bf0[39]; - bf1[40] = -bf0[40] + bf0[47]; - bf1[41] = -bf0[41] + bf0[46]; - bf1[42] = -bf0[42] + bf0[45]; - bf1[43] = -bf0[43] + bf0[44]; - bf1[44] = bf0[43] + bf0[44]; - bf1[45] = bf0[42] + bf0[45]; - bf1[46] = bf0[41] + bf0[46]; - bf1[47] = bf0[40] + bf0[47]; - bf1[48] = bf0[48] + bf0[55]; - bf1[49] = bf0[49] + bf0[54]; - bf1[50] = bf0[50] + bf0[53]; - bf1[51] = bf0[51] + bf0[52]; - bf1[52] = bf0[51] - bf0[52]; - bf1[53] = bf0[50] - bf0[53]; - bf1[54] = bf0[49] - bf0[54]; - bf1[55] = bf0[48] - bf0[55]; - bf1[56] = -bf0[56] + bf0[63]; - bf1[57] = -bf0[57] + bf0[62]; - bf1[58] = -bf0[58] + bf0[61]; - bf1[59] = -bf0[59] + bf0[60]; - bf1[60] = bf0[59] + bf0[60]; - bf1[61] = bf0[58] + bf0[61]; - bf1[62] = bf0[57] + bf0[62]; - bf1[63] = bf0[56] + bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); + bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); + bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]); + bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); + bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[42] + 
bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); + bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); + bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); + bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); + bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); + bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); + bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[7]; - bf1[1] = bf0[1] + bf0[6]; - bf1[2] = bf0[2] + bf0[5]; - bf1[3] = bf0[3] + bf0[4]; - bf1[4] = bf0[3] - bf0[4]; - bf1[5] = bf0[2] - bf0[5]; - bf1[6] = bf0[1] - bf0[6]; - bf1[7] = bf0[0] - bf0[7]; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = 
clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); - bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]); - bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]); + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - bf1[16] = bf0[16] + bf0[23]; - bf1[17] = bf0[17] + bf0[22]; - bf1[18] = bf0[18] + bf0[21]; - bf1[19] = bf0[19] + bf0[20]; - bf1[20] = bf0[19] - bf0[20]; - bf1[21] = bf0[18] - bf0[21]; - bf1[22] = bf0[17] - bf0[22]; - bf1[23] = bf0[16] - bf0[23]; - bf1[24] = -bf0[24] + bf0[31]; - bf1[25] = -bf0[25] + bf0[30]; - bf1[26] = -bf0[26] + bf0[29]; - bf1[27] = -bf0[27] + bf0[28]; - bf1[28] = bf0[27] + bf0[28]; - bf1[29] = bf0[26] + bf0[29]; - bf1[30] = bf0[25] + bf0[30]; - bf1[31] = bf0[24] + bf0[31]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + 
bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; bf1[35] = bf0[35]; - bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]); - bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]); - bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]); - bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]); - bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]); + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); bf1[44] = bf0[44]; bf1[45] = bf0[45]; bf1[46] = bf0[46]; @@ -2172,128 +1671,126 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[49] = bf0[49]; bf1[50] = bf0[50]; bf1[51] = bf0[51]; - bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]); - bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]); - bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]); - bf1[55] = 
half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]); - bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]); - bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]); - bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]); - bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]); + bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); + bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); + bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); + bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); bf1[60] = bf0[60]; bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[15]; - bf1[1] = bf0[1] + bf0[14]; - bf1[2] = bf0[2] + bf0[13]; - bf1[3] = bf0[3] + bf0[12]; - bf1[4] = bf0[4] + bf0[11]; - bf1[5] = bf0[5] + bf0[10]; - bf1[6] = bf0[6] + bf0[9]; - bf1[7] = bf0[7] + bf0[8]; - bf1[8] = bf0[7] - bf0[8]; - bf1[9] = bf0[6] - bf0[9]; - bf1[10] = bf0[5] - bf0[10]; - bf1[11] = bf0[4] - bf0[11]; - bf1[12] = bf0[3] - bf0[12]; - bf1[13] = bf0[2] - bf0[13]; - bf1[14] = bf0[1] - bf0[14]; - bf1[15] = bf0[0] - bf0[15]; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], 
stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; - bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]); - bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]); - bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]); - bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]); - bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]); + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[28] = bf0[28]; 
bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - bf1[32] = bf0[32] + bf0[47]; - bf1[33] = bf0[33] + bf0[46]; - bf1[34] = bf0[34] + bf0[45]; - bf1[35] = bf0[35] + bf0[44]; - bf1[36] = bf0[36] + bf0[43]; - bf1[37] = bf0[37] + bf0[42]; - bf1[38] = bf0[38] + bf0[41]; - bf1[39] = bf0[39] + bf0[40]; - bf1[40] = bf0[39] - bf0[40]; - bf1[41] = bf0[38] - bf0[41]; - bf1[42] = bf0[37] - bf0[42]; - bf1[43] = bf0[36] - bf0[43]; - bf1[44] = bf0[35] - bf0[44]; - bf1[45] = bf0[34] - bf0[45]; - bf1[46] = bf0[33] - bf0[46]; - bf1[47] = bf0[32] - bf0[47]; - bf1[48] = -bf0[48] + bf0[63]; - bf1[49] = -bf0[49] + bf0[62]; - bf1[50] = -bf0[50] + bf0[61]; - bf1[51] = -bf0[51] + bf0[60]; - bf1[52] = -bf0[52] + bf0[59]; - bf1[53] = -bf0[53] + bf0[58]; - bf1[54] = -bf0[54] + bf0[57]; - bf1[55] = -bf0[55] + bf0[56]; - bf1[56] = bf0[55] + bf0[56]; - bf1[57] = bf0[54] + bf0[57]; - bf1[58] = bf0[53] + bf0[58]; - bf1[59] = bf0[52] + bf0[59]; - bf1[60] = bf0[51] + bf0[60]; - bf1[61] = bf0[50] + bf0[61]; - bf1[62] = bf0[49] + bf0[62]; - bf1[63] = bf0[48] + bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); + bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); + bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); + bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); + bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[34] - bf0[45], 
stage_range[stage]); + bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); + bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); + bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); + bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); + bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = output; bf1 = step; - bf1[0] = bf0[0] + bf0[31]; - bf1[1] = bf0[1] + bf0[30]; - bf1[2] = bf0[2] + bf0[29]; - bf1[3] = bf0[3] + bf0[28]; - bf1[4] = bf0[4] + bf0[27]; - bf1[5] = bf0[5] + bf0[26]; - bf1[6] = bf0[6] + bf0[25]; - bf1[7] = bf0[7] + bf0[24]; - bf1[8] = bf0[8] + bf0[23]; - bf1[9] = bf0[9] + bf0[22]; - bf1[10] = bf0[10] + bf0[21]; - bf1[11] = bf0[11] + bf0[20]; - bf1[12] = bf0[12] + bf0[19]; - bf1[13] = bf0[13] + bf0[18]; - bf1[14] = bf0[14] + bf0[17]; - bf1[15] = bf0[15] + bf0[16]; - bf1[16] = bf0[15] - bf0[16]; - bf1[17] = bf0[14] - bf0[17]; - bf1[18] = bf0[13] - bf0[18]; - bf1[19] = bf0[12] - bf0[19]; - bf1[20] = bf0[11] - bf0[20]; - bf1[21] = bf0[10] - bf0[21]; - 
bf1[22] = bf0[9] - bf0[22]; - bf1[23] = bf0[8] - bf0[23]; - bf1[24] = bf0[7] - bf0[24]; - bf1[25] = bf0[6] - bf0[25]; - bf1[26] = bf0[5] - bf0[26]; - bf1[27] = bf0[4] - bf0[27]; - bf1[28] = bf0[3] - bf0[28]; - bf1[29] = bf0[2] - bf0[29]; - bf1[30] = bf0[1] - bf0[30]; - bf1[31] = bf0[0] - bf0[31]; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], 
stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; @@ -2302,22 +1799,22 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[37] = bf0[37]; bf1[38] = bf0[38]; bf1[39] = bf0[39]; - bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]); - bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]); - bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]); - bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]); - bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]); - bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]); - bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]); - bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]); - bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]); - bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]); - bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]); - bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]); - bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]); - bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]); - bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]); - bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]); + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], 
cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); bf1[56] = bf0[56]; bf1[57] = bf0[57]; bf1[58] = bf0[58]; @@ -2326,77 +1823,74 @@ void av1_idct64_new(const int32_t *input, int32_t *output, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; - cospi = cospi_arr(cos_bit[stage]); bf0 = step; bf1 = output; - bf1[0] = bf0[0] + bf0[63]; - bf1[1] = bf0[1] + bf0[62]; - bf1[2] = bf0[2] + bf0[61]; - bf1[3] = bf0[3] + bf0[60]; - bf1[4] = bf0[4] + bf0[59]; - bf1[5] = bf0[5] + bf0[58]; - bf1[6] = bf0[6] + bf0[57]; - bf1[7] = bf0[7] + bf0[56]; - bf1[8] = bf0[8] + bf0[55]; - bf1[9] = bf0[9] + bf0[54]; - bf1[10] = bf0[10] + bf0[53]; - bf1[11] = bf0[11] + bf0[52]; - bf1[12] = bf0[12] + bf0[51]; - bf1[13] = bf0[13] + bf0[50]; - bf1[14] = bf0[14] + bf0[49]; - bf1[15] = bf0[15] + bf0[48]; - bf1[16] = bf0[16] + bf0[47]; - bf1[17] = bf0[17] + bf0[46]; - bf1[18] = bf0[18] + bf0[45]; - bf1[19] = bf0[19] + bf0[44]; - bf1[20] = bf0[20] + bf0[43]; - bf1[21] = bf0[21] + 
bf0[42]; - bf1[22] = bf0[22] + bf0[41]; - bf1[23] = bf0[23] + bf0[40]; - bf1[24] = bf0[24] + bf0[39]; - bf1[25] = bf0[25] + bf0[38]; - bf1[26] = bf0[26] + bf0[37]; - bf1[27] = bf0[27] + bf0[36]; - bf1[28] = bf0[28] + bf0[35]; - bf1[29] = bf0[29] + bf0[34]; - bf1[30] = bf0[30] + bf0[33]; - bf1[31] = bf0[31] + bf0[32]; - bf1[32] = bf0[31] - bf0[32]; - bf1[33] = bf0[30] - bf0[33]; - bf1[34] = bf0[29] - bf0[34]; - bf1[35] = bf0[28] - bf0[35]; - bf1[36] = bf0[27] - bf0[36]; - bf1[37] = bf0[26] - bf0[37]; - bf1[38] = bf0[25] - bf0[38]; - bf1[39] = bf0[24] - bf0[39]; - bf1[40] = bf0[23] - bf0[40]; - bf1[41] = bf0[22] - bf0[41]; - bf1[42] = bf0[21] - bf0[42]; - bf1[43] = bf0[20] - bf0[43]; - bf1[44] = bf0[19] - bf0[44]; - bf1[45] = bf0[18] - bf0[45]; - bf1[46] = bf0[17] - bf0[46]; - bf1[47] = bf0[16] - bf0[47]; - bf1[48] = bf0[15] - bf0[48]; - bf1[49] = bf0[14] - bf0[49]; - bf1[50] = bf0[13] - bf0[50]; - bf1[51] = bf0[12] - bf0[51]; - bf1[52] = bf0[11] - bf0[52]; - bf1[53] = bf0[10] - bf0[53]; - bf1[54] = bf0[9] - bf0[54]; - bf1[55] = bf0[8] - bf0[55]; - bf1[56] = bf0[7] - bf0[56]; - bf1[57] = bf0[6] - bf0[57]; - bf1[58] = bf0[5] - bf0[58]; - bf1[59] = bf0[4] - bf0[59]; - bf1[60] = bf0[3] - bf0[60]; - bf1[61] = bf0[2] - bf0[61]; - bf1[62] = bf0[1] - bf0[62]; - bf1[63] = bf0[0] - bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); + bf1[10] = 
clamp_value(bf0[10] + bf0[53], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); + bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); + bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); + bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); + bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); + bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); + bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]); + bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); + bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); + bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); + bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); + bf1[41] = 
clamp_value(bf0[22] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); + bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); } -#endif // CONFIG_TX64X64 diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h index 8996f7c9d..64a1a921c 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d.h @@ -18,41 +18,41 @@ extern "C" { #endif -void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, +static INLINE int32_t clamp_value(int32_t value, int8_t bit) { + if (bit <= 0) return value; // Do nothing for invalid clamp bit. 
+ const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + return (int32_t)clamp64(value, min_value, max_value); +} + +static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { + for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); +} + +void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); -void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit, +void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); -void av1_idct16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_idct32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#if CONFIG_TX64X64 -void av1_idct64_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#endif // CONFIG_TX64X64 - -void av1_iadst4_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst8_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst16_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst32_new(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#if CONFIG_EXT_TX -void av1_iidentity4_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iidentity8_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iidentity16_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iidentity32_c(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); -#if CONFIG_TX64X64 -void av1_iidentity64_c(const int32_t *input, int32_t *output, 
- const int8_t *cos_bit, const int8_t *stage_range); -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX +void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); #ifdef __cplusplus } diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h index 8bcf84e05..4c600f756 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h @@ -14,358 +14,34 @@ #include "av1/common/av1_inv_txfm1d.h" // sum of fwd_shift_## -#if CONFIG_CHROMA_2X2 -#if CONFIG_TX64X64 -static const int8_t fwd_shift_sum[TX_SIZES] = { 3, 2, 1, 0, -2, -4 }; -#else // CONFIG_TX64X64 -static const int8_t fwd_shift_sum[TX_SIZES] = { 3, 2, 1, 0, -2 }; -#endif // CONFIG_TX64X64 -#else // CONFIG_CHROMA_2X2 -#if CONFIG_TX64X64 -static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2, -4 }; -#else // CONFIG_TX64X64 -static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2 }; -#endif // CONFIG_TX64X64 
-#endif // CONFIG_CHROMA_2X2 +static const int8_t inv_start_range[TX_SIZES_ALL] = { + 5, // 4x4 transform + 6, // 8x8 transform + 7, // 16x16 transform + 7, // 32x32 transform + 7, // 64x64 transform + 5, // 4x8 transform + 5, // 8x4 transform + 6, // 8x16 transform + 6, // 16x8 transform + 6, // 16x32 transform + 6, // 32x16 transform + 6, // 32x64 transform + 6, // 64x32 transform + 6, // 4x16 transform + 6, // 16x4 transform + 7, // 8x32 transform + 7, // 32x8 transform + 7, // 16x64 transform + 7, // 64x16 transform +}; + +extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL]; + +// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12 +// for each valid row and col combination +#define INV_COS_BIT 12 +extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/]; +extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/]; -// ---------------- 4x4 1D config ----------------------- -// shift -static const int8_t inv_shift_4[2] = { 0, -4 }; - -// stage range -static const int8_t inv_stage_range_col_dct_4[4] = { 3, 3, 2, 2 }; -static const int8_t inv_stage_range_row_dct_4[4] = { 3, 3, 3, 3 }; -static const int8_t inv_stage_range_col_adst_4[6] = { 3, 3, 3, 3, 2, 2 }; -static const int8_t inv_stage_range_row_adst_4[6] = { 3, 3, 3, 3, 3, 3 }; -static const int8_t inv_stage_range_idx_4[1] = { 0 }; - -// cos bit -static const int8_t inv_cos_bit_col_dct_4[4] = { 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_dct_4[4] = { 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_col_adst_4[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_adst_4[6] = { 13, 13, 13, 13, 13, 13 }; - -// ---------------- 8x8 1D constants ----------------------- -// shift -static const int8_t inv_shift_8[2] = { 0, -5 }; - -// stage range -static const int8_t inv_stage_range_col_dct_8[6] = { 5, 5, 5, 5, 4, 4 }; -static const int8_t inv_stage_range_row_dct_8[6] = { 5, 5, 5, 5, 5, 5 }; -static const int8_t inv_stage_range_col_adst_8[8] = { 5, 5, 5, 5, 5, 5, 4, 4 }; 
-static const int8_t inv_stage_range_row_adst_8[8] = { 5, 5, 5, 5, 5, 5, 5, 5 }; -static const int8_t inv_stage_range_idx_8[1] = { 0 }; - -// cos bit -static const int8_t inv_cos_bit_col_dct_8[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_dct_8[6] = { 13, 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_col_adst_8[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; -static const int8_t inv_cos_bit_row_adst_8[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; - -// ---------------- 16x16 1D constants ----------------------- -// shift -static const int8_t inv_shift_16[2] = { -1, -5 }; - -// stage range -static const int8_t inv_stage_range_col_dct_16[8] = { 7, 7, 7, 7, 7, 7, 6, 6 }; -static const int8_t inv_stage_range_row_dct_16[8] = { 7, 7, 7, 7, 7, 7, 7, 7 }; -static const int8_t inv_stage_range_col_adst_16[10] = { 7, 7, 7, 7, 7, - 7, 7, 7, 6, 6 }; -static const int8_t inv_stage_range_row_adst_16[10] = { 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7 }; -static const int8_t inv_stage_range_idx_16[1] = { 0 }; - -// cos bit -static const int8_t inv_cos_bit_col_dct_16[8] = { - 13, 13, 13, 13, 13, 13, 13, 13 -}; -static const int8_t inv_cos_bit_row_dct_16[8] = { - 12, 12, 12, 12, 12, 12, 12, 12 -}; -static const int8_t inv_cos_bit_col_adst_16[10] = { 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_adst_16[10] = { 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12 }; - -// ---------------- 32x32 1D constants ----------------------- -// shift -static const int8_t inv_shift_32[2] = { -1, -5 }; - -// stage range -static const int8_t inv_stage_range_col_dct_32[10] = { 9, 9, 9, 9, 9, - 9, 9, 9, 8, 8 }; -static const int8_t inv_stage_range_row_dct_32[10] = { 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9 }; -static const int8_t inv_stage_range_col_adst_32[12] = { 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 8, 8 }; -static const int8_t inv_stage_range_row_adst_32[12] = { 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9 }; -static const int8_t inv_stage_range_idx_32[1] = { 0 }; - -// cos bit 
-static const int8_t inv_cos_bit_col_dct_32[10] = { 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_dct_32[10] = { 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12 }; -static const int8_t inv_cos_bit_col_adst_32[12] = { 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_adst_32[12] = { 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12 }; - -// ---------------- 64x64 1D constants ----------------------- -// shift -static const int8_t inv_shift_64[2] = { -1, -5 }; - -// stage range -static const int8_t inv_stage_range_col_dct_64[12] = { 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 10, 10 }; -static const int8_t inv_stage_range_row_dct_64[12] = { 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11 }; - -static const int8_t inv_stage_range_idx_64[1] = { 0 }; - -// cos bit -static const int8_t inv_cos_bit_col_dct_64[12] = { 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13 }; -static const int8_t inv_cos_bit_row_dct_64[12] = { 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12 }; - -// ---------------- row config inv_dct_4 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_4 = { - 4, // .txfm_size - 4, // .stage_num - inv_shift_4, // .shift - inv_stage_range_row_dct_4, // .stage_range - inv_cos_bit_row_dct_4, // .cos_bit - TXFM_TYPE_DCT4 // .txfm_type -}; - -// ---------------- row config inv_dct_8 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_8 = { - 8, // .txfm_size - 6, // .stage_num - inv_shift_8, // .shift - inv_stage_range_row_dct_8, // .stage_range - inv_cos_bit_row_dct_8, // .cos_bit_ - TXFM_TYPE_DCT8 // .txfm_type -}; -// ---------------- row config inv_dct_16 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_16 = { - 16, // .txfm_size - 8, // .stage_num - inv_shift_16, // .shift - inv_stage_range_row_dct_16, // .stage_range - inv_cos_bit_row_dct_16, // .cos_bit - TXFM_TYPE_DCT16 // .txfm_type -}; - -// ---------------- row config inv_dct_32 
---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_32 = { - 32, // .txfm_size - 10, // .stage_num - inv_shift_32, // .shift - inv_stage_range_row_dct_32, // .stage_range - inv_cos_bit_row_dct_32, // .cos_bit_row - TXFM_TYPE_DCT32 // .txfm_type -}; - -#if CONFIG_TX64X64 -// ---------------- row config inv_dct_64 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_64 = { - 64, // .txfm_size - 12, // .stage_num - inv_shift_64, // .shift - inv_stage_range_row_dct_64, // .stage_range - inv_cos_bit_row_dct_64, // .cos_bit - TXFM_TYPE_DCT64, // .txfm_type_col -}; -#endif // CONFIG_TX64X64 - -// ---------------- row config inv_adst_4 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_4 = { - 4, // .txfm_size - 6, // .stage_num - inv_shift_4, // .shift - inv_stage_range_row_adst_4, // .stage_range - inv_cos_bit_row_adst_4, // .cos_bit - TXFM_TYPE_ADST4, // .txfm_type -}; - -// ---------------- row config inv_adst_8 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_8 = { - 8, // .txfm_size - 8, // .stage_num - inv_shift_8, // .shift - inv_stage_range_row_adst_8, // .stage_range - inv_cos_bit_row_adst_8, // .cos_bit - TXFM_TYPE_ADST8, // .txfm_type_col -}; - -// ---------------- row config inv_adst_16 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_16 = { - 16, // .txfm_size - 10, // .stage_num - inv_shift_16, // .shift - inv_stage_range_row_adst_16, // .stage_range - inv_cos_bit_row_adst_16, // .cos_bit - TXFM_TYPE_ADST16, // .txfm_type -}; - -// ---------------- row config inv_adst_32 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_32 = { - 32, // .txfm_size - 12, // .stage_num - inv_shift_32, // .shift - inv_stage_range_row_adst_32, // .stage_range - inv_cos_bit_row_adst_32, // .cos_bit - TXFM_TYPE_ADST32, // .txfm_type -}; - -// ---------------- col config inv_dct_4 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_4 = { - 4, // .txfm_size - 4, // 
.stage_num - inv_shift_4, // .shift - inv_stage_range_col_dct_4, // .stage_range - inv_cos_bit_col_dct_4, // .cos_bit - TXFM_TYPE_DCT4 // .txfm_type -}; - -// ---------------- col config inv_dct_8 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_8 = { - 8, // .txfm_size - 6, // .stage_num - inv_shift_8, // .shift - inv_stage_range_col_dct_8, // .stage_range - inv_cos_bit_col_dct_8, // .cos_bit_ - TXFM_TYPE_DCT8 // .txfm_type -}; -// ---------------- col config inv_dct_16 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_16 = { - 16, // .txfm_size - 8, // .stage_num - inv_shift_16, // .shift - inv_stage_range_col_dct_16, // .stage_range - inv_cos_bit_col_dct_16, // .cos_bit - TXFM_TYPE_DCT16 // .txfm_type -}; - -// ---------------- col config inv_dct_32 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_32 = { - 32, // .txfm_size - 10, // .stage_num - inv_shift_32, // .shift - inv_stage_range_col_dct_32, // .stage_range - inv_cos_bit_col_dct_32, // .cos_bit_col - TXFM_TYPE_DCT32 // .txfm_type -}; - -// ---------------- col config inv_dct_64 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_64 = { - 64, // .txfm_size - 12, // .stage_num - inv_shift_64, // .shift - inv_stage_range_col_dct_64, // .stage_range - inv_cos_bit_col_dct_64, // .cos_bit - TXFM_TYPE_DCT64, // .txfm_type_col -}; - -// ---------------- col config inv_adst_4 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_4 = { - 4, // .txfm_size - 6, // .stage_num - inv_shift_4, // .shift - inv_stage_range_col_adst_4, // .stage_range - inv_cos_bit_col_adst_4, // .cos_bit - TXFM_TYPE_ADST4, // .txfm_type -}; - -// ---------------- col config inv_adst_8 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_8 = { - 8, // .txfm_size - 8, // .stage_num - inv_shift_8, // .shift - inv_stage_range_col_adst_8, // .stage_range - inv_cos_bit_col_adst_8, // .cos_bit - TXFM_TYPE_ADST8, // .txfm_type_col -}; - -// 
---------------- col config inv_adst_16 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_16 = { - 16, // .txfm_size - 10, // .stage_num - inv_shift_16, // .shift - inv_stage_range_col_adst_16, // .stage_range - inv_cos_bit_col_adst_16, // .cos_bit - TXFM_TYPE_ADST16, // .txfm_type -}; - -// ---------------- col config inv_adst_32 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_32 = { - 32, // .txfm_size - 12, // .stage_num - inv_shift_32, // .shift - inv_stage_range_col_adst_32, // .stage_range - inv_cos_bit_col_adst_32, // .cos_bit - TXFM_TYPE_ADST32, // .txfm_type -}; - -#if CONFIG_EXT_TX -// identity does not need to differentiate between row and col -// ---------------- row/col config inv_identity_4 ---------- -static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_4 = { - 4, // .txfm_size - 1, // .stage_num - inv_shift_4, // .shift - inv_stage_range_idx_4, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY4, // .txfm_type -}; - -// ---------------- row/col config inv_identity_8 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_8 = { - 8, // .txfm_size - 1, // .stage_num - inv_shift_8, // .shift - inv_stage_range_idx_8, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY8, // .txfm_type -}; - -// ---------------- row/col config inv_identity_16 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_16 = { - 16, // .txfm_size - 1, // .stage_num - inv_shift_16, // .shift - inv_stage_range_idx_16, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY16, // .txfm_type -}; - -// ---------------- row/col config inv_identity_32 ---------------- -static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_32 = { - 32, // .txfm_size - 1, // .stage_num - inv_shift_32, // .shift - inv_stage_range_idx_32, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY32, // .txfm_type -}; - -#if CONFIG_TX64X64 -// ---------------- row/col config inv_identity_32 ---------------- -static const TXFM_1D_CFG 
inv_txfm_1d_cfg_identity_64 = { - 64, // .txfm_size - 1, // .stage_num - inv_shift_64, // .shift - inv_stage_range_idx_64, // .stage_range - NULL, // .cos_bit - TXFM_TYPE_IDENTITY64, // .txfm_type -}; -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX #endif // AV1_INV_TXFM2D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c index 031d11b40..4e6944314 100644 --- a/third_party/aom/av1/common/av1_inv_txfm2d.c +++ b/third_party/aom/av1/common/av1_inv_txfm2d.c @@ -9,218 +9,252 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" -#include "aom_dsp/inv_txfm.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_inv_txfm1d_cfg.h" +void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ + int i; + tran_low_t output[16]; + tran_low_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + range_check_value(a1, bd + 1); + range_check_value(b1, bd + 1); + range_check_value(c1, bd + 1); + range_check_value(d1, bd + 1); + + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip++; + dest++; + } +} + +void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_low_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + 
static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: return av1_idct4_new; case TXFM_TYPE_DCT8: return av1_idct8_new; case TXFM_TYPE_DCT16: return av1_idct16_new; case TXFM_TYPE_DCT32: return av1_idct32_new; -#if CONFIG_TX64X64 case TXFM_TYPE_DCT64: return av1_idct64_new; -#endif // CONFIG_TX64X64 case TXFM_TYPE_ADST4: return av1_iadst4_new; case TXFM_TYPE_ADST8: return av1_iadst8_new; case TXFM_TYPE_ADST16: return av1_iadst16_new; - case TXFM_TYPE_ADST32: return av1_iadst32_new; -#if CONFIG_EXT_TX case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; -#if CONFIG_TX64X64 - case TXFM_TYPE_IDENTITY64: return av1_iidentity64_c; -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX default: assert(0); return NULL; } } -static const TXFM_1D_CFG *inv_txfm_col_cfg_ls[TX_TYPES_1D][TX_SIZES] = { - // DCT - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_col_cfg_dct_4, &inv_txfm_1d_col_cfg_dct_8, - &inv_txfm_1d_col_cfg_dct_16, &inv_txfm_1d_col_cfg_dct_32, -#if CONFIG_TX64X64 - &inv_txfm_1d_col_cfg_dct_64 -#endif // CONFIG_TX64X64 - }, - // ADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8, - &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32, -#if CONFIG_TX64X64 - NULL -#endif // CONFIG_TX64X64 - }, -#if CONFIG_EXT_TX - // FLIPADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8, - &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32, -#if CONFIG_TX64X64 - NULL -#endif // CONFIG_TX64X64 - }, - // IDENTITY - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8, - &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32, -#if CONFIG_TX64X64 - &inv_txfm_1d_cfg_identity_64 -#endif // 
CONFIG_TX64X64 - }, -#endif // CONFIG_EXT_TX -}; +static const int8_t inv_shift_4x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x16[2] = { -2, -4 }; +static const int8_t inv_shift_32x32[2] = { -2, -4 }; +static const int8_t inv_shift_64x64[2] = { -2, -4 }; +static const int8_t inv_shift_4x8[2] = { 0, -4 }; +static const int8_t inv_shift_8x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x32[2] = { -1, -4 }; +static const int8_t inv_shift_32x16[2] = { -1, -4 }; +static const int8_t inv_shift_32x64[2] = { -1, -4 }; +static const int8_t inv_shift_64x32[2] = { -1, -4 }; +static const int8_t inv_shift_4x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x4[2] = { -1, -4 }; +static const int8_t inv_shift_8x32[2] = { -2, -4 }; +static const int8_t inv_shift_32x8[2] = { -2, -4 }; +static const int8_t inv_shift_16x64[2] = { -2, -4 }; +static const int8_t inv_shift_64x16[2] = { -2, -4 }; -static const TXFM_1D_CFG *inv_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES] = { - // DCT - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8, - &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32, -#if CONFIG_TX64X64 - &inv_txfm_1d_row_cfg_dct_64, -#endif // CONFIG_TX64X64 - }, - // ADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8, - &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32, -#if CONFIG_TX64X64 - NULL -#endif // CONFIG_TX64X64 - }, -#if CONFIG_EXT_TX - // FLIPADST - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8, - &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32, -#if CONFIG_TX64X64 - NULL -#endif // CONFIG_TX64X64 - }, - // IDENTITY - { -#if CONFIG_CHROMA_2X2 - NULL, -#endif - &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8, - 
&inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32, -#if CONFIG_TX64X64 - &inv_txfm_1d_cfg_identity_64 -#endif // CONFIG_TX64X64 - }, -#endif // CONFIG_EXT_TX +const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = { + inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, + inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, + inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, + inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, + inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, }; -TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size) { - TXFM_2D_FLIP_CFG cfg; - set_flip_cfg(tx_type, &cfg); - const TX_TYPE_1D tx_type_col = vtx_tab[tx_type]; - const TX_TYPE_1D tx_type_row = htx_tab[tx_type]; - const TX_SIZE tx_size_col = txsize_vert_map[tx_size]; - const TX_SIZE tx_size_row = txsize_horz_map[tx_size]; - cfg.col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size_col]; - cfg.row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size_row]; - return cfg; -} +/* clang-format off */ +const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; -#if CONFIG_TX64X64 -TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(TX_TYPE tx_type) { - TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL }; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_64; - cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_64; - set_flip_cfg(tx_type, &cfg); - break; - default: assert(0); - } - return cfg; -} +const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, 
INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; +/* clang-format on */ -TXFM_2D_FLIP_CFG av1_get_inv_txfm_32x64_cfg(int tx_type) { - TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL }; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_64; - cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_32; - set_flip_cfg(tx_type, &cfg); - break; - default: assert(0); - } - return cfg; -} +const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; -TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x32_cfg(int tx_type) { - TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL }; - switch (tx_type) { - case DCT_DCT: - cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_32; - cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_64; - set_flip_cfg(tx_type, &cfg); - break; - default: assert(0); +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + cfg->shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); + } + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); } - return cfg; + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; } 
-#endif // CONFIG_TX64X64 void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, - const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, int bd) { - // Note when assigning txfm_size_col, we use the txfm_size from the - // row configuration and vice versa. This is intentionally done to - // accurately perform rectangular transforms. When the transform is - // rectangular, the number of columns will be the same as the - // txfm_size stored in the row cfg struct. It will make no difference - // for square transforms. - const int txfm_size_col = cfg->row_cfg->txfm_size; - const int txfm_size_row = cfg->col_cfg->txfm_size; - // Take the shift from the larger dimension in the rectangular case. - const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift - : cfg->col_cfg->shift; + const int fwd_shift = inv_start_range[tx_size]; + const int8_t *shift = cfg->shift; + int8_t opt_range_row, opt_range_col; + if (bd == 8) { + opt_range_row = 16; + opt_range_col = 16; + } else if (bd == 10) { + opt_range_row = 18; + opt_range_col = 16; + } else { + assert(bd == 12); + opt_range_row = 20; + opt_range_col = 18; + } // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning - for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) { - stage_range_row[i] = cfg->row_cfg->stage_range[i] + fwd_shift + bd + 1; + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; + (void)real_range_row; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_row[i] = opt_range_row; + } else { + assert(opt_range_row >= real_range_row); + stage_range_row[i] = opt_range_row; + } } // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning - for (int i = 0; i < 
cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) { - stage_range_col[i] = - cfg->col_cfg->stage_range[i] + fwd_shift + shift[0] + bd + 1; + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_col = + cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; + (void)real_range_col; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_col[i] = opt_range_col; + } else { + assert(opt_range_col >= real_range_col); + stage_range_col[i] = opt_range_col; + } } } static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, int stride, TXFM_2D_FLIP_CFG *cfg, - int32_t *txfm_buf, int8_t fwd_shift, + int32_t *txfm_buf, TX_SIZE tx_size, int bd) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to @@ -228,39 +262,48 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. - const int txfm_size_col = cfg->row_cfg->txfm_size; - const int txfm_size_row = cfg->col_cfg->txfm_size; + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; // Take the shift from the larger dimension in the rectangular case. - const int8_t *shift = (txfm_size_col > txfm_size_row) ? 
cfg->row_cfg->shift - : cfg->col_cfg->shift; + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; - assert(cfg->row_cfg->stage_num <= MAX_TXFM_STAGE_NUM); - assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM); - av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, fwd_shift, bd); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); - const int8_t *cos_bit_col = cfg->col_cfg->cos_bit; - const int8_t *cos_bit_row = cfg->row_cfg->cos_bit; - const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->col_cfg->txfm_type); - const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->row_cfg->txfm_type); + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); - // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * txfm_size_row + // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * + // AOMMAX(txfm_size_row, txfm_size_col) // it is used for intermediate data buffering + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_in = txfm_buf; - int32_t *temp_out = temp_in + txfm_size_row; - int32_t *buf = temp_out + txfm_size_row; + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; int c, r; // Rows for (r = 0; r < txfm_size_row; ++r) { - txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row); - round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - // Multiply everything by Sqrt2 if the transform is rectangular - if (txfm_size_row != txfm_size_col) { - for (c = 0; c < txfm_size_col; ++c) - buf_ptr[c] = 
(int32_t)dct_const_round_shift(buf_ptr[c] * Sqrt2); + if (abs(rect_type) == 1) { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits); + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } else { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = input[c]; + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); input += txfm_size_col; buf_ptr += txfm_size_col; } @@ -275,8 +318,9 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); - round_shift_array(temp_out, txfm_size_row, -shift[1]); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (cfg->ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = @@ -296,156 +340,166 @@ static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, int stride, int32_t *txfm_buf, TX_TYPE tx_type, TX_SIZE tx_size, int bd) { - TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, tx_size); - TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; - inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, - fwd_shift_sum[tx_size_sqr], bd); + TXFM_2D_FLIP_CFG cfg; + av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); + // Forward shift sum uses larger square size, to be consistent with what + // av1_gen_inv_stage_range() does for inverse shifts. 
+ inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); } void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[4 * 8 + 8 + 8]; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); } void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int txfm_buf[8 * 4 + 8 + 8]; - int32_t rinput[8 * 4]; - uint16_t routput[8 * 4]; - TX_SIZE tx_size = TX_8X4; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int32(rinput, rw, input, w, w, h); - transpose_uint16(routput, rw, output, stride, w, h); - inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd); - transpose_uint16(output, stride, routput, rw, rw, rh); -#else - int txfm_buf[8 * 4 + 4 + 4]; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); -#endif } void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[8 * 16 + 16 + 16]; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); } void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int txfm_buf[16 * 8 + 16 + 16]; - int32_t rinput[16 * 8]; - uint16_t routput[16 * 8]; - TX_SIZE tx_size = TX_16X8; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int32(rinput, rw, input, w, w, h); - transpose_uint16(routput, rw, output, 
stride, w, h); - inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd); - transpose_uint16(output, stride, routput, rw, rw, rh); -#else - int txfm_buf[16 * 8 + 8 + 8]; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); -#endif } void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[16 * 32 + 32 + 32]; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); } void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int txfm_buf[32 * 16 + 32 + 32]; - int32_t rinput[32 * 16]; - uint16_t routput[32 * 16]; - TX_SIZE tx_size = TX_32X16; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int32(rinput, rw, input, w, w, h); - transpose_uint16(routput, rw, output, stride, w, h); - inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd); - transpose_uint16(output, stride, routput, rw, rw, rh); -#else - int txfm_buf[32 * 16 + 16 + 16]; + DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); -#endif } void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[4 * 4 + 4 + 4]; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); } void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[8 * 8 + 8 + 8]; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, 
txfm_buf, tx_type, TX_8X8, bd); } void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[16 * 16 + 16 + 16]; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); } void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[32 * 32 + 32 + 32]; + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); } -#if CONFIG_TX64X64 void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[64 * 64 + 64 + 64]; - inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X64, bd); + // TODO(urvang): Can the same array be reused, instead of using a new array? + // Remap 32x32 input into a modified 64x64 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. 
+ int32_t mod_input[64 * 64]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, + bd); } void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { -#if CONFIG_TXMG - int txfm_buf[64 * 32 + 64 + 64]; - int32_t rinput[64 * 32]; - uint16_t routput[64 * 32]; - TX_SIZE tx_size = TX_64X32; - TX_SIZE rtx_size = av1_rotate_tx_size(tx_size); - TX_TYPE rtx_type = av1_rotate_tx_type(tx_type); - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - int rw = h; - int rh = w; - transpose_int32(rinput, rw, input, w, w, h); - transpose_uint16(routput, rw, output, stride, w, h); - inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd); - transpose_uint16(output, stride, routput, rw, rw, rh); -#else - int txfm_buf[64 * 32 + 64 + 64]; - inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X32, bd); -#endif + // Remap 32x32 input into a modified 64x32 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. 
+ int32_t mod_input[64 * 32]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, + bd); } void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { - int txfm_buf[64 * 32 + 64 + 64]; - inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X64, bd); + // Remap 32x32 input into a modified 32x64 input by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[32 * 64]; + memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); + memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, + bd); +} + +void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 16x32 input into a modified 16x64 input by: + // - Copying over these values in top-left 16x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[16 * 64]; + memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); + memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, + bd); +} + +void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x16 input into a modified 64x16 by: + // - Copying over these values in top-left 32x16 locations. + // - Setting the rest of the locations to 0. 
+ int32_t mod_input[64 * 16]; + for (int row = 0; row < 16; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, + bd); +} + +void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); +} + +void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); +} + +void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); +} + +void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); } -#endif // CONFIG_TX64X64 diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c index 95f7a8687..738290fad 100644 --- a/third_party/aom/av1/common/av1_loopfilter.c +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -11,8 +11,9 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" @@ -21,590 +22,211 @@ #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" -#if 
CONFIG_LOOPFILTER_LEVEL static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } }; -#if CONFIG_EXT_DELTA_Q static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, { 2, 2 }, { 3, 3 } }; -#endif // CONFIG_EXT_DELTA_Q -#endif // CONFIG_LOOPFILTER_LEVEL - -#if CONFIG_LPF_DIRECT -static void pick_filter_pixel_left(uint8_t *const src, uint8_t *const line, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int direct) { - int i; - int pos = row * pitch + col; - - for (i = 0; i < length; ++i) { - int dy = 0; - switch (direct) { - case VERT_HORZ: dy = 0; break; - case DEGREE_45: dy = 1; break; - case DEGREE_135: dy = -1; break; - } - col -= 1; - row += dy; - if (col >= 0 && col < width && row >= 0 && row < height) { - pos = row * pitch + col; - line[pivot - 1 - i] = src[pos]; - orig_pos[pivot - 1 - i] = pos; - } - } -} -static void pick_filter_pixel_right(uint8_t *const src, uint8_t *const line, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int direct) { - int i; - int pos = row * pitch + col; +typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; - line[pivot] = src[pos]; - orig_pos[pivot] = pos; +static const int mode_lf_lut[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) + 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) +}; - for (i = 1; i < length; ++i) { - int dy = 0; - switch (direct) { - case VERT_HORZ: dy = 0; break; - case DEGREE_45: dy = -1; break; - case DEGREE_135: dy = 1; break; - } - col += 1; - row += dy; - if (col >= 0 && col < width && row >= 0 && row < height) { - pos = row * pitch + col; - line[pivot + i] = src[pos]; - orig_pos[pivot + i] = pos; - } - } -} +#if LOOP_FILTER_BITMASK +// 256 bit masks (64x64 / 
4x4) for left transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the left border of an 4x4 block boundary. +// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this (-- and | are used for better view) +// +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// ----------------- +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// +// A loopfilter should be applied to every other 4x4 horizontally. +// TODO(chengchen): make these tables static +const FilterMask left_txform_mask[TX_SIZES] = { + { { 0xffffffffffffffffULL, // TX_4X4, + 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, -static void pick_filter_pixel_above(uint8_t *const src, uint8_t *const line, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int direct) { - int i; - int pos = row * pitch + col; - - for (i = 0; i < length; ++i) { - int dx = 0; - switch (direct) { - case VERT_HORZ: dx = 0; break; - case DEGREE_45: dx = 1; break; - case DEGREE_135: dx = -1; break; - } - col += dx; - row -= 1; - if (col >= 0 && col < width && row >= 0 && row < height) { - pos = row * pitch + col; - line[pivot - 1 - i] = src[pos]; - orig_pos[pivot - 1 - i] = pos; - } - } -} + { { 0x5555555555555555ULL, // TX_8X8, + 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } }, -static void pick_filter_pixel_bot(uint8_t *const src, uint8_t *const line, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int direct) { - int i; - int pos = row * pitch + col; + { { 0x1111111111111111ULL, // 
TX_16X16, + 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } }, - line[pivot] = src[pos]; - orig_pos[pivot] = pos; + { { 0x0101010101010101ULL, // TX_32X32, + 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } }, - for (i = 1; i < length; ++i) { - int dx = 0; - switch (direct) { - case VERT_HORZ: dx = 0; break; - case DEGREE_45: dx = -1; break; - case DEGREE_135: dx = 1; break; - } - col += dx; - row += 1; - if (col >= 0 && col < width && row >= 0 && row < height) { - pos = row * pitch + col; - line[pivot + i] = src[pos]; - orig_pos[pivot + i] = pos; - } - } -} + { { 0x0001000100010001ULL, // TX_64X64, + 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, +}; -static void pick_filter_block_vert(uint8_t *const src, uint8_t *const block, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int line_length, int unit, - int direct) { - int i; - for (i = 0; i < 8 * unit; ++i) { - pick_filter_pixel_left(src, block + i * line_length, - orig_pos + i * line_length, length, row + i, col, - width, height, pitch, pivot, direct); - pick_filter_pixel_right(src, block + i * line_length, - orig_pos + i * line_length, length, row + i, col, - width, height, pitch, pivot, direct); - } -} +// 256 bit masks (64x64 / 4x4) for above transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the top border of an 4x4 block boundary. 
+// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// ----------------- +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// +// A loopfilter should be applied to every other 4x4 horizontally. +const FilterMask above_txform_mask[TX_SIZES] = { + { { 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, -static void pick_filter_block_horz(uint8_t *const src, uint8_t *const block, - int *const orig_pos, int length, int row, - int col, int width, int height, int pitch, - int pivot, int line_length, int unit, - int direct) { - int i, j; - int num = 8 * unit; - for (i = 0; i < num; ++i) { - pick_filter_pixel_above(src, block + i * line_length, - orig_pos + i * line_length, length, row, col + i, - width, height, pitch, pivot, direct); - pick_filter_pixel_bot(src, block + i * line_length, - orig_pos + i * line_length, length, row, col + i, - width, height, pitch, pivot, direct); - } + { { 0x0000ffff0000ffffULL, // TX_8X8 + 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } }, - // rearrange block - // TODO(chengchen): make it in-place or a stand alone function - uint8_t tmp_block[256]; - int tmp_pos[256]; - for (i = 0; i < 256; ++i) { - tmp_block[i] = 0; - tmp_pos[i] = -1; - } - for (i = 0; i < num; ++i) { - for (j = 0; j < line_length; ++j) { - tmp_block[j * line_length + i] = block[i * line_length + j]; - tmp_pos[j * line_length + i] = orig_pos[i * line_length + j]; - } - } - for (i = 0; i < 256; ++i) { - block[i] = tmp_block[i]; - orig_pos[i] = tmp_pos[i]; - } -} + { { 0x000000000000ffffULL, // TX_16X16 + 0x000000000000ffffULL, 
0x000000000000ffffULL, 0x000000000000ffffULL } }, -static int compute_block_grad(uint8_t *const src, int length, int row, int col, - int width, int height, int pitch, int unit, - int vert_or_horz, int direct) { - int i, j; - int r0, c0, pos0, r1 = 0, c1 = 0, pos1; - int sum_grad = 0; - for (i = 0; i < 8 * unit; ++i) { - // vert_or_horz: 0 vertical edge, 1 horizontal edge - r0 = vert_or_horz ? row : row + i; - c0 = vert_or_horz ? col + i : col; - pos0 = r0 * pitch + c0; - - for (j = 0; j < length; ++j) { - if (vert_or_horz == 0) { - switch (direct) { - case VERT_HORZ: r1 = r0; break; - case DEGREE_45: r1 = r0 + 1; break; - case DEGREE_135: r1 = r0 - 1; break; - } - c1 = c0 - 1; - } else { - r1 = r0 - 1; - switch (direct) { - case VERT_HORZ: c1 = c0; break; - case DEGREE_45: c1 = c0 + 1; break; - case DEGREE_135: c1 = c0 - 1; break; - } - } - pos1 = r1 * pitch + c1; + { { 0x000000000000ffffULL, // TX_32X32 + 0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } }, - if (r0 >= 0 && r0 < height && c0 >= 0 && c0 < width && r1 >= 0 && - r1 < height && c1 >= 0 && c1 < width) { - sum_grad += abs(src[pos1] - src[pos0]); - } else { - sum_grad += 255; // penalize unreachable boundary - } - r0 = r1; - c0 = c1; - pos0 = pos1; - } + { { 0x000000000000ffffULL, // TX_64X64 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +}; - r0 = vert_or_horz ? row : row + i; - c0 = vert_or_horz ? col + i : col; - pos0 = r0 * pitch + c0; +// 64 bit mask to shift and set for each prediction size. A bit is set for +// each 4x4 block that would be in the top left most block of the given block +// size in the 64x64 block. 
+const FilterMask size_mask_y[BLOCK_SIZES_ALL] = { + { { 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - for (j = 0; j < length - 1; ++j) { - if (vert_or_horz == 0) { - switch (direct) { - case VERT_HORZ: r1 = r0; break; - case DEGREE_45: r1 = r0 - 1; break; - case DEGREE_135: r1 = r0 + 1; break; - } - c1 = c0 + 1; - } else { - r1 = r0 + 1; - switch (direct) { - case VERT_HORZ: c1 = c0; break; - case DEGREE_45: c1 = c0 - 1; break; - case DEGREE_135: c1 = c0 + 1; break; - } - } - pos1 = r1 * pitch + c1; + { { 0x0000000000010001ULL, // BLOCK_4X8 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - if (r0 >= 0 && r0 < height && c0 >= 0 && c0 < width && r1 >= 0 && - r1 < height && c1 >= 0 && c1 < width) { - sum_grad += abs(src[pos1] - src[pos0]); - } else { - sum_grad += 255; // penalize unreachable boundary - } - r0 = r1; - c0 = c1; - pos0 = pos1; - } - } + { { 0x0000000000000003ULL, // BLOCK_8X4 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - return sum_grad; -} + { { 0x0000000000030003ULL, // BLOCK_8X8 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -static int pick_min_grad_direct(uint8_t *const src, int length, int row, - int col, int width, int height, int pitch, - int unit, int vert_or_horz) { - int direct = VERT_HORZ; - int min_grad = INT_MAX, sum_grad = 0; - - int degree; - for (degree = 0; degree < FILTER_DEGREES; ++degree) { - // compute abs gradient along each line for the filter block - sum_grad = compute_block_grad(src, length, row, col, width, height, pitch, - unit, vert_or_horz, degree); - if (sum_grad < min_grad) { - min_grad = sum_grad; - direct = degree; - } - } + { { 0x0003000300030003ULL, // BLOCK_8X16 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - return direct; -} -#endif // CONFIG_LPF_DIRECT + { { 0x00000000000f000fULL, // BLOCK_16X8 + 0x0000000000000000ULL, 
0x0000000000000000ULL, 0x0000000000000000ULL } }, -#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1 -#define PARALLEL_DEBLOCKING_DISABLE_15TAP 0 -#if CONFIG_DEBLOCK_13TAP -#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1 -#else -#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0 -#endif + { { 0x000f000f000f000fULL, // BLOCK_16X16 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA -extern void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh); + { { 0x000f000f000f000fULL, // BLOCK_16X32 + 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -extern void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh); + { { 0x00ff00ff00ff00ffULL, // BLOCK_32X16 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -extern void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd); + { { 0x00ff00ff00ff00ffULL, // BLOCK_32X32 + 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -extern void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd); -#endif + { { 0x00ff00ff00ff00ffULL, // BLOCK_32X64 + 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } }, -// 64 bit masks for left transform size. Each 1 represents a position where -// we should apply a loop filter across the left border of an 8x8 block -// boundary. -// -// In the case of TX_16X16-> ( in low order byte first we end up with -// a mask that looks like this -// -// 10101010 -// 10101010 -// 10101010 -// 10101010 -// 10101010 -// 10101010 -// 10101010 -// 10101010 -// -// A loopfilter should be applied to every other 8x8 horizontally. 
-static const uint64_t left_64x64_txform_mask[TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - 0xffffffffffffffffULL, // TX_2X2 -#endif - 0xffffffffffffffffULL, // TX_4X4 - 0xffffffffffffffffULL, // TX_8x8 - 0x5555555555555555ULL, // TX_16x16 - 0x1111111111111111ULL, // TX_32x32 -#if CONFIG_TX64X64 - 0x0101010101010101ULL, // TX_64x64 -#endif // CONFIG_TX64X64 -}; + { { 0xffffffffffffffffULL, // BLOCK_64X32 + 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -// 64 bit masks for above transform size. Each 1 represents a position where -// we should apply a loop filter across the top border of an 8x8 block -// boundary. -// -// In the case of TX_32x32 -> ( in low order byte first we end up with -// a mask that looks like this -// -// 11111111 -// 00000000 -// 00000000 -// 00000000 -// 11111111 -// 00000000 -// 00000000 -// 00000000 -// -// A loopfilter should be applied to every other 4 the row vertically. -static const uint64_t above_64x64_txform_mask[TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - 0xffffffffffffffffULL, // TX_4X4 -#endif - 0xffffffffffffffffULL, // TX_4X4 - 0xffffffffffffffffULL, // TX_8x8 - 0x00ff00ff00ff00ffULL, // TX_16x16 - 0x000000ff000000ffULL, // TX_32x32 -#if CONFIG_TX64X64 - 0x00000000000000ffULL, // TX_64x64 -#endif // CONFIG_TX64X64 -}; + { { 0xffffffffffffffffULL, // BLOCK_64X64 + 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, + // Y plane max coding block size is 128x128, but the codec divides it + // into 4 64x64 blocks. + // BLOCK_64X128 + { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, + // BLOCK_128X64 + { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, + // BLOCK_128X128 + { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, -// 64 bit masks for prediction sizes (left). Each 1 represents a position -// where left border of an 8x8 block. These are aligned to the right most -// appropriate bit, and then shifted into place. 
-// -// In the case of TX_16x32 -> ( low order byte first ) we end up with -// a mask that looks like this : -// -// 10000000 -// 10000000 -// 10000000 -// 10000000 -// 00000000 -// 00000000 -// 00000000 -// 00000000 -static const uint64_t left_prediction_mask[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0000000000000001ULL, // BLOCK_2X2, - 0x0000000000000001ULL, // BLOCK_2X4, - 0x0000000000000001ULL, // BLOCK_4X2, -#endif - 0x0000000000000001ULL, // BLOCK_4X4, - 0x0000000000000001ULL, // BLOCK_4X8, - 0x0000000000000001ULL, // BLOCK_8X4, - 0x0000000000000001ULL, // BLOCK_8X8, - 0x0000000000000101ULL, // BLOCK_8X16, - 0x0000000000000001ULL, // BLOCK_16X8, - 0x0000000000000101ULL, // BLOCK_16X16, - 0x0000000001010101ULL, // BLOCK_16X32, - 0x0000000000000101ULL, // BLOCK_32X16, - 0x0000000001010101ULL, // BLOCK_32X32, - 0x0101010101010101ULL, // BLOCK_32X64, - 0x0000000001010101ULL, // BLOCK_64X32, - 0x0101010101010101ULL, // BLOCK_64X64, - 0x0000000000000101ULL, // BLOCK_4X16, - 0x0000000000000001ULL, // BLOCK_16X4, - 0x0000000001010101ULL, // BLOCK_8X32, - 0x0000000000000001ULL, // BLOCK_32X8, - 0x0101010101010101ULL, // BLOCK_16X64, - 0x0000000000000101ULL, // BLOCK_64X16 -}; + { { 0x0001000100010001ULL, // BLOCK_4X16 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -// 64 bit mask to shift and set for each prediction size. 
-static const uint64_t above_prediction_mask[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0000000000000001ULL, // BLOCK_2X2 - 0x0000000000000001ULL, // BLOCK_2X4 - 0x0000000000000001ULL, // BLOCK_4X2 -#endif - 0x0000000000000001ULL, // BLOCK_4X4 - 0x0000000000000001ULL, // BLOCK_4X8 - 0x0000000000000001ULL, // BLOCK_8X4 - 0x0000000000000001ULL, // BLOCK_8X8 - 0x0000000000000001ULL, // BLOCK_8X16, - 0x0000000000000003ULL, // BLOCK_16X8 - 0x0000000000000003ULL, // BLOCK_16X16 - 0x0000000000000003ULL, // BLOCK_16X32, - 0x000000000000000fULL, // BLOCK_32X16, - 0x000000000000000fULL, // BLOCK_32X32, - 0x000000000000000fULL, // BLOCK_32X64, - 0x00000000000000ffULL, // BLOCK_64X32, - 0x00000000000000ffULL, // BLOCK_64X64, - 0x0000000000000001ULL, // BLOCK_4X16, - 0x0000000000000003ULL, // BLOCK_16X4, - 0x0000000000000001ULL, // BLOCK_8X32, - 0x000000000000000fULL, // BLOCK_32X8, - 0x0000000000000003ULL, // BLOCK_16X64, - 0x00000000000000ffULL, // BLOCK_64X16 -}; -// 64 bit mask to shift and set for each prediction size. A bit is set for -// each 8x8 block that would be in the top left most block of the given block -// size in the 64x64 block. 
-static const uint64_t size_mask[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0000000000000001ULL, // BLOCK_2X2 - 0x0000000000000001ULL, // BLOCK_2X4 - 0x0000000000000001ULL, // BLOCK_4X2 -#endif - 0x0000000000000001ULL, // BLOCK_4X4 - 0x0000000000000001ULL, // BLOCK_4X8 - 0x0000000000000001ULL, // BLOCK_8X4 - 0x0000000000000001ULL, // BLOCK_8X8 - 0x0000000000000101ULL, // BLOCK_8X16, - 0x0000000000000003ULL, // BLOCK_16X8 - 0x0000000000000303ULL, // BLOCK_16X16 - 0x0000000003030303ULL, // BLOCK_16X32, - 0x0000000000000f0fULL, // BLOCK_32X16, - 0x000000000f0f0f0fULL, // BLOCK_32X32, - 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64, - 0x00000000ffffffffULL, // BLOCK_64X32, - 0xffffffffffffffffULL, // BLOCK_64X64, - 0x0000000000000101ULL, // BLOCK_4X16, - 0x0000000000000003ULL, // BLOCK_16X4, - 0x0000000001010101ULL, // BLOCK_8X32, - 0x000000000000000fULL, // BLOCK_32X8, - 0x0303030303030303ULL, // BLOCK_16X64, - 0x000000000000ffffULL, // BLOCK_64X16 -}; + { { 0x000000000000000fULL, // BLOCK_16X4 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -// These are used for masking the left and above 32x32 borders. -static const uint64_t left_border = 0x1111111111111111ULL; -static const uint64_t above_border = 0x000000ff000000ffULL; + { { 0x0003000300030003ULL, // BLOCK_8X32 + 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -// 16 bit masks for uv transform sizes. 
-static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - 0xffff, // TX_2X2 -#endif - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x5555, // TX_16x16 - 0x1111, // TX_32x32 -#if CONFIG_TX64X64 - 0x0101, // TX_64x64, never used -#endif // CONFIG_TX64X64 -}; + { { 0x0000000000ff00ffULL, // BLOCK_32X8 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, -static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - 0xffff, // TX_2X2 -#endif - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x0f0f, // TX_16x16 - 0x000f, // TX_32x32 -#if CONFIG_TX64X64 - 0x0003, // TX_64x64, never used -#endif // CONFIG_TX64X64 -}; + { { 0x000f000f000f000fULL, // BLOCK_16X64 + 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } }, -// 16 bit left mask to shift and set for each uv prediction size. -static const uint16_t left_prediction_mask_uv[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0001, // BLOCK_2X2, - 0x0001, // BLOCK_2X4, - 0x0001, // BLOCK_4X2, -#endif - 0x0001, // BLOCK_4X4, - 0x0001, // BLOCK_4X8, - 0x0001, // BLOCK_8X4, - 0x0001, // BLOCK_8X8, - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8, - 0x0001, // BLOCK_16X16, - 0x0011, // BLOCK_16X32, - 0x0001, // BLOCK_32X16, - 0x0011, // BLOCK_32X32, - 0x1111, // BLOCK_32X64 - 0x0011, // BLOCK_64X32, - 0x1111, // BLOCK_64X64, - 0x0001, // BLOCK_4X16, - 0x0001, // BLOCK_16X4, - 0x0011, // BLOCK_8X32, - 0x0001, // BLOCK_32X8, - 0x1111, // BLOCK_16X64, - 0x0001, // BLOCK_64X16, + { { 0xffffffffffffffffULL, // BLOCK_64X16 + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } } }; -// 16 bit above mask to shift and set for uv each prediction size. 
-static const uint16_t above_prediction_mask_uv[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0001, // BLOCK_2X2 - 0x0001, // BLOCK_2X4 - 0x0001, // BLOCK_4X2 -#endif - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0001, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0003, // BLOCK_32X32, - 0x0003, // BLOCK_32X64, - 0x000f, // BLOCK_64X32, - 0x000f, // BLOCK_64X64, - 0x0001, // BLOCK_4X16, - 0x0001, // BLOCK_16X4, - 0x0001, // BLOCK_8X32, - 0x0003, // BLOCK_32X8, - 0x0001, // BLOCK_16X64, - 0x000f, // BLOCK_64X16 -}; +LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row, + int mi_col) { + if ((mi_row << MI_SIZE_LOG2) >= cm->height || + (mi_col << MI_SIZE_LOG2) >= cm->width) + return NULL; + assert(cm->lf.lfm != NULL); + const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 + const int col = mi_col >> MIN_MIB_SIZE_LOG2; + return &cm->lf.lfm[row * cm->lf.lfm_stride + col]; +} -// 64 bit mask to shift and set for each uv prediction size -static const uint16_t size_mask_uv[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0x0001, // BLOCK_2X2 - 0x0001, // BLOCK_2X4 - 0x0001, // BLOCK_4X2 -#endif - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0011, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0033, // BLOCK_32X32, - 0x3333, // BLOCK_32X64, - 0x00ff, // BLOCK_64X32, - 0xffff, // BLOCK_64X64, - 0x0001, // BLOCK_4X16, - 0x0001, // BLOCK_16X4, - 0x0011, // BLOCK_8X32, - 0x0003, // BLOCK_32X8, - 0x1111, // BLOCK_16X64, - 0x000f, // BLOCK_64X16 -}; -static const uint16_t left_border_uv = 0x1111; -static const uint16_t above_border_uv = 0x000f; +typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh); -static 
const int mode_lf_lut[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES - 0, -#if CONFIG_SMOOTH_HV - 0, 0, -#endif // CONFIG_SMOOTH_HV - 1, 1, 0, 1, // INTER_MODES (ZEROMV == 0) -#if CONFIG_COMPOUND_SINGLEREF - // 1, 1, 1, 1, 1, // INTER_SINGLEREF_COMP_MODES - // NOTE(zoeliu): Remove SR_NEAREST_NEWMV - 1, 1, 1, 1, // INTER_SINGLEREF_COMP_MODES -#endif // CONFIG_COMPOUND_SINGLEREF - 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0) -}; +typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1); + +typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd); + +typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd); +#endif // LOOP_FILTER_BITMASK static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; @@ -626,64 +248,36 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { SIMD_WIDTH); } } -#if CONFIG_EXT_DELTA_Q static uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n, -#if CONFIG_LOOPFILTER_LEVEL const int dir_idx, int plane, -#endif -#if CONFIG_LPF_SB - int mi_row, int mi_col, -#endif const MB_MODE_INFO *mbmi) { -#if CONFIG_LPF_SB - return cm->mi[mi_row * cm->mi_stride + mi_col].mbmi.filt_lvl; -#endif - -#if CONFIG_SUPERTX - const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx); - assert( - IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS)); - assert(IMPLIES(supertx_enabled(mbmi), - mbmi->segment_id_supertx <= mbmi->segment_id)); -#else const int segment_id = mbmi->segment_id; -#endif // CONFIG_SUPERTX if (cm->delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL int delta_lf; 
if (cm->delta_lf_multi) { const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; - delta_lf = mbmi->curr_delta_lf[delta_lf_idx]; + delta_lf = mbmi->delta_lf[delta_lf_idx]; } else { - delta_lf = mbmi->current_delta_lf_from_base; + delta_lf = mbmi->delta_lf_from_base; } - int lvl_seg = - clamp(delta_lf + cm->lf.filter_level[dir_idx], 0, MAX_LOOP_FILTER); -#else - int lvl_seg = clamp(mbmi->current_delta_lf_from_base + cm->lf.filter_level, - 0, MAX_LOOP_FILTER); -#endif - const int scale = 1 << (lvl_seg >> 5); -#if CONFIG_LOOPFILTER_LEVEL + int base_level; + if (plane == 0) + base_level = cm->lf.filter_level[dir_idx]; + else if (plane == 1) + base_level = cm->lf.filter_level_u; + else + base_level = cm->lf.filter_level_v; + int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); assert(plane >= 0 && plane <= 2); const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); - lvl_seg = - clamp(cm->seg.abs_delta == SEGMENT_ABSDATA ? data : lvl_seg + data, 0, - MAX_LOOP_FILTER); - } -#else - if (segfeature_active(&cm->seg, segment_id, SEG_LVL_ALT_LF)) { - const int data = get_segdata(&cm->seg, segment_id, SEG_LVL_ALT_LF); - lvl_seg = - clamp(cm->seg.abs_delta == SEGMENT_ABSDATA ? 
data : lvl_seg + data, 0, - MAX_LOOP_FILTER); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); } -#endif // CONFIG_LOOPFILTER_LEVEL if (cm->lf.mode_ref_delta_enabled) { + const int scale = 1 << (lvl_seg >> 5); lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; if (mbmi->ref_frame[0] > INTRA_FRAME) lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; @@ -691,29 +285,10 @@ static uint8_t get_filter_level(const AV1_COMMON *cm, } return lvl_seg; } else { -#if CONFIG_LOOPFILTER_LEVEL - return lfi_n - ->lvl[segment_id][dir_idx][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]]; -#else - return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]]; -#endif + return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; } } -#else -static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi) { -#if CONFIG_SUPERTX - const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx); - assert( - IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS)); - assert(IMPLIES(supertx_enabled(mbmi), - mbmi->segment_id_supertx <= mbmi->segment_id)); -#else - const int segment_id = mbmi->segment_id; -#endif // CONFIG_SUPERTX - return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]]; -} -#endif void av1_loop_filter_init(AV1_COMMON *cm) { assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); @@ -721,2198 +296,1221 @@ void av1_loop_filter_init(AV1_COMMON *cm) { struct loopfilter *lf = &cm->lf; int lvl; + lf->combine_vert_horz_lf = 1; + // init limits for given sharpness update_sharpness(lfi, lf->sharpness_level); - lf->last_sharpness_level = lf->sharpness_level; // init hev threshold const vectors for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); } -#if CONFIG_LPF_SB -void av1_loop_filter_sb_level_init(AV1_COMMON *cm, int mi_row, int mi_col, - int lvl) { - const int mi_row_start = AOMMAX(0, mi_row - 
FILT_BOUNDARY_MI_OFFSET); - const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows); - const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols); - - int row, col; - for (row = mi_row_start; row < mi_row_end; ++row) { - for (col = mi_col_start; col < mi_col_end; ++col) { - // Note: can't use cm->mi_grid_visible. Because for each partition, - // all visible pointers will point to the first of the partition. - cm->mi[row * cm->mi_stride + col].mbmi.filt_lvl = lvl; - } - } -} -#endif // CONFIG_LPF_SB - -void av1_loop_filter_frame_init(AV1_COMMON *cm, int default_filt_lvl, - int default_filt_lvl_r -#if CONFIG_LOOPFILTER_LEVEL - , - int plane -#endif - ) { +// Update the loop filter for the current frame. +// This should be called before loop_filter_rows(), +// av1_loop_filter_frame() calls this function directly. +void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, + int plane_end) { + int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; + int plane; int seg_id; // n_shift is the multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 - int scale = 1 << (default_filt_lvl >> 5); loop_filter_info_n *const lfi = &cm->lf_info; struct loopfilter *const lf = &cm->lf; const struct segmentation *const seg = &cm->seg; - // update limits if sharpness has changed - if (lf->last_sharpness_level != lf->sharpness_level) { - update_sharpness(lfi, lf->sharpness_level); - lf->last_sharpness_level = lf->sharpness_level; - } + // update sharpness limits + update_sharpness(lfi, lf->sharpness_level); - for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { - for (int dir = 0; dir < 2; ++dir) { - int lvl_seg = (dir == 0) ? 
default_filt_lvl : default_filt_lvl_r; -#if CONFIG_LOOPFILTER_LEVEL - assert(plane >= 0 && plane <= 2); - const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; - if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { - const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); - lvl_seg = clamp( - seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, - 0, MAX_LOOP_FILTER); - } -#else - if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { - const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = clamp( - seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, - 0, MAX_LOOP_FILTER); - } -#endif // CONFIG_LOOPFILTER_LEVEL + filt_lvl[0] = cm->lf.filter_level[0]; + filt_lvl[1] = cm->lf.filter_level_u; + filt_lvl[2] = cm->lf.filter_level_v; - if (!lf->mode_ref_delta_enabled) { -// we could get rid of this if we assume that deltas are set to -// zero when not in use; encoder always uses deltas -#if CONFIG_LOOPFILTER_LEVEL - memset(lfi->lvl[seg_id][dir], lvl_seg, sizeof(lfi->lvl[seg_id][dir])); -#else - memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); -#endif // CONFIG_LOOPFILTER_LEVEL - } else { - int ref, mode; -#if CONFIG_LOOPFILTER_LEVEL - scale = 1 << (lvl_seg >> 5); - - const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; - lfi->lvl[seg_id][dir][INTRA_FRAME][0] = - clamp(intra_lvl, 0, MAX_LOOP_FILTER); - - for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) { - for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { - const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + - lf->mode_deltas[mode] * scale; - lfi->lvl[seg_id][dir][ref][mode] = - clamp(inter_lvl, 0, MAX_LOOP_FILTER); - } + filt_lvl_r[0] = cm->lf.filter_level[1]; + filt_lvl_r[1] = cm->lf.filter_level_u; + filt_lvl_r[2] = cm->lf.filter_level_v; + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) + break; + else if (plane == 1 && !filt_lvl[1]) + continue; + 
else if (plane == 2 && !filt_lvl[2]) + continue; + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + for (int dir = 0; dir < 2; ++dir) { + int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; + assert(plane >= 0 && plane <= 2); + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; + if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); } -#else - (void)default_filt_lvl_r; - const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; - lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); - - for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) { - for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { - const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + - lf->mode_deltas[mode] * scale; - lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[plane][seg_id][dir], lvl_seg, + sizeof(lfi->lvl[plane][seg_id][dir])); + } else { + int ref, mode; + const int scale = 1 << (lvl_seg >> 5); + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = + clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[plane][seg_id][dir][ref][mode] = + clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } } } -#endif } } } -} -static void filter_selectively_vert_row2(int subsampling_factor, uint8_t *s, - int pitch, unsigned int mask_16x16_l, - unsigned int mask_8x8_l, - unsigned int mask_4x4_l, - unsigned int mask_4x4_int_l, - const loop_filter_info_n *lfi_n, - const 
uint8_t *lfl) { - const int mask_shift = subsampling_factor ? 4 : 8; - const int mask_cutoff = subsampling_factor ? 0xf : 0xff; - const int lfl_forward = subsampling_factor ? 4 : 8; - - unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; - unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; - unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; - unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; - unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; - unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; - unsigned int mask; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | - mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; - mask; mask >>= 1) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); +#if LOOP_FILTER_BITMASK + memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64, + sizeof(TX_SIZE) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64, + sizeof(TX_SIZE) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64, + sizeof(TX_SIZE) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64, + sizeof(TX_SIZE) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.y_level_above, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.y_level_left, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.u_level_above, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.u_level_left, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.v_level_above, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.v_level_left, 0, + sizeof(uint8_t) * MI_SIZE_64X64); + memset(lf->neighbor_sb_lpf_info.skip, 0, 
sizeof(uint8_t) * MI_SIZE_64X64); +#endif // LOOP_FILTER_BITMASK +} - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - if ((mask_16x16_0 & mask_16x16_1) & 1) { - aom_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); - } else if (mask_16x16_0 & 1) { - aom_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - aom_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } +#if LOOP_FILTER_BITMASK +// A 64x64 tx block requires 256 bits to represent each 4x4 tx block. +// Every 4 rows is represented by one uint64_t mask. Hence, +// there are 4 uint64_t bitmask[4] to represent the 64x64 block. +// +// Given a location by (mi_col, mi_row), This function returns the index +// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value. +// +// For example, mi_row is the offset of pixels in mi size (4), +// (mi_row / 4) returns which uint64_t. +// After locating which uint64_t, mi_row % 4 is the +// row offset, and each row has 16 = 1 << stride_log2 4x4 units. 
+// Therefore, shift = (row << stride_log2) + mi_col; +static int get_index_shift(int mi_col, int mi_row, int *index) { + // *index = mi_row >> 2; + // rows = mi_row % 4; + // stride_log2 = 4; + // shift = (rows << stride_log2) + mi_col; + *index = mi_row >> 2; + return ((mi_row & 3) << 4) | mi_col; +} - if ((mask_8x8_0 | mask_8x8_1) & 1) { - if ((mask_8x8_0 & mask_8x8_1) & 1) { - aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_8x8_0 & 1) { - aom_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - aom_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } +static void check_mask(const FilterMask *lfm) { +#ifndef NDEBUG + for (int i = 0; i < 4; ++i) { + assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i])); + assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i])); + assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i])); + assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i])); + assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i])); + assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i])); + assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i])); + assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i])); + assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i])); + assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i])); + } +#else + (void)lfm; +#endif +} - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_0 & 1) { - aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - aom_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } +static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) { + if (plane == 0) { + // Assert if we try to apply 2 different loop filters at the same + // 
position. + check_mask(lfm->left_y); + check_mask(lfm->above_y); + } else if (plane == 1) { + check_mask(lfm->left_u); + check_mask(lfm->above_u); + } else { + check_mask(lfm->left_v); + check_mask(lfm->above_v); + } +} - if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { - if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { - aom_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_int_0 & 1) { - aom_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); - } else { - aom_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } +static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask, + TX_SIZE sqr_tx_size, LoopFilterMask *lfm) { + if (dir == VERT_EDGE) { + switch (plane) { + case 0: + for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i]; + break; + case 1: + for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i]; + break; + case 2: + for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i]; + break; + default: assert(plane <= 2); + } + } else { + switch (plane) { + case 0: + for (int i = 0; i < 4; ++i) + lfm->above_y[sqr_tx_size].bits[i] |= mask[i]; + break; + case 1: + for (int i = 0; i < 4; ++i) + lfm->above_u[sqr_tx_size].bits[i] |= mask[i]; + break; + case 2: + for (int i = 0; i < 4; ++i) + lfm->above_v[sqr_tx_size].bits[i] |= mask[i]; + break; + default: assert(plane <= 2); } - - s += 8; - lfl += 1; - mask_16x16_0 >>= 1; - mask_8x8_0 >>= 1; - mask_4x4_0 >>= 1; - mask_4x4_int_0 >>= 1; - mask_16x16_1 >>= 1; - mask_8x8_1 >>= 1; - mask_4x4_1 >>= 1; - mask_4x4_int_1 >>= 1; } } -#if CONFIG_HIGHBITDEPTH -static void highbd_filter_selectively_vert_row2( - int subsampling_factor, uint16_t *s, int pitch, unsigned int mask_16x16_l, - unsigned int mask_8x8_l, unsigned int mask_4x4_l, - unsigned int mask_4x4_int_l, const loop_filter_info_n *lfi_n, - const uint8_t *lfl, int bd) { - const int 
mask_shift = subsampling_factor ? 4 : 8; - const int mask_cutoff = subsampling_factor ? 0xf : 0xff; - const int lfl_forward = subsampling_factor ? 4 : 8; - - unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; - unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; - unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; - unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; - unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; - unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; - unsigned int mask; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | - mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; - mask; mask >>= 1) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); +static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row, + int mi_col, int ssx, int ssy, EDGE_DIR dir) { + if (plane && (ssx || ssy)) { + if (ssx && ssy) { // format 420 + if ((mi_row << MI_SIZE_LOG2) > cm->height || + (mi_col << MI_SIZE_LOG2) > cm->width) + return 1; + } else if (ssx) { // format 422 + if ((mi_row << MI_SIZE_LOG2) >= cm->height || + (mi_col << MI_SIZE_LOG2) > cm->width) + return 1; + } + } else { + if ((mi_row << MI_SIZE_LOG2) >= cm->height || + (mi_col << MI_SIZE_LOG2) >= cm->width) + return 1; + } - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - if ((mask_16x16_0 & mask_16x16_1) & 1) { - aom_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else if (mask_16x16_0 & 1) { - aom_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } + int row_or_col; + if (plane == 0) { + row_or_col 
= dir == VERT_EDGE ? mi_col : mi_row; + } else { + // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. + // So if mi_col == 1, it is actually the frame boundary. + if (dir == VERT_EDGE) { + row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col; + } else { + row_or_col = ssy ? (mi_row & 0x0FFFFFFE) : mi_row; + } + } + return row_or_col == 0; +} - if ((mask_8x8_0 | mask_8x8_1) & 1) { - if ((mask_8x8_0 & mask_8x8_1) & 1) { - aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_8x8_0 & 1) { - aom_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); +static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, + int ssx, int ssy, TX_SIZE tx_size) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int x = (mi_col << (MI_SIZE_LOG2 - ssx)); + const int y = (mi_row << (MI_SIZE_LOG2 - ssy)); + // decide whether current vertical/horizontal edge needs loop filtering + for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) { + // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. + mi_row |= ssy; + mi_col |= ssx; + + MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; + const MB_MODE_INFO *const mbmi = mi[0]; + const int curr_skip = mbmi->skip && is_inter_block(mbmi); + const BLOCK_SIZE bsize = mbmi->sb_type; + const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; + const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi); + const int prediction_masks = dir == VERT_EDGE + ? block_size_wide[plane_bsize] - 1 + : block_size_high[plane_bsize] - 1; + const int is_coding_block_border = + dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks); + + // TODO(chengchen): step can be optimized. 
+ const int row_step = mi_size_high[TX_4X4] << ssy; + const int col_step = mi_size_wide[TX_4X4] << ssx; + const int mi_height = + dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step; + const int mi_width = + dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx; + + // assign filter levels + for (int r = mi_row; r < mi_row + mi_height; r += row_step) { + for (int c = mi_col; c < mi_col + mi_width; c += col_step) { + // do not filter frame boundary + // Note: when chroma planes' size are half of luma plane, + // chroma plane mi corresponds to even position. + // If frame size is not even, we still need to filter this chroma + // position. Therefore the boundary condition check needs to be + // separated to two cases. + if (plane && (ssx || ssy)) { + if (ssx && ssy) { // format 420 + if ((r << MI_SIZE_LOG2) > cm->height || + (c << MI_SIZE_LOG2) > cm->width) + continue; + } else if (ssx) { // format 422 + if ((r << MI_SIZE_LOG2) >= cm->height || + (c << MI_SIZE_LOG2) > cm->width) + continue; + } } else { - aom_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + if ((r << MI_SIZE_LOG2) >= cm->height || + (c << MI_SIZE_LOG2) >= cm->width) + continue; } - } - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_0 & 1) { - aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); + const int row = r % MI_SIZE_64X64; + const int col = c % MI_SIZE_64X64; + if (plane == 0) { + if (dir == VERT_EDGE) + lfm->lfl_y_ver[row][col] = level; + else + lfm->lfl_y_hor[row][col] = level; + } else if (plane == 1) { + if (dir == VERT_EDGE) + lfm->lfl_u_ver[row][col] = level; + else + lfm->lfl_u_hor[row][col] = level; } else { - aom_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + if (dir == 
VERT_EDGE) + lfm->lfl_v_ver[row][col] = level; + else + lfm->lfl_v_hor[row][col] = level; } } + } - if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { - if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { - aom_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_int_0 & 1) { - aom_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + for (int r = mi_row; r < mi_row + mi_height; r += row_step) { + for (int c = mi_col; c < mi_col + mi_width; c += col_step) { + // do not filter frame boundary + if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue; + + uint64_t mask[4] = { 0 }; + const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy); + const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c; + MB_MODE_INFO **mi_prev = + cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col; + const MB_MODE_INFO *const mbmi_prev = mi_prev[0]; + const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev); + const uint8_t level_prev = + get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev); + const int is_edge = + (level || level_prev) && + (!curr_skip || !prev_skip || is_coding_block_border); + + if (is_edge) { + const TX_SIZE prev_tx_size = + plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy) + : mbmi_prev->tx_size; + const TX_SIZE min_tx_size = + (dir == VERT_EDGE) ? 
AOMMIN(txsize_horz_map[tx_size], + txsize_horz_map[prev_tx_size]) + : AOMMIN(txsize_vert_map[tx_size], + txsize_vert_map[prev_tx_size]); + assert(min_tx_size < TX_SIZES); + const int row = r % MI_SIZE_64X64; + const int col = c % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + assert(index < 4 && index >= 0); + mask[index] |= ((uint64_t)1 << shift); + // set mask on corresponding bit + update_masks(dir, plane, mask, min_tx_size, lfm); } } } - - s += 8; - lfl += 1; - mask_16x16_0 >>= 1; - mask_8x8_0 >>= 1; - mask_4x4_0 >>= 1; - mask_4x4_int_0 >>= 1; - mask_16x16_1 >>= 1; - mask_8x8_1 >>= 1; - mask_4x4_1 >>= 1; - mask_4x4_int_1 >>= 1; } } -#endif // CONFIG_HIGHBITDEPTH - -static void filter_selectively_horiz( - uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, - unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, const uint8_t *lfl -#if CONFIG_LPF_DIRECT - , - uint8_t *const src, int mi_row, int mi_col, int idx_r, int col_step, - int width, int height, int ss_x, int ss_y -#endif - ) { - unsigned int mask; - int count; -#if CONFIG_LPF_DIRECT - // scale for u, v plane - width >>= ss_x; - height >>= ss_y; - int idx_c = 0; -#endif - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; - mask >>= count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; +static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int plane, int ssx, int ssy) { + blk_row <<= ssy; + blk_col <<= ssx; + if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height || + ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width) + return; - count = 1; - if (mask & 1) { -#if CONFIG_LPF_DIRECT - int i; - const int line_length = 16; - const int pivot = 8; - const int above_filt_len = mask_16x16 & 1 ? 8 : 4; - const int bot_filt_len = mask_16x16 & 1 ? 
8 : 4; - uint8_t block[256]; // line_length * size_of(BLOCK_8X8) * two_blocks - int orig_pos[256]; - int direct; - - assert(above_filt_len == bot_filt_len); - (void)bot_filt_len; - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } + // U/V plane, tx_size is always the largest size + if (plane) { + assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32); + setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, + tx_size); + return; + } - // actual position for current pixel - const int row = (mi_row + idx_r) * MI_SIZE >> ss_y; - const int col = (mi_col + idx_c) * MI_SIZE >> ss_x; + MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; + const MB_MODE_INFO *const mbmi = mi[0]; + // For Y plane: + // If intra block, tx size is univariant. + // If inter block, tx size follows inter_tx_size. + TX_SIZE plane_tx_size = tx_size; + const int is_inter = is_inter_block(mbmi); + + if (plane == 0) { + if (is_inter) { + if (mbmi->skip) { + // TODO(chengchen): change av1_get_transform_size() to be consistant. + // plane_tx_size = get_max_rect_tx_size(plane_bsize); + plane_tx_size = mbmi->tx_size; + } else { + plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + } + } else { + MB_MODE_INFO **mi_this = cm->mi_grid_visible + + (mi_row + blk_row) * cm->mi_stride + mi_col + + blk_col; + const MB_MODE_INFO *const mbmi_this = mi_this[0]; + plane_tx_size = mbmi_this->tx_size; + } + } - // Next block's thresholds. 
- const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + assert(txsize_to_bsize[plane_tx_size] <= plane_bsize); - if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - // Could use asymmetric length in the future - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 2, 1); - - pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col, - width, height, pitch, pivot, line_length, 2, - direct); - - aom_lpf_horizontal_edge_16(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - count = 2; - } else { - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 1, 1); + if (plane || plane_tx_size == tx_size) { + setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, + tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize, + sub_txs, plane, ssx, ssy); + } + } + } +} - pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col, - width, height, pitch, pivot, line_length, 1, - direct); +static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, + int plane, int ssx, int ssy) { + MB_MODE_INFO **mi = + cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx); + const MB_MODE_INFO *const mbmi = mi[0]; + + const BLOCK_SIZE bsize = mbmi->sb_type; + const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; + + const int block_width = mi_size_wide[plane_bsize]; + const int block_height = mi_size_high[plane_bsize]; + + TX_SIZE max_txsize = 
max_txsize_rect_lookup[plane_bsize]; + // The decoder is designed so that it can process 64x64 luma pixels at a + // time. If this is a chroma plane with subsampling and bsize corresponds to + // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That + // mustn't be used for the subsampled plane (because it would be bigger than + // a 64x64 luma block) so we round down to TX_32X32. + if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) { + if (max_txsize == TX_16X64) + max_txsize = TX_16X32; + else if (max_txsize == TX_64X16) + max_txsize = TX_32X16; + else + max_txsize = TX_32X32; + } - aom_lpf_horizontal_edge_8(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize]; + const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; + const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy]; + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(block_height, mu_blocks_high); + + // Y: Largest tx_size is 64x64, while superblock size can be 128x128. + // Here we ensure that setup_tx_block_mask process at most a 64x64 block. + // U/V: largest tx size is 32x32. 
+ for (int idy = 0; idy < block_height; idy += mu_blocks_high) { + for (int idx = 0; idx < block_width; idx += mu_blocks_wide) { + const int unit_height = AOMMIN(mu_blocks_high + idy, block_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width); + for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { + setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize, + max_txsize, plane, ssx, ssy); } + } + } + } +} - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } else if (mask_8x8 & 1) { - if ((mask_8x8 & 3) == 3) { - count = 2; - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 2, 1); - - pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col, - width, height, pitch, pivot, line_length, 2, - direct); - - aom_lpf_horizontal_8_dual(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - - if ((mask_4x4_int & 3) == 3) { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 2, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width, - height, pitch, pivot, line_length, 2, - direct); +static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane, int ssx, int ssy) { + if ((mi_row << MI_SIZE_LOG2) >= cm->height || + (mi_col << MI_SIZE_LOG2) >= cm->width) + return; - aom_lpf_horizontal_4_dual(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr); + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + const int hbs = 
mi_size_wide[bsize] / 2; + const int quarter_step = mi_size_wide[bsize] / 4; + const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1; + const int has_next_row = + (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8; + const int has_next_col = + (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8; + int i; - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } else { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } + switch (partition) { + case PARTITION_NONE: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + break; + case PARTITION_HORZ: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + if (has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); + break; + case PARTITION_VERT: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + if (has_next_col) + setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); + break; + case PARTITION_SPLIT: + setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy); + if (has_next_col) + setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy); + if (has_next_row) + setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy); + if (has_next_col & has_next_row) + setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx, + ssy); + break; + case PARTITION_HORZ_A: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + if (has_next_col) + setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); + if (has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); + break; + case PARTITION_HORZ_B: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + if (has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); + if (has_next_col & has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); + break; + case PARTITION_VERT_A: + setup_fix_block_mask(cm, mi_row, mi_col, 
plane, ssx, ssy); + if (has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); + if (has_next_col) + setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); + break; + case PARTITION_VERT_B: + setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); + if (has_next_col) + setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); + if (has_next_row) + setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break; + // chroma plane filter the odd location + if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; + + setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; + // chroma plane filter the odd location + if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; + + setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy); + } + break; + default: assert(0); + } +} - if (mask_4x4_int & 1) { - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 1, 1); +// TODO(chengchen): if lossless, do not need to setup mask. But when +// segments enabled, each segment has different lossless settings. 
+void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, + int subsampling_x, int subsampling_y, int row_end, + int col_end) { + const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2; + for (int y = 0; y < num_64x64; ++y) { + for (int x = 0; x < num_64x64; ++x) { + const int row = mi_row + y * MI_SIZE_64X64; + const int col = mi_col + x * MI_SIZE_64X64; + if (row >= row_end || col >= col_end) continue; + if ((row << MI_SIZE_LOG2) >= cm->height || + (col << MI_SIZE_LOG2) >= cm->width) + continue; + + LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col); + if (lfm == NULL) return; + + // init mask to zero + if (plane == 0) { + av1_zero(lfm->left_y); + av1_zero(lfm->above_y); + av1_zero(lfm->lfl_y_ver); + av1_zero(lfm->lfl_y_hor); + } else if (plane == 1) { + av1_zero(lfm->left_u); + av1_zero(lfm->above_u); + av1_zero(lfm->lfl_u_ver); + av1_zero(lfm->lfl_u_hor); + } else { + av1_zero(lfm->left_v); + av1_zero(lfm->above_v); + av1_zero(lfm->lfl_v_ver); + av1_zero(lfm->lfl_v_hor); + } + } + } - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, - width, height, pitch, pivot, line_length, - 1, direct); + // set up bitmask for each superblock + setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane, + subsampling_x, subsampling_y); - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - } else if (mask_4x4_int & 2) { - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 1, 1); + for (int y = 0; y < num_64x64; ++y) { + for (int x = 0; x < num_64x64; ++x) { + const int row = mi_row + y * MI_SIZE_64X64; + const int col = mi_col + x * MI_SIZE_64X64; + if (row >= row_end || col >= col_end) continue; + if ((row << MI_SIZE_LOG2) >= cm->height || + (col << MI_SIZE_LOG2) >= cm->width) + continue; - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col + 8, - width, height, pitch, pivot, line_length, - 1, direct); + LoopFilterMask *lfm = 
get_loop_filter_mask(cm, row, col); + if (lfm == NULL) return; - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfin->mblim, lfin->lim, lfin->hev_thr); - } + // check if the mask is valid + check_loop_filter_masks(lfm, plane); - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; + { + // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only). + // Even tx size is greater, we only apply max length filter, which + // is 16. + if (plane == 0) { + for (int j = 0; j < 4; ++j) { + lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j]; + lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j]; + lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j]; + lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j]; + + // set 32x32 and 64x64 to 0 + lfm->left_y[TX_32X32].bits[j] = 0; + lfm->left_y[TX_64X64].bits[j] = 0; + lfm->above_y[TX_32X32].bits[j] = 0; + lfm->above_y[TX_64X64].bits[j] = 0; + } + } else if (plane == 1) { + for (int j = 0; j < 4; ++j) { + lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j]; + lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j]; + + // set 32x32 to 0 + lfm->left_u[TX_32X32].bits[j] = 0; + lfm->above_u[TX_32X32].bits[j] = 0; } } else { - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col, - width, height, pitch, pivot, line_length, 1, - direct); - - aom_lpf_horizontal_8(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; + for (int j = 0; j < 4; ++j) { + lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j]; + lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j]; - if (mask_4x4_int & 1) { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - direct = pick_min_grad_direct(src, 4, row, col, 
width, height, - pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width, - height, pitch, pivot, line_length, 1, - direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; + // set 32x32 to 0 + lfm->left_v[TX_32X32].bits[j] = 0; + lfm->above_v[TX_32X32].bits[j] = 0; } } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & 3) == 3) { - count = 2; - direct = pick_min_grad_direct(src, 4, row, col, width, height, pitch, - 2, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row, col, width, - height, pitch, pivot, line_length, 2, direct); - - aom_lpf_horizontal_4_dual(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - - if ((mask_4x4_int & 3) == 3) { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } + } - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 2, 1); + // check if the mask is valid + check_loop_filter_masks(lfm, plane); + } + } +} - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width, - height, pitch, pivot, line_length, 2, - direct); +static void filter_selectively_vert_row2( + int subsampling_factor, uint8_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - aom_lpf_horizontal_4_dual(block + pivot * line_length, 
line_length, - lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr); + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14; - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + // TODO(any): add aom_lpf_vertical_6_dual for chroma plane. + aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - if (mask_4x4_int & 1) { - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, - width, height, pitch, pivot, line_length, - 1, direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - } else if (mask_4x4_int & 2) { - direct = pick_min_grad_direct(src, 4, row, col, width, height, - pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col + 8, - width, height, pitch, pivot, line_length, - 1, direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfin->mblim, lfin->lim, lfin->hev_thr); - } - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; + // TODO(any): add dual function simd function. Current sse2 code + // just called aom_lpf_vertical_14_sse2 twice. 
+ aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } + } else if (mask_16x16_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col, - width, height, pitch, pivot, line_length, 1, - direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - - if (mask_4x4_int & 1) { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - direct = pick_min_grad_direct(src, above_filt_len, row, col, width, - height, pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width, - height, pitch, pivot, line_length, 1, - direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } - } else if (mask_4x4_int & 1) { - direct = - pick_min_grad_direct(src, 4, row, col, width, height, pitch, 1, 1); - - pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width, - height, pitch, pivot, line_length, 1, direct); - - aom_lpf_horizontal_4(block + pivot * line_length, line_length, - lfi->mblim, lfi->lim, lfi->hev_thr); - - for (i = 0; i < 256; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; } -#else // CONFIG_LPF_DIRECT - if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - aom_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - count = 2; - } else { - aom_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - } - } else if (mask_8x8 & 1) { - if ((mask_8x8 & 3) == 3) { - // Next 
block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); + if ((mask_8x8_0 | mask_8x8_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; - if ((mask_4x4_int & 3) == 3) { - aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr); + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { - if (mask_4x4_int & 1) - aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - else if (mask_4x4_int & 2) - aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr); + aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } - count = 2; + } else if (mask_8x8_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { - aom_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - - if (mask_4x4_int & 1) - aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & 3) == 3) { - // Next block's thresholds. 
- const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - - aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); + } - if ((mask_4x4_int & 3) == 3) { - aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr); - } else { - if (mask_4x4_int & 1) - aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - else if (mask_4x4_int & 2) - aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr); - } - count = 2; + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { - aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - - if (mask_4x4_int & 1) - aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } - } else if (mask_4x4_int & 1) { - aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); } -#endif // CONFIG_LPF_DIRECT } -#if CONFIG_LPF_DIRECT - idx_c += col_step * count; -#endif - s += 8 * count; - lfl += count; - mask_16x16 >>= count; - mask_8x8 >>= count; - mask_4x4 >>= count; - mask_4x4_int >>= count; + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; } } -#if CONFIG_HIGHBITDEPTH -static void highbd_filter_selectively_horiz( - uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, - unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) { - unsigned int mask; - int 
count; - - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; - mask >>= count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - count = 1; if (mask & 1) { - if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - aom_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - count = 2; - } else { - aom_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - } - } else if (mask_8x8 & 1) { - if ((mask_8x8 & 3) == 3) { - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - - aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + HbdLpfFunc highbd_lpf_vertical = + plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; - if ((mask_4x4_int & 3) == 3) { - aom_highbd_lpf_horizontal_4_dual( - s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr, bd); + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { - if (mask_4x4_int & 1) { - aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, bd); - } else if (mask_4x4_int & 2) { - aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } + aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } - count = 2; + } else if (mask_16x16_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); } else { - aom_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - - if (mask_4x4_int & 1) { - aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, bd); - } + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & 3) == 3) { - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + } - aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); - if ((mask_4x4_int & 3) == 3) { - aom_highbd_lpf_horizontal_4_dual( - s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - lfin->mblim, lfin->lim, lfin->hev_thr, bd); + if ((mask_8x8_0 | mask_8x8_1) & 1) { + HbdLpfFunc highbd_lpf_vertical = + plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { - if (mask_4x4_int & 1) { - aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, bd); - } else if (mask_4x4_int & 2) { - aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } + aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } - count = 2; + } else if (mask_8x8_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); } else { - aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - - if (mask_4x4_int & 1) { - aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, bd); - } + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); } - } else if (mask_4x4_int & 1) { - aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); } - } - s += 8 * count; - lfl += count; - mask_16x16 >>= count; - mask_8x8 >>= count; - mask_4x4 >>= count; - mask_4x4_int >>= count; - } -} -#endif // CONFIG_HIGHBITDEPTH - -// This function ors into the current lfm structure, where to do loop -// filters for the specific mi we are looking at. It uses information -// including the block_size_type (32x16, 32x32, etc.), the transform size, -// whether there were any coefficients encoded, and the loop filter strength -// block we are currently looking at. Shift is used to position the -// 1's we produce. -// TODO(JBB) Need another function for different resolution color.. 
-static void build_masks(AV1_COMMON *const cm, - const loop_filter_info_n *const lfi_n, - const MODE_INFO *mi, const int shift_y, - const int shift_uv, LOOP_FILTER_MASK *lfm) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - const BLOCK_SIZE block_size = mbmi->sb_type; - // TODO(debargha): Check if masks can be setup correctly when - // rectangular transfroms are used with the EXT_TX expt. - const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size]; - const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size]; - const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size]; - const TX_SIZE tx_size_uv = - txsize_sqr_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]]; - const TX_SIZE tx_size_uv_left = - txsize_horz_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]]; - const TX_SIZE tx_size_uv_above = - txsize_vert_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]]; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi); -#else -#if CONFIG_LPF_SB - const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi); -#else - const int filter_level = get_filter_level(cm, lfi_n, mbmi); -#endif // CONFIG_LPF_SB -#endif -#else - const int filter_level = get_filter_level(lfi_n, mbmi); - (void)cm; -#endif - uint64_t *const left_y = &lfm->left_y[tx_size_y_left]; - uint64_t *const above_y = &lfm->above_y[tx_size_y_above]; - uint64_t *const int_4x4_y = &lfm->int_4x4_y; - uint16_t *const left_uv = &lfm->left_uv[tx_size_uv_left]; - uint16_t *const above_uv = &lfm->above_uv[tx_size_uv_above]; - uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv; - int i; - - // If filter level is 0 we don't loop filter. 
- if (!filter_level) { - return; - } else { - const int w = num_8x8_blocks_wide_lookup[block_size]; - const int h = num_8x8_blocks_high_lookup[block_size]; - const int row = (shift_y >> MAX_MIB_SIZE_LOG2); - const int col = shift_y - (row << MAX_MIB_SIZE_LOG2); - for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w); - } - - // These set 1 in the current block size for the block size edges. - // For instance if the block size is 32x16, we'll set: - // above = 1111 - // 0000 - // and - // left = 1000 - // = 1000 - // NOTE : In this example the low bit is left most ( 1000 ) is stored as - // 1, not 8... - // - // U and V set things on a 16 bit scale. - // - *above_y |= above_prediction_mask[block_size] << shift_y; - *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; - *left_y |= left_prediction_mask[block_size] << shift_y; - *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; - - // If the block has no coefficients and is not intra we skip applying - // the loop filter on block edges. - if (mbmi->skip && is_inter_block(mbmi)) return; - - // Here we are adding a mask for the transform size. The transform - // size mask is set to be correct for a 64x64 prediction block size. We - // mask to match the size of the block we are working on and then shift it - // into place.. - *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above]) - << shift_y; - *above_uv |= - (size_mask_uv[block_size] & above_64x64_txform_mask_uv[tx_size_uv_above]) - << shift_uv; - - *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left]) - << shift_y; - *left_uv |= - (size_mask_uv[block_size] & left_64x64_txform_mask_uv[tx_size_uv_left]) - << shift_uv; - - // Here we are trying to determine what to do with the internal 4x4 block - // boundaries. These differ from the 4x4 boundaries on the outside edge of - // an 8x8 in that the internal ones can be skipped and don't depend on - // the prediction block size. 
- if (tx_size_y == TX_4X4) - *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y; - - if (tx_size_uv == TX_4X4) - *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; -} - -// This function does the same thing as the one above with the exception that -// it only affects the y masks. It exists because for blocks < 16x16 in size, -// we only update u and v masks on the first block. -static void build_y_mask(AV1_COMMON *const cm, - const loop_filter_info_n *const lfi_n, - const MODE_INFO *mi, const int shift_y, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif // CONFIG_SUPERTX - LOOP_FILTER_MASK *lfm) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size]; - const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size]; - const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size]; -#if CONFIG_SUPERTX - const BLOCK_SIZE block_size = - supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type; -#else - const BLOCK_SIZE block_size = mbmi->sb_type; -#endif -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi); -#else -#if CONFIG_LPF_SB - const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi); -#else - const int filter_level = get_filter_level(cm, lfi_n, mbmi); -#endif // CONFIG_LPF_SB -#endif -#else - const int filter_level = get_filter_level(lfi_n, mbmi); - (void)cm; -#endif - uint64_t *const left_y = &lfm->left_y[tx_size_y_left]; - uint64_t *const above_y = &lfm->above_y[tx_size_y_above]; - uint64_t *const int_4x4_y = &lfm->int_4x4_y; - int i; - - if (!filter_level) { - return; - } else { - const int w = num_8x8_blocks_wide_lookup[block_size]; - const int h = num_8x8_blocks_high_lookup[block_size]; - const int row = (shift_y >> MAX_MIB_SIZE_LOG2); - const int col = shift_y - (row << MAX_MIB_SIZE_LOG2); - - for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w); - } - - *above_y |= 
above_prediction_mask[block_size] << shift_y; - *left_y |= left_prediction_mask[block_size] << shift_y; - - if (mbmi->skip && is_inter_block(mbmi)) return; - - *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above]) - << shift_y; - - *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left]) - << shift_y; - - if (tx_size_y == TX_4X4) - *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y; -} - -#if CONFIG_LOOPFILTERING_ACROSS_TILES -// This function update the bit masks for the entire 64x64 region represented -// by mi_row, mi_col. In case one of the edge is a tile boundary, loop filtering -// for that edge is disabled. This function only check the tile boundary info -// for the top left corner mi to determine the boundary information for the -// top and left edge of the whole super block -static void update_tile_boundary_filter_mask(AV1_COMMON *const cm, - const int mi_row, const int mi_col, - LOOP_FILTER_MASK *lfm) { - int i; - MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride + mi_col; - - if (mi->mbmi.boundary_info & TILE_LEFT_BOUNDARY) { - for (i = 0; i <= TX_32X32; i++) { - lfm->left_y[i] &= 0xfefefefefefefefeULL; - lfm->left_uv[i] &= 0xeeee; - } - } - - if (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY) { - for (i = 0; i <= TX_32X32; i++) { - lfm->above_y[i] &= 0xffffffffffffff00ULL; - lfm->above_uv[i] &= 0xfff0; - } - } -} -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - -// This function sets up the bit masks for the entire 64x64 region represented -// by mi_row, mi_col. -// TODO(JBB): This function only works for yv12. 
-void av1_setup_mask(AV1_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, - LOOP_FILTER_MASK *lfm) { -#if CONFIG_EXT_PARTITION - assert(0 && "Not yet updated"); -#endif // CONFIG_EXT_PARTITION - int idx_32, idx_16, idx_8; - const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; - - // These are offsets to the next mi in the 64x64 block. It is what gets - // added to the mi ptr as we go through each loop. It helps us to avoid - // setting up special row and column counters for each index. The last step - // brings us out back to the starting position. - const int offset_32[] = { 4, (mode_info_stride << 2) - 4, 4, - -(mode_info_stride << 2) - 4 }; - const int offset_16[] = { 2, (mode_info_stride << 1) - 2, 2, - -(mode_info_stride << 1) - 2 }; - const int offset[] = { 1, mode_info_stride - 1, 1, -mode_info_stride - 1 }; - - // Following variables represent shifts to position the current block - // mask over the appropriate block. A shift of 36 to the left will move - // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left - // 4 rows to the appropriate spot. - const int shift_32_y[] = { 0, 4, 32, 36 }; - const int shift_16_y[] = { 0, 2, 16, 18 }; - const int shift_8_y[] = { 0, 1, 8, 9 }; - const int shift_32_uv[] = { 0, 2, 8, 10 }; - const int shift_16_uv[] = { 0, 1, 4, 5 }; - int i; - const int max_rows = AOMMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE); - const int max_cols = AOMMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE); - - av1_zero(*lfm); - assert(mip[0] != NULL); - - // TODO(jimbankoski): Try moving most of the following code into decode - // loop and storing lfm in the mbmi structure so that we don't have to go - // through the recursive loop structure multiple times. 
- switch (mip[0]->mbmi.sb_type) { - case BLOCK_64X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm); break; - case BLOCK_64X32: build_masks(cm, lfi_n, mip[0], 0, 0, lfm); -#if CONFIG_SUPERTX && CONFIG_TX64X64 - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif // CONFIG_SUPERTX && CONFIG_TX64X64 - mip2 = mip + mode_info_stride * 4; - if (4 >= max_rows) break; - build_masks(cm, lfi_n, mip2[0], 32, 8, lfm); - break; - case BLOCK_32X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm); -#if CONFIG_SUPERTX && CONFIG_TX64X64 - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif // CONFIG_SUPERTX && CONFIG_TX64X64 - mip2 = mip + 4; - if (4 >= max_cols) break; - build_masks(cm, lfi_n, mip2[0], 4, 2, lfm); - break; - default: -#if CONFIG_SUPERTX && CONFIG_TX64X64 - if (mip[0]->mbmi.tx_size == TX_64X64) { - build_masks(cm, lfi_n, mip[0], 0, 0, lfm); - } else { -#endif // CONFIG_SUPERTX && CONFIG_TX64X64 - for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y_32 = shift_32_y[idx_32]; - const int shift_uv_32 = shift_32_uv[idx_32]; - const int mi_32_col_offset = ((idx_32 & 1) << 2); - const int mi_32_row_offset = ((idx_32 >> 1) << 2); - if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) - continue; - switch (mip[0]->mbmi.sb_type) { - case BLOCK_32X32: - build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); - break; - case BLOCK_32X16: - build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); -#if CONFIG_SUPERTX - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif - if (mi_32_row_offset + 2 >= max_rows) continue; - mip2 = mip + mode_info_stride * 2; - build_masks(cm, lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, - lfm); - break; - case BLOCK_16X32: - build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); -#if CONFIG_SUPERTX - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif - if (mi_32_col_offset + 2 >= max_cols) continue; - mip2 = mip + 2; - build_masks(cm, lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, - lfm); 
- break; - default: -#if CONFIG_SUPERTX - if (mip[0]->mbmi.tx_size == TX_32X32) { - build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); - break; - } -#endif - for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { - const int shift_y_32_16 = shift_y_32 + shift_16_y[idx_16]; - const int shift_uv_32_16 = shift_uv_32 + shift_16_uv[idx_16]; - const int mi_16_col_offset = - mi_32_col_offset + ((idx_16 & 1) << 1); - const int mi_16_row_offset = - mi_32_row_offset + ((idx_16 >> 1) << 1); - - if (mi_16_col_offset >= max_cols || - mi_16_row_offset >= max_rows) - continue; - - switch (mip[0]->mbmi.sb_type) { - case BLOCK_16X16: - build_masks(cm, lfi_n, mip[0], shift_y_32_16, - shift_uv_32_16, lfm); - break; - case BLOCK_16X8: -#if CONFIG_SUPERTX - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif - build_masks(cm, lfi_n, mip[0], shift_y_32_16, - shift_uv_32_16, lfm); - if (mi_16_row_offset + 1 >= max_rows) continue; - mip2 = mip + mode_info_stride; - build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 8, -#if CONFIG_SUPERTX - 0, -#endif - lfm); - break; - case BLOCK_8X16: -#if CONFIG_SUPERTX - if (supertx_enabled(&mip[0]->mbmi)) break; -#endif - build_masks(cm, lfi_n, mip[0], shift_y_32_16, - shift_uv_32_16, lfm); - if (mi_16_col_offset + 1 >= max_cols) continue; - mip2 = mip + 1; - build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 1, -#if CONFIG_SUPERTX - 0, -#endif - lfm); - break; - default: { - const int shift_y_32_16_8_zero = - shift_y_32_16 + shift_8_y[0]; -#if CONFIG_SUPERTX - if (mip[0]->mbmi.tx_size == TX_16X16) { - build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero, - shift_uv_32_16, lfm); - break; - } -#endif - build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero, - shift_uv_32_16, lfm); - mip += offset[0]; - for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y_32_16_8 = - shift_y_32_16 + shift_8_y[idx_8]; - const int mi_8_col_offset = - mi_16_col_offset + ((idx_8 & 1)); - const int mi_8_row_offset = - mi_16_row_offset 
+ ((idx_8 >> 1)); - - if (mi_8_col_offset >= max_cols || - mi_8_row_offset >= max_rows) - continue; - build_y_mask(cm, lfi_n, mip[0], shift_y_32_16_8, -#if CONFIG_SUPERTX - supertx_enabled(&mip[0]->mbmi), -#endif - lfm); - } - break; - } - } - } - break; - } + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } else if (mask_4x4_0 & 1) { + aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } -#if CONFIG_SUPERTX && CONFIG_TX64X64 - } -#endif // CONFIG_SUPERTX && CONFIG_TX64X64 - break; - } - // The largest loopfilter we have is 16x16 so we use the 16x16 mask - // for 32x32 transforms also. - lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; - lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; - lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; - lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; - - // We do at least 8 tap filter on every 32x32 even if the transform size - // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and - // remove it from the 4x4. - lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; - lfm->left_y[TX_4X4] &= ~left_border; - lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border; - lfm->above_y[TX_4X4] &= ~above_border; - lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv; - lfm->left_uv[TX_4X4] &= ~left_border_uv; - lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv; - lfm->above_uv[TX_4X4] &= ~above_border_uv; - - // We do some special edge handling. 
- if (mi_row + MAX_MIB_SIZE > cm->mi_rows) { - const uint64_t rows = cm->mi_rows - mi_row; - - // Each pixel inside the border gets a 1, - const uint64_t mask_y = (((uint64_t)1 << (rows << MAX_MIB_SIZE_LOG2)) - 1); - const uint16_t mask_uv = - (((uint16_t)1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1); - - // Remove values completely outside our border. - for (i = 0; i < TX_32X32; i++) { - lfm->left_y[i] &= mask_y; - lfm->above_y[i] &= mask_y; - lfm->left_uv[i] &= mask_uv; - lfm->above_uv[i] &= mask_uv; - } - lfm->int_4x4_y &= mask_y; - lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv; - - // We don't apply a wide loop filter on the last uv block row. If set - // apply the shorter one instead. - if (rows == 1) { - lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; - lfm->above_uv[TX_16X16] = 0; - } - if (rows == 5) { - lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00; - lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00); - } - } else { - lfm->above_int_4x4_uv = lfm->left_int_4x4_uv; - } - - if (mi_col + MAX_MIB_SIZE > cm->mi_cols) { - const uint64_t columns = cm->mi_cols - mi_col; - - // Each pixel inside the border gets a 1, the multiply copies the border - // to where we need it. - const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL; - const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; - - // Internal edges are not applied on the last column of the image so - // we mask 1 more for the internal edges - const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111; - - // Remove the bits outside the image edge. - for (i = 0; i < TX_32X32; i++) { - lfm->left_y[i] &= mask_y; - lfm->above_y[i] &= mask_y; - lfm->left_uv[i] &= mask_uv; - lfm->above_uv[i] &= mask_uv; - } - lfm->int_4x4_y &= mask_y; - lfm->left_int_4x4_uv &= mask_uv_int; - - // We don't apply a wide loop filter on the last uv column. If set - // apply the shorter one instead. 
- if (columns == 1) { - lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; - lfm->left_uv[TX_16X16] = 0; - } - if (columns == 5) { - lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc); - lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); - } - } - // We don't apply a loop filter on the first column in the image, mask that - // out. - if (mi_col == 0) { - for (i = 0; i < TX_32X32; i++) { - lfm->left_y[i] &= 0xfefefefefefefefeULL; - lfm->left_uv[i] &= 0xeeee; - } - } - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - if (av1_disable_loopfilter_on_tile_boundary(cm)) { - update_tile_boundary_filter_mask(cm, mi_row, mi_col, lfm); - } -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - - // Assert if we try to apply 2 different loop filters at the same position. - assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8])); - assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4])); - assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4])); - assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16])); - assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8])); - assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4])); - assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4])); - assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16])); - assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8])); - assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4])); - assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4])); - assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16])); - assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8])); - assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4])); - assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4])); - assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16])); -} - -static void filter_selectively_vert( - uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, - unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, const uint8_t *lfl -#if CONFIG_LPF_DIRECT - , - uint8_t *const src, int mi_row, int 
mi_col, int idx_r, int col_step, - int width, int height, int ss_x, int ss_y -#endif - ) { - unsigned int mask; -#if CONFIG_LPF_DIRECT - // scale for u, v plane - width >>= ss_x; - height >>= ss_y; - int idx_c = 0; -#endif - - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; - mask >>= 1) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - -#if CONFIG_LPF_DIRECT - int i; - const int pivot = 8; - const int left_filt_len = mask_16x16 & 1 ? 8 : 4; - const int right_filt_len = mask_16x16 & 1 ? 8 : 4; - const int line_length = 16; - uint8_t block[128]; - int orig_pos[128]; - - // actual position for current pixel - const int row = (mi_row + idx_r) * MI_SIZE >> ss_y; - const int col = (mi_col + idx_c) * MI_SIZE >> ss_x; - - // Could use asymmetric length in the future - assert(left_filt_len == right_filt_len); - (void)right_filt_len; - - if ((mask_16x16 & 1) || (mask_8x8 & 1) || (mask_4x4 & 1)) { - for (i = 0; i < 128; ++i) { - block[i] = 0; - orig_pos[i] = -1; } - - const int direct = pick_min_grad_direct(src, left_filt_len, row, col, - width, height, pitch, 1, 0); - - pick_filter_block_vert(src, block, orig_pos, left_filt_len, row, col, - width, height, pitch, pivot, line_length, 1, - direct); - - // apply filtering - if (mask_16x16 & 1) { - aom_lpf_vertical_16(block + pivot, line_length, lfi->mblim, lfi->lim, - lfi->hev_thr); - } else if (mask_8x8 & 1) { - aom_lpf_vertical_8(block + pivot, line_length, lfi->mblim, lfi->lim, - lfi->hev_thr); - } else if (mask_4x4 & 1) { - aom_lpf_vertical_4(block + pivot, line_length, lfi->mblim, lfi->lim, - lfi->hev_thr); - } - - for (i = 0; i < 128; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; } - // filter inner 4x4 - if (mask_4x4_int & 1) { - for (i = 0; i < 128; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - const int direct = pick_min_grad_direct(src, 4, row, col + 4, width, - height, pitch, 1, 0); - - pick_filter_block_vert(src, block, orig_pos, 4, row, col + 4, width, - height, pitch, 
pivot, line_length, 1, direct); - - aom_lpf_vertical_4(block + pivot, line_length, lfi->mblim, lfi->lim, - lfi->hev_thr); - - for (i = 0; i < 128; ++i) - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } -#else - if (mask & 1) { - if (mask_16x16 & 1) { - aom_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } else if (mask_8x8 & 1) { - aom_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } else if (mask_4x4 & 1) { - aom_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } - if (mask_4x4_int & 1) - aom_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); -#endif // CONFIG_LPF_DIRECT -#if CONFIG_LPF_DIRECT - idx_c += col_step; -#endif - s += 8; - lfl += 1; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; } } -#if CONFIG_HIGHBITDEPTH -static void highbd_filter_selectively_vert( - uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, - unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) { - unsigned int mask; +static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, + int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; - mask >>= 1) { + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds. 
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); + count = 1; if (mask & 1) { if (mask_16x16 & 1) { - aom_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } else if (mask_8x8 & 1) { - aom_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } else if (mask_4x4 & 1) { - aom_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } - } - if (mask_4x4_int & 1) - aom_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - s += 8; - lfl += 1; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; - } -} -#endif // CONFIG_HIGHBITDEPTH - -typedef struct { - unsigned int m16x16; - unsigned int m8x8; - unsigned int m4x4; -} FilterMasks; - -// Get filter level and masks for the given row index 'idx_r'. (Only used for -// the non420 case). -// Note: 'row_masks_ptr' and/or 'col_masks_ptr' can be passed NULL. -static void get_filter_level_and_masks_non420( - AV1_COMMON *const cm, const struct macroblockd_plane *const plane, int pl, - MODE_INFO **mib, int mi_row, int mi_col, int idx_r, uint8_t *const lfl_r, - unsigned int *const mask_4x4_int_r_ptr, - unsigned int *const mask_4x4_int_c_ptr, FilterMasks *const row_masks_ptr, - FilterMasks *const col_masks_ptr) { - const int ss_x = plane->subsampling_x; - const int ss_y = plane->subsampling_y; - const int col_step = mi_size_wide[BLOCK_8X8] << ss_x; - FilterMasks row_masks, col_masks; - memset(&row_masks, 0, sizeof(row_masks)); - memset(&col_masks, 0, sizeof(col_masks)); - unsigned int mask_4x4_int_r = 0, mask_4x4_int_c = 0; - const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8]; - - // Determine the vertical edges that need filtering - int idx_c; - for (idx_c = 0; idx_c < cm->mib_size && mi_col + idx_c < cm->mi_cols; - idx_c += col_step) { - const MODE_INFO *mi = mib[idx_r * cm->mi_stride + idx_c]; - const MB_MODE_INFO *mbmi = &mi[0].mbmi; - const BLOCK_SIZE sb_type = mbmi->sb_type; - const 
int skip_this = mbmi->skip && is_inter_block(mbmi); - // Map index to 8x8 unit - const int c = idx_c >> mi_width_log2_lookup[BLOCK_8X8]; - - const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1); - const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1); - - // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = - (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1; - const int skip_this_c = skip_this && !block_edge_left; - // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = - (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1; - const int skip_this_r = skip_this && !block_edge_above; - - TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? av1_get_uv_tx_size(mbmi, plane) - : mbmi->tx_size; - - const int skip_border_4x4_c = - ss_x && mi_col + idx_c >= cm->mi_cols - mi_size_wide[BLOCK_8X8]; - const int skip_border_4x4_r = - ss_y && mi_row + idx_r >= cm->mi_rows - mi_size_high[BLOCK_8X8]; - - int tx_size_mask = 0; - const int c_step = (c >> ss_x); - const int r_step = (r >> ss_y); - const int col_mask = 1 << c_step; - -#if CONFIG_VAR_TX - if (is_inter_block(mbmi) && !mbmi->skip) { - const int tx_row_idx = - (blk_row * mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2) >> 1; - const int tx_col_idx = - (blk_col * mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2) >> 1; -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, plane)); -#else - const BLOCK_SIZE bsize = get_plane_block_size(mbmi->sb_type, plane); -#endif - const TX_SIZE mb_tx_size = mbmi->inter_tx_size[tx_row_idx][tx_col_idx]; - tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? 
uv_txsize_lookup[bsize][mb_tx_size][0][0] - : mb_tx_size; - } -#endif - -// Filter level can vary per MI -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - if (!(lfl_r[c_step] = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi))) - continue; -#else -#if CONFIG_LPF_SB - if (!(lfl_r[c_step] = - get_filter_level(cm, &cm->lf_info, mi_row, mi_col, mbmi))) - continue; -#else - if (!(lfl_r[c_step] = get_filter_level(cm, &cm->lf_info, mbmi))) continue; -#endif // CONFIG_LPF_SB -#endif -#else - if (!(lfl_r[c_step] = get_filter_level(&cm->lf_info, mbmi))) continue; -#endif - -#if CONFIG_VAR_TX - TX_SIZE tx_size_horz_edge, tx_size_vert_edge; - - // filt_len_vert_edge is the length of deblocking filter for a vertical edge - // The filter direction of a vertical edge is horizontal. - // Thus, filt_len_vert_edge is determined as the minimum width of the two - // transform block sizes on the left and right (current block) side of edge - const int filt_len_vert_edge = AOMMIN( - tx_size_wide[tx_size], - tx_size_wide[cm->left_txfm_context[pl][((mi_row + idx_r) & MAX_MIB_MASK) - << TX_UNIT_HIGH_LOG2]]); - - // filt_len_horz_edge is the len of deblocking filter for a horizontal edge - // The filter direction of a horizontal edge is vertical. 
- // Thus, filt_len_horz_edge is determined as the minimum height of the two - // transform block sizes on the top and bottom (current block) side of edge - const int filt_len_horz_edge = - AOMMIN(tx_size_high[tx_size], - tx_size_high[cm->top_txfm_context[pl][(mi_col + idx_c) - << TX_UNIT_WIDE_LOG2]]); - - // transform width/height of current block - const int tx_wide_cur = tx_size_wide[tx_size]; - const int tx_high_cur = tx_size_high[tx_size]; - - // tx_size_vert_edge is square transform size for a vertical deblocking edge - // It determines the type of filter applied to the vertical edge - // Similarly, tx_size_horz_edge is for a horizontal deblocking edge - tx_size_vert_edge = get_sqr_tx_size(filt_len_vert_edge); - tx_size_horz_edge = get_sqr_tx_size(filt_len_horz_edge); - - memset(cm->top_txfm_context[pl] + ((mi_col + idx_c) << TX_UNIT_WIDE_LOG2), - tx_size, mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2); - memset(cm->left_txfm_context[pl] + - (((mi_row + idx_r) & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2), - tx_size, mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2); -#else - // The length (or equally the square tx size) of deblocking filter is only - // determined by - // a) current block's width for a vertical deblocking edge - // b) current block's height for a horizontal deblocking edge - TX_SIZE tx_size_vert_edge = txsize_horz_map[tx_size]; - TX_SIZE tx_size_horz_edge = txsize_vert_map[tx_size]; - (void)pl; -#endif // CONFIG_VAR_TX - - if (tx_size_vert_edge == TX_32X32) - tx_size_mask = 3; - else if (tx_size_vert_edge == TX_16X16) - tx_size_mask = 1; - else - tx_size_mask = 0; - - // Build masks based on the transform size of each block - // handle vertical mask - if (tx_size_vert_edge == TX_32X32) { - if (!skip_this_c && (c_step & tx_size_mask) == 0) { - if (!skip_border_4x4_c) - col_masks.m16x16 |= col_mask; - else - col_masks.m8x8 |= col_mask; - } - } else if (tx_size_vert_edge == TX_16X16) { - if (!skip_this_c && (c_step & tx_size_mask) == 0) { - if 
(!skip_border_4x4_c) - col_masks.m16x16 |= col_mask; - else - col_masks.m8x8 |= col_mask; - } - } else { - // force 8x8 filtering on 32x32 boundaries - if (!skip_this_c && (c_step & tx_size_mask) == 0) { - if (tx_size_vert_edge == TX_8X8 || (c_step & 3) == 0) - col_masks.m8x8 |= col_mask; - else - col_masks.m4x4 |= col_mask; - } - -#if CONFIG_VAR_TX - if (!skip_this && tx_wide_cur < 8 && !skip_border_4x4_c && - (c_step & tx_size_mask) == 0) -#else - if (!skip_this && tx_size_vert_edge < TX_8X8 && !skip_border_4x4_c && - (c_step & tx_size_mask) == 0) -#endif // CONFIG_VAR_TX - mask_4x4_int_c |= col_mask; - } - - if (tx_size_horz_edge == TX_32X32) - tx_size_mask = 3; - else if (tx_size_horz_edge == TX_16X16) - tx_size_mask = 1; - else - tx_size_mask = 0; - - // set horizontal mask - if (tx_size_horz_edge == TX_32X32) { - if (!skip_this_r && (r_step & tx_size_mask) == 0) { - if (!skip_border_4x4_r) - row_masks.m16x16 |= col_mask; - else - row_masks.m8x8 |= col_mask; - } - } else if (tx_size_horz_edge == TX_16X16) { - if (!skip_this_r && (r_step & tx_size_mask) == 0) { - if (!skip_border_4x4_r) - row_masks.m16x16 |= col_mask; - else - row_masks.m8x8 |= col_mask; - } - } else { - // force 8x8 filtering on 32x32 boundaries - if (!skip_this_r && (r_step & tx_size_mask) == 0) { - if (tx_size_horz_edge == TX_8X8 || (r_step & 3) == 0) - row_masks.m8x8 |= col_mask; - else - row_masks.m4x4 |= col_mask; - } + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + /* + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + */ -#if CONFIG_VAR_TX - if (!skip_this && tx_high_cur < 8 && !skip_border_4x4_r && - (r_step & tx_size_mask) == 0) -#else - if (!skip_this && tx_size_horz_edge < TX_8X8 && !skip_border_4x4_r && - (r_step & tx_size_mask) == 0) -#endif // CONFIG_VAR_TX - mask_4x4_int_r |= col_mask; - } - } + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; - if (row_masks_ptr) *row_masks_ptr = row_masks; - if (col_masks_ptr) *col_masks_ptr = col_masks; - if (mask_4x4_int_c_ptr) *mask_4x4_int_c_ptr = mask_4x4_int_c; - if (mask_4x4_int_r_ptr) *mask_4x4_int_r_ptr = mask_4x4_int_r; -} + if ((mask_8x8 & two_block_mask) == two_block_mask) { + /* + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + */ -void av1_filter_block_plane_non420_ver(AV1_COMMON *const cm, - struct macroblockd_plane *plane, - MODE_INFO **mib, int mi_row, int mi_col, - int pl) { - const int ss_y = plane->subsampling_y; - const int row_step = mi_size_high[BLOCK_8X8] << ss_y; -#if CONFIG_LPF_DIRECT - const int ss_x = plane->subsampling_x; - const int col_step = mi_size_wide[BLOCK_8X8] << ss_x; -#endif - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; - uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } }; - - int idx_r; - for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows; - idx_r += row_step) { - unsigned int mask_4x4_int; - FilterMasks col_masks; - const int r = 
idx_r >> mi_height_log2_lookup[BLOCK_8X8]; - get_filter_level_and_masks_non420(cm, plane, pl, mib, mi_row, mi_col, idx_r, - &lfl[r][0], NULL, &mask_4x4_int, NULL, - &col_masks); - - // Disable filtering on the leftmost column or tile boundary - unsigned int border_mask = ~(mi_col == 0 ? 1 : 0); -#if CONFIG_LOOPFILTERING_ACROSS_TILES - MODE_INFO *const mi = cm->mi + (mi_row + idx_r) * cm->mi_stride + mi_col; - if (av1_disable_loopfilter_on_tile_boundary(cm) && - ((mi->mbmi.boundary_info & TILE_LEFT_BOUNDARY) != 0)) { - border_mask = 0xfffffffe; + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + /* + aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + */ + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, + lfin->hev_thr); + count = 2; + } else { + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } } -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - highbd_filter_selectively_vert( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, - col_masks.m16x16 & border_mask, col_masks.m8x8 & border_mask, - col_masks.m4x4 & border_mask, mask_4x4_int, &cm->lf_info, &lfl[r][0], - (int)cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - filter_selectively_vert( - dst->buf, dst->stride, col_masks.m16x16 & border_mask, - col_masks.m8x8 & border_mask, col_masks.m4x4 & border_mask, - mask_4x4_int, &cm->lf_info, &lfl[r][0] -#if CONFIG_LPF_DIRECT - , - dst->buf0, mi_row, mi_col, idx_r, col_step, cm->width, cm->height, - ss_x, ss_y -#endif // CONFIG_LPF_DIRECT - ); - dst->buf += 8 * dst->stride; - } - - // Now do 
horizontal pass - dst->buf = dst0; -} -void av1_filter_block_plane_non420_hor(AV1_COMMON *const cm, - struct macroblockd_plane *plane, - MODE_INFO **mib, int mi_row, int mi_col, - int pl) { - const int ss_y = plane->subsampling_y; - const int row_step = mi_size_high[BLOCK_8X8] << ss_y; -#if CONFIG_LPF_DIRECT - const int ss_x = plane->subsampling_x; - const int col_step = mi_size_wide[BLOCK_8X8] << ss_x; -#endif - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; - uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } }; - - int idx_r; - for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows; - idx_r += row_step) { - unsigned int mask_4x4_int; - FilterMasks row_masks; - const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8]; - get_filter_level_and_masks_non420(cm, plane, pl, mib, mi_row, mi_col, idx_r, - &lfl[r][0], &mask_4x4_int, NULL, - &row_masks, NULL); - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - // Disable filtering on the abovemost row or tile boundary - const MODE_INFO *mi = cm->mi + (mi_row + idx_r) * cm->mi_stride + mi_col; - if ((av1_disable_loopfilter_on_tile_boundary(cm) && - (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY)) || - (mi_row + idx_r == 0)) - memset(&row_masks, 0, sizeof(row_masks)); -#else - if (mi_row + idx_r == 0) memset(&row_masks, 0, sizeof(row_masks)); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_masks.m16x16, - row_masks.m8x8, row_masks.m4x4, mask_4x4_int, &cm->lf_info, - &lfl[r][0], (int)cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - filter_selectively_horiz(dst->buf, dst->stride, row_masks.m16x16, - row_masks.m8x8, row_masks.m4x4, mask_4x4_int, - &cm->lf_info, &lfl[r][0] -#if CONFIG_LPF_DIRECT - , - dst->buf0, mi_row, mi_col, idx_r, col_step, - cm->width, cm->height, ss_x, ss_y -#endif // CONFIG_LPF_DIRECT - ); - dst->buf += 8 * dst->stride; + s += 4 * 
count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; } - dst->buf = dst0; } -void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; - int r; - uint64_t mask_16x16 = lfm->left_y[TX_16X16]; - uint64_t mask_8x8 = lfm->left_y[TX_8X8]; - uint64_t mask_4x4 = lfm->left_y[TX_4X4]; - uint64_t mask_4x4_int = lfm->int_4x4_y; - - assert(plane->subsampling_x == 0 && plane->subsampling_y == 0); - - // Vertical pass: do 2 rows at one time - for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) { - unsigned int mask_16x16_l = mask_16x16 & 0xffff; - unsigned int mask_8x8_l = mask_8x8 & 0xffff; - unsigned int mask_4x4_l = mask_4x4 & 0xffff; - unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff; - -// Disable filtering on the leftmost column. -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - highbd_filter_selectively_vert_row2( - plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfm->lfl_y[r][0], (int)cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - filter_selectively_vert_row2( - plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r][0]); +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, + uint8_t *lfl, int bd) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 
5 : 3; - dst->buf += 2 * MI_SIZE * dst->stride; - mask_16x16 >>= 2 * MI_SIZE; - mask_8x8 >>= 2 * MI_SIZE; - mask_4x4 >>= 2 * MI_SIZE; - mask_4x4_int >>= 2 * MI_SIZE; - } + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds. + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); - // Horizontal pass - dst->buf = dst0; -} + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; -void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; - int r; - uint64_t mask_16x16 = lfm->above_y[TX_16X16]; - uint64_t mask_8x8 = lfm->above_y[TX_8X8]; - uint64_t mask_4x4 = lfm->above_y[TX_4X4]; - uint64_t mask_4x4_int = lfm->int_4x4_y; - - assert(plane->subsampling_x == 0 && plane->subsampling_y == 0); - - for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) { - unsigned int mask_16x16_r; - unsigned int mask_8x8_r; - unsigned int mask_4x4_r; - - if (mi_row + r == 0) { - mask_16x16_r = 0; - mask_8x8_r = 0; - mask_4x4_r = 0; - } else { - mask_16x16_r = mask_16x16 & 0xff; - mask_8x8_r = mask_8x8 & 0xff; - mask_4x4_r = mask_4x4 & 0xff; - } + if ((mask_16x16 & two_block_mask) == two_block_mask) { + /* + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + */ -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r][0], - (int)cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH -#if !CONFIG_LPF_DIRECT - filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 
0xff, &cm->lf_info, - &lfm->lfl_y[r][0]); -#endif // CONFIG_LPF_DIRECT + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_8x8 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; - dst->buf += MI_SIZE * dst->stride; - mask_16x16 >>= MI_SIZE; - mask_8x8 >>= MI_SIZE; - mask_4x4 >>= MI_SIZE; - mask_4x4_int >>= MI_SIZE; + if ((mask_8x8 & two_block_mask) == two_block_mask) { + /* + aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + */ + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + /* + aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + */ + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + count = 2; + } else { + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; } - // restore the buf pointer in case there is additional filter pass. 
- dst->buf = dst0; } -void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; - int r, c; - - uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; - uint16_t mask_8x8 = lfm->left_uv[TX_8X8]; - uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; - uint16_t mask_4x4_int = lfm->left_int_4x4_uv; - - assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); - assert(plane->plane_type == PLANE_TYPE_UV); - memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv)); - - // Vertical pass: do 2 rows at one time - for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) { - for (c = 0; c < (cm->mib_size >> 1); c++) { - lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1]; - lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1]; +static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf, + uint8_t *dst_buf, int ref_stride, int dst_stride, + int start, int end) { + return 0; + + start <<= MI_SIZE_LOG2; + end <<= MI_SIZE_LOG2; + uint8_t *ref0 = ref_buf; + uint8_t *dst0 = dst_buf; + if (cm->use_highbitdepth) { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf); + for (int j = 0; j < 4; ++j) { + for (int i = start; i < end; ++i) + if (ref16[i] != dst16[i]) { + ref_buf = ref0; + dst_buf = dst0; + return i + 1; + } + ref16 += ref_stride; + dst16 += dst_stride; } + } else { + for (int j = 0; j < 4; ++j) { + for (int i = start; i < end; ++i) + if (ref_buf[i] != dst_buf[i]) { + ref_buf = ref0; + dst_buf = dst0; + return i + 1; + } + ref_buf += ref_stride; + dst_buf += dst_stride; + } + } + ref_buf = ref0; + dst_buf = dst0; + return 0; +} - { - unsigned int mask_16x16_l = mask_16x16 & 0xff; - unsigned int mask_8x8_l = mask_8x8 & 0xff; - unsigned int mask_4x4_l = mask_4x4 & 0xff; - unsigned int mask_4x4_int_l = mask_4x4_int & 0xff; +void av1_filter_block_plane_ver(AV1_COMMON *const 
cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int single_step = 1 << ssy; + const int r_step = 2 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + + // filter two rows at a time + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + // current and next row should belong to the same mask_idx and index + // next row's shift + const int row_next = row + single_step; + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && 
pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; -// Disable filtering on the leftmost column. -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) highbd_filter_selectively_vert_row2( - plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth); + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH - filter_selectively_vert_row2(plane->subsampling_x, dst->buf, - dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfm->lfl_uv[r >> 1][0]); - - dst->buf += 2 * MI_SIZE * dst->stride; - mask_16x16 >>= MI_SIZE; - mask_8x8 >>= MI_SIZE; - mask_4x4 >>= MI_SIZE; - mask_4x4_int >>= MI_SIZE; + filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, + mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2); + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); } + dst->buf += 2 * MI_SIZE * dst->stride; } - - // Horizontal pass - dst->buf = dst0; } -void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { - struct buf_2d *const dst = &plane->dst; - uint8_t *const dst0 = dst->buf; +void av1_filter_block_plane_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; int r, c; - uint64_t mask_16x16 = 
lfm->above_uv[TX_16X16]; - uint64_t mask_8x8 = lfm->above_uv[TX_8X8]; - uint64_t mask_4x4 = lfm->above_uv[TX_4X4]; - uint64_t mask_4x4_int = lfm->above_int_4x4_uv; - - assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); - memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv)); - - // re-porpulate the filter level for uv, same as the code for vertical - // filter in av1_filter_block_plane_ss11_ver - for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) { - for (c = 0; c < (cm->mib_size >> 1); c++) { - lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1]; - lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1]; - } - } + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int r_step = 1 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + if (mi_row + r == 0) continue; + + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = 
lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) { - const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; - const unsigned int mask_4x4_int_r = - skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf); - unsigned int mask_16x16_r; - unsigned int mask_8x8_r; - unsigned int mask_4x4_r; - - if (mi_row + r == 0) { - mask_16x16_r = 0; - mask_8x8_r = 0; - mask_4x4_r = 0; - } else { - mask_16x16_r = mask_16x16 & 0xf; - mask_8x8_r = mask_8x8 & 0xf; - mask_4x4_r = mask_4x4 & 0xf; + if (cm->use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); } - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int_r, &cm->lf_info, &lfm->lfl_uv[r >> 1][0], - (int)cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH -#if !CONFIG_LPF_DIRECT - filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int_r, &cm->lf_info, - &lfm->lfl_uv[r >> 1][0]); -#endif // CONFIG_LPF_DIRECT - dst->buf += MI_SIZE * dst->stride; - mask_16x16 >>= MI_SIZE / 2; - mask_8x8 >>= MI_SIZE / 2; - mask_4x4 >>= MI_SIZE / 2; - mask_4x4_int >>= MI_SIZE / 2; } - // restore the buf pointer in case there is additional filter pass. 
- dst->buf = dst0; } - -#if CONFIG_PARALLEL_DEBLOCKING -typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; -static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES_ALL] = { - // mask for vertical edges filtering - { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2 - 1, // BLOCK_2X2 - 2 - 1, // BLOCK_2X4 - 4 - 1, // BLOCK_4X2 -#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 4 - 1, // BLOCK_4X4 - 4 - 1, // BLOCK_4X8 - 8 - 1, // BLOCK_8X4 - 8 - 1, // BLOCK_8X8 - 8 - 1, // BLOCK_8X16 - 16 - 1, // BLOCK_16X8 - 16 - 1, // BLOCK_16X16 - 16 - 1, // BLOCK_16X32 - 32 - 1, // BLOCK_32X16 - 32 - 1, // BLOCK_32X32 - 32 - 1, // BLOCK_32X64 - 64 - 1, // BLOCK_64X32 - 64 - 1, // BLOCK_64X64 -#if CONFIG_EXT_PARTITION - 64 - 1, // BLOCK_64X128 - 128 - 1, // BLOCK_128X64 - 128 - 1, // BLOCK_128X128 -#endif // CONFIG_EXT_PARTITION - 4 - 1, // BLOCK_4X16, - 16 - 1, // BLOCK_16X4, - 8 - 1, // BLOCK_8X32, - 32 - 1, // BLOCK_32X8, - 16 - 1, // BLOCK_16X64, - 64 - 1, // BLOCK_64X16 -#if CONFIG_EXT_PARTITION - 32 - 1, // BLOCK_32X128 - 128 - 1, // BLOCK_128X32 -#endif // CONFIG_EXT_PARTITION - }, - // mask for horizontal edges filtering - { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2 - 1, // BLOCK_2X2 - 4 - 1, // BLOCK_2X4 - 2 - 1, // BLOCK_4X2 -#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 4 - 1, // BLOCK_4X4 - 8 - 1, // BLOCK_4X8 - 4 - 1, // BLOCK_8X4 - 8 - 1, // BLOCK_8X8 - 16 - 1, // BLOCK_8X16 - 8 - 1, // BLOCK_16X8 - 16 - 1, // BLOCK_16X16 - 32 - 1, // BLOCK_16X32 - 16 - 1, // BLOCK_32X16 - 32 - 1, // BLOCK_32X32 - 64 - 1, // BLOCK_32X64 - 32 - 1, // BLOCK_64X32 - 64 - 1, // BLOCK_64X64 -#if CONFIG_EXT_PARTITION - 128 - 1, // BLOCK_64X128 - 64 - 1, // BLOCK_128X64 - 128 - 1, // BLOCK_128X128 -#endif // CONFIG_EXT_PARTITION - 16 - 1, // BLOCK_4X16, - 4 - 1, // BLOCK_16X4, - 32 - 1, // BLOCK_8X32, - 8 - 1, // BLOCK_32X8, - 64 - 1, // BLOCK_16X64, - 16 - 1, // BLOCK_64X16 -#if CONFIG_EXT_PARTITION - 128 - 1, // 
BLOCK_32X128 - 32 - 1, // BLOCK_128X32 -#endif // CONFIG_EXT_PARTITION - }, -}; - -static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = { - { -#if CONFIG_CHROMA_2X2 - 2 - 1, // TX_2X2 -#endif - 4 - 1, // TX_4X4 - 8 - 1, // TX_8X8 - 16 - 1, // TX_16X16 - 32 - 1, // TX_32X32 -#if CONFIG_TX64X64 - 64 - 1, // TX_64X64 -#endif // CONFIG_TX64X64 - 4 - 1, // TX_4X8 - 8 - 1, // TX_8X4 - 8 - 1, // TX_8X16 - 16 - 1, // TX_16X8 - 16 - 1, // TX_16X32 - 32 - 1, // TX_32X16 -#if CONFIG_TX64X64 - 32 - 1, // TX_32X64 - 64 - 1, // TX_64X32 -#endif // CONFIG_TX64X64 - 4 - 1, // TX_4X16 - 16 - 1, // TX_16X4 - 8 - 1, // TX_8X32 - 32 - 1 // TX_32X8 - }, - { -#if CONFIG_CHROMA_2X2 - 2 - 1, // TX_2X2 -#endif - 4 - 1, // TX_4X4 - 8 - 1, // TX_8X8 - 16 - 1, // TX_16X16 - 32 - 1, // TX_32X32 -#if CONFIG_TX64X64 - 64 - 1, // TX_64X64 -#endif // CONFIG_TX64X64 - 8 - 1, // TX_4X8 - 4 - 1, // TX_8X4 - 16 - 1, // TX_8X16 - 8 - 1, // TX_16X8 - 32 - 1, // TX_16X32 - 16 - 1, // TX_32X16 -#if CONFIG_TX64X64 - 64 - 1, // TX_32X64 - 32 - 1, // TX_64X32 -#endif // CONFIG_TX64X64 - 16 - 1, // TX_4X16 - 4 - 1, // TX_16X4 - 32 - 1, // TX_8X32 - 8 - 1 // TX_32X8 - } -}; - -static TX_SIZE av1_get_transform_size(const MODE_INFO *const mi, - const EDGE_DIR edge_dir, const int mi_row, - const int mi_col, const int plane, - const struct macroblockd_plane *plane_ptr, - const uint32_t scale_horz, - const uint32_t scale_vert) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - TX_SIZE tx_size = (plane == AOM_PLANE_Y) - ? mbmi->tx_size - : av1_get_uv_tx_size(mbmi, plane_ptr); +#endif // LOOP_FILTER_BITMASK + +static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + const EDGE_DIR edge_dir, const int mi_row, + const int mi_col, const int plane, + const struct macroblockd_plane *plane_ptr) { + assert(mbmi != NULL); + if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; + + TX_SIZE tx_size = + (plane == AOM_PLANE_Y) + ? 
mbmi->tx_size + : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); assert(tx_size < TX_SIZES_ALL); - -#if CONFIG_VAR_TX - // mi_row and mi_col is the absolute position of the MI block. - // idx_c and idx_r is the relative offset of the MI within the super block - // c and r is the relative offset of the 8x8 block within the supert block - // blk_row and block_col is the relative offset of the current 8x8 block - // within the current partition. - const int idx_c = mi_col & MAX_MIB_MASK; - const int idx_r = mi_row & MAX_MIB_MASK; - const int c = idx_c >> mi_width_log2_lookup[BLOCK_8X8]; - const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8]; - const BLOCK_SIZE sb_type = mi->mbmi.sb_type; - const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1); - const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1); - - if (is_inter_block(mbmi) && !mbmi->skip) { - const int tx_row_idx = - (blk_row * mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2) >> 1; - const int tx_col_idx = - (blk_col * mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2) >> 1; - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE bsize = - AOMMAX(BLOCK_4X4, ss_size_lookup[sb_type][scale_horz][scale_vert]); -#else - const BLOCK_SIZE bsize = ss_size_lookup[sb_type][scale_horz][scale_vert]; -#endif - const TX_SIZE mb_tx_size = mbmi->inter_tx_size[tx_row_idx][tx_col_idx]; - + if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) { + const BLOCK_SIZE sb_type = mbmi->sb_type; + const int blk_row = mi_row & (mi_size_high[sb_type] - 1); + const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); + const TX_SIZE mb_tx_size = + mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; assert(mb_tx_size < TX_SIZES_ALL); - - tx_size = (plane == AOM_PLANE_Y) - ? 
mb_tx_size - : uv_txsize_lookup[bsize][mb_tx_size][0][0]; - assert(tx_size < TX_SIZES_ALL); + tx_size = mb_tx_size; } -#else - (void)mi_row; - (void)mi_col; - (void)scale_horz; - (void)scale_vert; -#endif // CONFIG_VAR_TX // since in case of chrominance or non-square transorm need to convert // transform size into transform size in particular direction. @@ -2926,111 +1524,84 @@ static TX_SIZE av1_get_transform_size(const MODE_INFO *const mi, typedef struct AV1_DEBLOCKING_PARAMETERS { // length of the filter applied to the outer edge uint32_t filter_length; - // length of the filter applied to the inner edge - uint32_t filter_length_internal; // deblocking limits const uint8_t *lim; const uint8_t *mblim; const uint8_t *hev_thr; } AV1_DEBLOCKING_PARAMETERS; -static void set_lpf_parameters( +// Return TX_SIZE from get_transform_size(), so it is plane and direction +// awared +static TX_SIZE set_lpf_parameters( AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, - const AV1_COMMON *const cm, const EDGE_DIR edge_dir, const uint32_t x, - const uint32_t y, const int plane, - const struct macroblockd_plane *const plane_ptr) { + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, + const int plane, const struct macroblockd_plane *const plane_ptr) { // reset to initial values params->filter_length = 0; - params->filter_length_internal = 0; // no deblocking is required const uint32_t width = plane_ptr->dst.width; const uint32_t height = plane_ptr->dst.height; if ((width <= x) || (height <= y)) { - return; + // just return the smallest transform unit size + return TX_4X4; } const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; - const int mi_row = (y << scale_vert) >> MI_SIZE_LOG2; - const int mi_col = (x << scale_horz) >> MI_SIZE_LOG2; - MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; - const MB_MODE_INFO *mbmi = 
&mi[0]->mbmi; + // for sub8x8 block, chroma prediction mode is obtained from the bottom/right + // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row + // and mi_col should map to the bottom/right mi structure, i.e, both mi_row + // and mi_col should be odd number for chroma plane. + const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); + const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); + MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + // If current mbmi is not correctly setup, return an invalid value to stop + // filtering. One example is that if this tile is not coded, then its mbmi + // it not set up. + if (mbmi == NULL) return TX_INVALID; + + const TX_SIZE ts = + get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr); { - const TX_SIZE ts = - av1_get_transform_size(mi[0], edge_dir, mi_row, mi_col, plane, - plane_ptr, scale_horz, scale_vert); - -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - const uint32_t curr_level = - get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); -#else -#if CONFIG_LPF_SB - const uint32_t curr_level = - get_filter_level(cm, &cm->lf_info, mi_row, mi_col, mbmi); -#else - const uint32_t curr_level = get_filter_level(cm, &cm->lf_info, mbmi); -#endif // CONFIG_LPF_SB -#endif -#else - const uint32_t curr_level = get_filter_level(&cm->lf_info, mbmi); -#endif // CONFIG_EXT_DELTA_Q - - const int curr_skipped = mbmi->skip && is_inter_block(mbmi); const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); - uint32_t level = curr_level; + const uint32_t transform_masks = + edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); + + if (!tu_edge) return ts; + // prepare outer edge parameters. 
deblock the edge if it's an edge of a TU - if (coord) { -#if CONFIG_LOOPFILTERING_ACROSS_TILES - MODE_INFO *const mi_bound = cm->mi + mi_row * cm->mi_stride + mi_col; - if (!av1_disable_loopfilter_on_tile_boundary(cm) || - ((VERT_EDGE == edge_dir) && - (0 == (mi_bound->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) || - ((HORZ_EDGE == edge_dir) && - (0 == (mi_bound->mbmi.boundary_info & TILE_ABOVE_BOUNDARY)))) -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - { - const int32_t tu_edge = - (coord & av1_transform_masks[edge_dir][ts]) ? (0) : (1); - if (tu_edge) { - const MODE_INFO *const mi_prev = *(mi - mode_step); + { + const uint32_t curr_level = + get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + const int curr_skipped = mbmi->skip && is_inter_block(mbmi); + uint32_t level = curr_level; + if (coord) { + { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + if (mi_prev == NULL) return TX_INVALID; const int pv_row = (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); const int pv_col = (VERT_EDGE == edge_dir) ? 
(mi_col - (1 << scale_horz)) : (mi_col); - const TX_SIZE pv_ts = - av1_get_transform_size(mi_prev, edge_dir, pv_row, pv_col, plane, - plane_ptr, scale_horz, scale_vert); - -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - const uint32_t pv_lvl = get_filter_level(cm, &cm->lf_info, edge_dir, - plane, &mi_prev->mbmi); -#else -#if CONFIG_LPF_SB - const uint32_t pv_lvl = get_filter_level(cm, &cm->lf_info, pv_row, - pv_col, &mi_prev->mbmi); -#else - const uint32_t pv_lvl = - get_filter_level(cm, &cm->lf_info, &mi_prev->mbmi); -#endif // CONFIG_LPF_SB -#endif -#else + const TX_SIZE pv_ts = get_transform_size( + xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr); + const uint32_t pv_lvl = - get_filter_level(&cm->lf_info, &mi_prev->mbmi); -#endif // CONFIG_EXT_DELTA_Q - - const int pv_skip = - mi_prev->mbmi.skip && is_inter_block(&mi_prev->mbmi); - const int32_t pu_edge = - (coord & - av1_prediction_masks[edge_dir] - [ss_size_lookup[mbmi->sb_type][scale_horz] - [scale_vert]]) - ? (0) - : (1); + get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + + const int pv_skip = mi_prev->skip && is_inter_block(mi_prev); + const BLOCK_SIZE bsize = + get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); + const int prediction_masks = edge_dir == VERT_EDGE + ? block_size_wide[bsize] - 1 + : block_size_high[bsize] - 1; + const int32_t pu_edge = !(coord & prediction_masks); // if the current and the previous blocks are skipped, // deblock the edge if the edge belongs to a PU's edge only. 
if ((curr_level || pv_lvl) && @@ -3039,41 +1610,26 @@ static void set_lpf_parameters( if (TX_4X4 >= min_ts) { params->filter_length = 4; } else if (TX_8X8 == min_ts) { - params->filter_length = 8; + if (plane != 0) + params->filter_length = 6; + else + params->filter_length = 8; } else { - params->filter_length = 16; -#if PARALLEL_DEBLOCKING_15TAPLUMAONLY + params->filter_length = 14; // No wide filtering for chroma plane if (plane != 0) { -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA params->filter_length = 6; -#else - params->filter_length = 8; -#endif } -#endif } -#if PARALLEL_DEBLOCKING_DISABLE_15TAP - params->filter_length = (TX_4X4 >= AOMMIN(ts, pv_ts)) ? (4) : (8); -#endif // PARALLEL_DEBLOCKING_DISABLE_15TAP - // update the level if the current block is skipped, // but the previous one is not level = (curr_level) ? (curr_level) : (pv_lvl); } } } - -#if !CONFIG_CB4X4 - // prepare internal edge parameters - if (curr_level && !curr_skipped) { - params->filter_length_internal = (TX_4X4 >= ts) ? 
(4) : (0); - } -#endif - // prepare common parameters - if (params->filter_length || params->filter_length_internal) { + if (params->filter_length) { const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; params->lim = limits->lim; params->mblim = limits->mblim; @@ -3081,654 +1637,278 @@ static void set_lpf_parameters( } } } + + return ts; } -static void av1_filter_block_plane_vert( - const AV1_COMMON *const cm, const int plane, - const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, - const uint32_t mi_col) { - const int col_step = MI_SIZE >> MI_SIZE_LOG2; +void av1_filter_block_plane_vert(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; -#if CONFIG_LPF_SB - int y_range = mi_row ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET; - y_range = AOMMIN(y_range, cm->mi_rows); - y_range >>= scale_vert; - - int x_range = mi_col ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET; - x_range = AOMMIN(x_range, cm->mi_cols); - x_range >>= scale_horz; -#else const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); -#endif // CONFIG_LPF_SB for (int y = 0; y < y_range; y += row_step) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; - for (int x = 0; x < x_range; x += col_step) { + for (int x = 0; x < x_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
// If 4x4 trasnform is used, it will then filter the internal edge // aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); - set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, VERT_EDGE, - curr_x, curr_y, plane, plane_ptr); - -#if CONFIG_LPF_DIRECT - uint8_t *const src = plane_ptr->dst.buf0; - const int width = cm->width >> scale_horz; - const int height = cm->height >> scale_vert; - const int pivot = 8; - const int line_length = 16; - uint8_t block[128]; - int orig_pos[128]; - const int vert_or_horz = 0; // 0: vertical - const int unit = 1; - int i; - for (i = 0; i < 128; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - if (params.filter_length) { - const int filt_len = params.filter_length == 16 ? 8 : 4; - const int direct = - pick_min_grad_direct(src, filt_len, curr_y, curr_x, width, height, - dst_stride, unit, vert_or_horz); - - pick_filter_block_vert(src, block, orig_pos, filt_len, curr_y, curr_x, - width, height, dst_stride, pivot, line_length, - unit, direct); - uint8_t *const filt_start = block + pivot; - switch (params.filter_length) { - // apply 4-tap filtering - case 4: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_4(filt_start, line_length, params.mblim, - params.lim, params.hev_thr); - break; - // apply 8-tap filtering - case 8: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_8(filt_start, line_length, params.mblim, - 
params.lim, params.hev_thr); - break; - // apply 16-tap filtering - case 16: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_16(filt_start, line_length, params.mblim, - params.lim, params.hev_thr); - break; - // no filtering - default: break; - } - - for (i = 0; i < 128; ++i) { - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; } - if (params.filter_length_internal) { - for (i = 0; i < 128; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - const int direct = - pick_min_grad_direct(src, 4, curr_y, curr_x + 4, width, height, - dst_stride, unit, vert_or_horz); - - pick_filter_block_vert(src, block, orig_pos, 4, curr_y, curr_x + 4, - width, height, dst_stride, pivot, line_length, - unit, direct); - - uint8_t *const filt_start = block + pivot; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_4(filt_start, line_length, params.mblim, params.lim, - params.hev_thr); - - for (i = 0; i < 128; ++i) { - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } - } -#else // !CONFIG_LPF_DIRECT switch (params.filter_length) { // apply 4-tap filtering case 4: -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA case 6: 
// apply 6-tap filter for chroma plane only assert(plane != 0); -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_6_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_6_c(p, dst_stride, params.mblim, params.lim, - params.hev_thr); + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); break; -#endif // apply 8-tap filtering case 8: -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; - // apply 16-tap filtering - case 16: -#if CONFIG_HIGHBITDEPTH + // apply 14-tap filtering + case 14: if (cm->use_highbitdepth) -#if CONFIG_DEBLOCK_13TAP - // TODO(olah): Remove _c once SIMD for 13-tap is available - aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); -#else - aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride, + aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); -#endif else -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_DEBLOCK_13TAP - aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim, - params.hev_thr); -#else - aom_lpf_vertical_16(p, dst_stride, params.mblim, params.lim, - params.hev_thr); -#endif + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); break; // no filtering default: break; } - // process the internal edge - if (params.filter_length_internal) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p + 
4), dst_stride, - params.mblim, params.lim, params.hev_thr, - cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_vertical_4(p + 4, dst_stride, params.mblim, params.lim, - params.hev_thr); - } -#endif // CONFIG_LPF_DIRECT // advance the destination pointer - p += MI_SIZE; + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; } } } -static void av1_filter_block_plane_horz( - const AV1_COMMON *const cm, const int plane, - const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, - const uint32_t mi_col) { +void av1_filter_block_plane_horz(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { const int col_step = MI_SIZE >> MI_SIZE_LOG2; - const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; -#if CONFIG_LPF_SB - int y_range = mi_row ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET; - y_range = AOMMIN(y_range, cm->mi_rows); - y_range >>= scale_vert; - - int x_range = mi_col ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET; - x_range = AOMMIN(x_range, cm->mi_cols); - x_range >>= scale_horz; -#else const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); -#endif // CONFIG_LPF_SB - for (int y = 0; y < y_range; y += row_step) { - uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; - for (int x = 0; x < x_range; x += col_step) { + for (int x = 0; x < x_range; x += col_step) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will first filter the vertical edge aligned with a 8x8 // block. 
If 4x4 trasnform is used, it will then filter the internal // edge aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); - set_lpf_parameters(¶ms, (cm->mi_stride << scale_vert), cm, HORZ_EDGE, - curr_x, curr_y, plane, plane_ptr); - -#if CONFIG_LPF_DIRECT - uint8_t *const src = plane_ptr->dst.buf0; - const int width = cm->width >> scale_horz; - const int height = cm->height >> scale_vert; - const int pivot = 8; - const int line_length = 16; - uint8_t block[256]; - int orig_pos[256]; - const int vert_or_horz = 1; // 1: horizontal - const int unit = 1; - int i; - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - - if (params.filter_length) { - const int filt_len = params.filter_length == 16 ? 8 : 4; - const int direct = - pick_min_grad_direct(src, filt_len, curr_y, curr_x, width, height, - dst_stride, unit, vert_or_horz); - - pick_filter_block_horz(src, block, orig_pos, filt_len, curr_y, curr_x, - width, height, dst_stride, pivot, line_length, - unit, direct); - uint8_t *const filt_start = block + pivot * line_length; - switch (params.filter_length) { - // apply 4-tap filtering - case 4: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_4(filt_start, line_length, params.mblim, - params.lim, params.hev_thr); - break; - // apply 8-tap filtering - case 8: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_8(filt_start, line_length, 
params.mblim, - params.lim, params.hev_thr); - break; - // apply 16-tap filtering - case 16: -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_edge_16( - CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim, - params.lim, params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_edge_16(filt_start, line_length, params.mblim, - params.lim, params.hev_thr); - break; - // no filtering - default: break; - } - - for (i = 0; i < 256; ++i) { - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } + tx_size = + set_lpf_parameters(¶ms, (cm->mi_stride << scale_vert), cm, xd, + HORZ_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; } - if (params.filter_length_internal) { - for (i = 0; i < 256; ++i) { - block[i] = 0; - orig_pos[i] = -1; - } - const int direct = - pick_min_grad_direct(src, 4, curr_y + 4, curr_x, width, height, - dst_stride, unit, vert_or_horz); - - pick_filter_block_horz(src, block, orig_pos, 4, curr_y + 4, curr_x, - width, height, dst_stride, pivot, line_length, - unit, direct); - - uint8_t *const filt_start = block + pivot * line_length; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_4(filt_start, line_length, params.mblim, - params.lim, params.hev_thr); - - for (i = 0; i < 256; ++i) { - if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; - } - } -#else // !CONFIG_LPF_DIRECT switch (params.filter_length) { // apply 4-tap filtering case 4: -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, params.hev_thr); 
break; -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA // apply 6-tap filtering - case 6: assert(plane != 0); -#if CONFIG_HIGHBITDEPTH + case 6: + assert(plane != 0); if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_6_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_6_c(p, dst_stride, params.mblim, params.lim, - params.hev_thr); + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); break; -#endif // apply 8-tap filtering case 8: -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; - // apply 16-tap filtering - case 16: -#if CONFIG_HIGHBITDEPTH + // apply 14-tap filtering + case 14: if (cm->use_highbitdepth) -#if CONFIG_DEBLOCK_13TAP - // TODO(olah): Remove _c once SIMD for 13-tap is available - aom_highbd_lpf_horizontal_edge_16_c( - CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); -#else - aom_highbd_lpf_horizontal_edge_16( - CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); -#endif + aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_DEBLOCK_13TAP - aom_lpf_horizontal_edge_16_c(p, dst_stride, params.mblim, - params.lim, params.hev_thr); -#else - aom_lpf_horizontal_edge_16(p, dst_stride, params.mblim, params.lim, - params.hev_thr); -#endif + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); break; // no filtering default: break; 
} - // process the internal edge - if (params.filter_length_internal) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p + 4 * dst_stride), - dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - aom_lpf_horizontal_4(p + 4 * dst_stride, dst_stride, params.mblim, - params.lim, params.hev_thr); - } -#endif // CONFIG_LPF_DIRECT + // advance the destination pointer - p += MI_SIZE; + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; } } } -#endif // CONFIG_PARALLEL_DEBLOCKING -void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, - struct macroblockd_plane *planes, int start, int stop, -#if CONFIG_LPF_SB - int col_start, int col_end, -#endif - int y_only) { -#if CONFIG_LOOPFILTER_LEVEL - // y_only no longer has its original meaning. - // Here it means which plane to filter - // when y_only = {0, 1, 2}, it means we are searching for filter level for - // Y/U/V plane individually. - const int plane_start = y_only; - const int plane_end = plane_start + 1; -#else - const int num_planes = y_only ? 
1 : MAX_MB_PLANE; - const int plane_start = 0; - const int plane_end = num_planes; -#endif // CONFIG_LOOPFILTER_LEVEL -#if !CONFIG_LPF_SB +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + int plane_start, int plane_end) { + struct macroblockd_plane *pd = xd->plane; const int col_start = 0; const int col_end = cm->mi_cols; -#endif // CONFIG_LPF_SB int mi_row, mi_col; int plane; -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \ - CONFIG_CB4X4 - -#if !CONFIG_PARALLEL_DEBLOCKING -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2); -#endif // CONFIG_VAR_TX - for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) { - MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memset(cm->left_txfm_context[i], TX_32X32, - MAX_MIB_SIZE << TX_UNIT_HIGH_LOG2); -#endif // CONFIG_VAR_TX - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) { - av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); - - for (plane = plane_start; plane < plane_end; ++plane) { - av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - } - } - } -#else - - // filter all vertical edges in every 64x64 super block - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { - for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); - for (plane = plane_start; plane < plane_end; ++plane) { - av1_filter_block_plane_vert(cm, plane, &planes[plane], mi_row, mi_col); - } - } - } + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) 
+ break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; - // filter all horizontal edges in every 64x64 super block - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { - for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); - for (plane = plane_start; plane < plane_end; ++plane) { - av1_filter_block_plane_horz(cm, plane, &planes[plane], mi_row, mi_col); +#if LOOP_FILTER_BITMASK + // filter all vertical edges every superblock (could be 128x128 or 64x64) + for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { + for (mi_col = col_start; mi_col < col_end; + mi_col += cm->seq_params.mib_size) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x, + pd[plane].subsampling_y, stop, col_end); + av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col); } } - } -#endif // CONFIG_PARALLEL_DEBLOCKING -#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES + // filter all horizontal edges every superblock + for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { + for (mi_col = col_start; mi_col < col_end; + mi_col += cm->seq_params.mib_size) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); -#if CONFIG_PARALLEL_DEBLOCKING - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); - // filter all vertical edges in every 64x64 super block - for (plane = plane_start; plane < plane_end; plane += 1) { - av1_filter_block_plane_vert(cm, plane, &planes[plane], mi_row, mi_col); + av1_filter_block_plane_hor(cm, &pd[plane], 
plane, mi_row, mi_col); } } - } - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); - // filter all horizontal edges in every 64x64 super block - for (plane = plane_start; plane < plane_end; plane += 1) { - av1_filter_block_plane_horz(cm, plane, &planes[plane], mi_row, mi_col); +#else + if (cm->lf.combine_vert_horz_lf) { + // filter all vertical and horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + // filter vertical edges + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + // filter horizontal edges + if (mi_col - MAX_MIB_SIZE >= 0) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col - MAX_MIB_SIZE, plane, + plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } + } + // filter horizontal edges + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col - MAX_MIB_SIZE, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); } - } - } -#else // CONFIG_PARALLEL_DEBLOCKING - enum lf_path path; - LOOP_FILTER_MASK lfm; - - if (y_only) - path = LF_PATH_444; - else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) - path = LF_PATH_420; - else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) - path = LF_PATH_444; - else - path = LF_PATH_SLOW; - - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { - MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(planes, 
cm->sb_size, frame_buffer, mi_row, mi_col); - - // TODO(JBB): Make setup_mask work for non 420. - av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); - - av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm); - av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm); - for (plane = 1; plane < num_planes; ++plane) { - switch (path) { - case LF_PATH_420: - av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm); - av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm); - break; - case LF_PATH_444: - av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm); - av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm); - break; - case LF_PATH_SLOW: - av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, - mi_row, mi_col, plane); - - break; + } else { + // filter all vertical edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } + } + + // filter all horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col); } } } +#endif // LOOP_FILTER_BITMASK } -#endif // CONFIG_PARALLEL_DEBLOCKING -#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, int frame_filter_level, -#if CONFIG_LOOPFILTER_LEVEL - int 
frame_filter_level_r, -#endif - int y_only, int partial_frame -#if CONFIG_LPF_SB - , - int mi_row, int mi_col -#endif - ) { + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - int orig_filter_level[2] = { cm->lf.filter_level[0], cm->lf.filter_level[1] }; -#else - int orig_filter_level = cm->lf.filter_level; -#endif -#endif -#if CONFIG_LPF_SB - if (partial_frame && !frame_filter_level) return; -#else -#if CONFIG_LOOPFILTER_LEVEL - if (!frame_filter_level && !frame_filter_level_r) return; -#else - if (!frame_filter_level) return; -#endif -#endif // CONFIG_LPF_SB -#if CONFIG_LPF_SB - int start_mi_col; - int end_mi_col; - - // In the experiment of deblocking filtering per superblock. - // When partial_frame is 1, it indicates we are searching for the best filter - // level for current superblock. We reuse frame_filter_level as filter level - // for superblock, no longer for the whole frame. 
- // When partial_frame is 0, it's in the actual filtering stage for the frame - if (partial_frame) { - start_mi_row = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET); - start_mi_col = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - end_mi_row = AOMMIN(mi_row_range, cm->mi_rows); - end_mi_col = AOMMIN(mi_col_range, cm->mi_cols); - - av1_loop_filter_sb_level_init(cm, mi_row, mi_col, frame_filter_level); - } else { - start_mi_row = 0; - mi_rows_to_filter = cm->mi_rows; - end_mi_row = start_mi_row + mi_rows_to_filter; - start_mi_col = 0; - end_mi_col = cm->mi_cols; - } -#else start_mi_row = 0; mi_rows_to_filter = cm->mi_rows; if (partial_frame && cm->mi_rows > 8) { @@ -3737,61 +1917,7 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; -#if CONFIG_LOOPFILTER_LEVEL - // TODO(chengchen): refactor the code such that y_only has its matching - // meaning. Now it means the plane to be filtered in this experiment. 
- av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level_r, - y_only); -#else - av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level); -#endif -#endif // CONFIG_LPF_SB - -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - cm->lf.filter_level[0] = frame_filter_level; - cm->lf.filter_level[1] = frame_filter_level_r; -#else - cm->lf.filter_level = frame_filter_level; -#endif -#endif - -#if CONFIG_LPF_SB - av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, - start_mi_col, end_mi_col, y_only); -#else - av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); -#endif // CONFIG_LPF_SB - -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - cm->lf.filter_level[0] = orig_filter_level[0]; - cm->lf.filter_level[1] = orig_filter_level[1]; -#else - cm->lf.filter_level = orig_filter_level; -#endif -#endif -} - -void av1_loop_filter_data_reset(LFWorkerData *lf_data, - YV12_BUFFER_CONFIG *frame_buffer, - struct AV1Common *cm, - const struct macroblockd_plane *planes) { - lf_data->frame_buffer = frame_buffer; - lf_data->cm = cm; - lf_data->start = 0; - lf_data->stop = 0; - lf_data->y_only = 0; - memcpy(lf_data->planes, planes, sizeof(lf_data->planes)); -} - -int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { - (void)unused; -#if !CONFIG_LPF_SB - av1_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only); -#else - (void)lf_data; -#endif // CONFIG_LPF_SB - return 1; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end); } diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h index ee32c368c..c35c3b2dc 100644 --- a/third_party/aom/av1/common/av1_loopfilter.h +++ b/third_party/aom/av1/common/av1_loopfilter.h @@ -12,9 +12,9 @@ #ifndef AV1_COMMON_LOOPFILTER_H_ #define AV1_COMMON_LOOPFILTER_H_ 
-#include "aom_ports/mem.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/seg_common.h" @@ -27,37 +27,111 @@ extern "C" { #define SIMD_WIDTH 16 -#define MAX_MODE_LF_DELTAS 2 - enum lf_path { LF_PATH_420, LF_PATH_444, LF_PATH_SLOW, }; +#if LOOP_FILTER_BITMASK +typedef struct { + uint64_t bits[4]; +} FilterMask; + +// This structure holds bit masks for all 4x4 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4 +// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use +// a uint64_t to represent bitmask. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + FilterMask left_y[TX_SIZES]; + FilterMask above_y[TX_SIZES]; + FilterMask left_u[TX_SIZES]; + FilterMask above_u[TX_SIZES]; + FilterMask left_v[TX_SIZES]; + FilterMask above_v[TX_SIZES]; + + // Y plane vertical edge and horizontal edge filter level + uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + + // U plane vertical edge and horizontal edge filter level + uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + + // V plane vertical edge and horizontal edge filter level + uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; +} LoopFilterMask; + +// To determine whether to apply loop filtering at one transform block edge, +// we need information of the neighboring transform block. 
Specifically, +// in determining a vertical edge, we need the information of the tx block +// to its left. For a horizontal edge, we need info of the tx block above it. +// Thus, we need to record info of right column and bottom row of tx blocks. +// We record the information of the neighboring superblock, when bitmask +// building for a superblock is finished. And it will be used for next +// superblock bitmask building. +// Information includes: +// ------------------------------------------------------------ +// MI_SIZE_64X64 +// Y tx_size above |--------------| +// Y tx_size left |--------------| +// UV tx_size above |--------------| +// UV tx_size left |--------------| +// Y level above |--------------| +// Y level left |--------------| +// U level above |--------------| +// U level left |--------------| +// V level above |--------------| +// V level left |--------------| +// skip |--------------| +// ------------------------------------------------------------ +typedef struct { + TX_SIZE tx_size_y_above[MI_SIZE_64X64]; + TX_SIZE tx_size_y_left[MI_SIZE_64X64]; + TX_SIZE tx_size_uv_above[MI_SIZE_64X64]; + TX_SIZE tx_size_uv_left[MI_SIZE_64X64]; + uint8_t y_level_above[MI_SIZE_64X64]; + uint8_t y_level_left[MI_SIZE_64X64]; + uint8_t u_level_above[MI_SIZE_64X64]; + uint8_t u_level_left[MI_SIZE_64X64]; + uint8_t v_level_above[MI_SIZE_64X64]; + uint8_t v_level_left[MI_SIZE_64X64]; + uint8_t skip[MI_SIZE_64X64]; +} LpfSuperblockInfo; +#endif // LOOP_FILTER_BITMASK + struct loopfilter { -#if CONFIG_LOOPFILTER_LEVEL int filter_level[2]; int filter_level_u; int filter_level_v; -#else - int filter_level; -#endif int sharpness_level; - int last_sharpness_level; uint8_t mode_ref_delta_enabled; uint8_t mode_ref_delta_update; - // 0 = Intra, Last, Last2+Last3(CONFIG_EXT_REFS), - // GF, BRF(CONFIG_EXT_REFS), ARF2(CONFIG_EXT_REFS), ARF - int8_t ref_deltas[TOTAL_REFS_PER_FRAME]; - int8_t last_ref_deltas[TOTAL_REFS_PER_FRAME]; + // 0 = Intra, Last, Last2+Last3, + // GF, BRF, 
ARF2, ARF + int8_t ref_deltas[REF_FRAMES]; // 0 = ZERO_MV, MV int8_t mode_deltas[MAX_MODE_LF_DELTAS]; - int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + + int combine_vert_horz_lf; + +#if LOOP_FILTER_BITMASK + LoopFilterMask *lfm; + size_t lfm_num; + int lfm_stride; + LpfSuperblockInfo neighbor_sb_lpf_info; +#endif // LOOP_FILTER_BITMASK }; // Need to align this structure so when it is declared and @@ -70,127 +144,56 @@ typedef struct { typedef struct { loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; -#if CONFIG_LOOPFILTER_LEVEL - uint8_t lvl[MAX_SEGMENTS][2][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS]; -#else - uint8_t lvl[MAX_SEGMENTS][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS]; -#endif + uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; } loop_filter_info_n; -// This structure holds bit masks for all 8x8 blocks in a 64x64 region. -// Each 1 bit represents a position in which we want to apply the loop filter. -// Left_ entries refer to whether we apply a filter on the border to the -// left of the block. Above_ entries refer to whether or not to apply a -// filter on the above border. Int_ entries refer to whether or not to -// apply borders on the 4x4 edges within the 8x8 block that each bit -// represents. -// Since each transform is accompanied by a potentially different type of -// loop filter there is a different entry in the array for each transform size. -typedef struct { - uint64_t left_y[TX_SIZES]; - uint64_t above_y[TX_SIZES]; - uint64_t int_4x4_y; - uint16_t left_uv[TX_SIZES]; - uint16_t above_uv[TX_SIZES]; - uint16_t left_int_4x4_uv; - uint16_t above_int_4x4_uv; - uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE]; - uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2]; -} LOOP_FILTER_MASK; - /* assorted loopfilter functions which get used elsewhere */ struct AV1Common; struct macroblockd; struct AV1LfSyncData; -// This function sets up the bit masks for the entire 64x64 region represented -// by mi_row, mi_col. 
-void av1_setup_mask(struct AV1Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, - const int mode_info_stride, LOOP_FILTER_MASK *lfm); - -void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); -void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); -void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); -void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); - -void av1_filter_block_plane_non420_ver(struct AV1Common *const cm, - struct macroblockd_plane *plane, - MODE_INFO **mi_8x8, int mi_row, - int mi_col, int pl); -void av1_filter_block_plane_non420_hor(struct AV1Common *const cm, - struct macroblockd_plane *plane, - MODE_INFO **mi_8x8, int mi_row, - int mi_col, int pl); - void av1_loop_filter_init(struct AV1Common *cm); -// Update the loop filter for the current frame. -// This should be called before av1_loop_filter_rows(), -// av1_loop_filter_frame() -// calls this function directly. -void av1_loop_filter_frame_init(struct AV1Common *cm, int default_filt_lvl, - int default_filt_lvl_r -#if CONFIG_LOOPFILTER_LEVEL - , - int plane -#endif - ); +void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, + int plane_end); -#if CONFIG_LPF_SB -void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int filter_level, - int y_only, int partial_frame, int mi_row, - int mi_col); - -// Apply the loop filter to [start, stop) macro block rows in frame_buffer. 
-void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, - struct AV1Common *cm, - struct macroblockd_plane *planes, int start, int stop, - int col_start, int col_end, int y_only); - -void av1_loop_filter_sb_level_init(struct AV1Common *cm, int mi_row, int mi_col, - int lvl); -#else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int filter_level, -#if CONFIG_LOOPFILTER_LEVEL - int filter_level_r, -#endif - int y_only, int partial_frame); + struct macroblockd *mbd, int plane_start, + int plane_end, int partial_frame); + +void av1_filter_block_plane_vert(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); -// Apply the loop filter to [start, stop) macro block rows in frame_buffer. -void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, - struct AV1Common *cm, - struct macroblockd_plane *planes, int start, int stop, - int y_only); -#endif // CONFIG_LPF_SB +void av1_filter_block_plane_horz(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); typedef struct LoopFilterWorkerData { YV12_BUFFER_CONFIG *frame_buffer; struct AV1Common *cm; struct macroblockd_plane planes[MAX_MB_PLANE]; - - int start; - int stop; - int y_only; + // TODO(Ranjit): When the filter functions are modified to use xd->lossless + // add lossless as a member here. 
+ MACROBLOCKD *xd; } LFWorkerData; -void av1_loop_filter_data_reset(LFWorkerData *lf_data, - YV12_BUFFER_CONFIG *frame_buffer, - struct AV1Common *cm, - const struct macroblockd_plane *planes); +#if LOOP_FILTER_BITMASK +void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col, + int plane, int subsampling_x, int subsampling_y, + int row_end, int col_end); + +void av1_filter_block_plane_ver(struct AV1Common *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_hor(struct AV1Common *const cm, + struct macroblockd_plane *const plane, int pl, + int mi_row, int mi_col); +#endif -// Operates on the rows described by 'lf_data'. -int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c index f9ccd1979..38e26bee1 100644 --- a/third_party/aom/av1/common/av1_rtcd.c +++ b/third_party/aom/av1/common/av1_rtcd.c @@ -8,9 +8,11 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" +#include "config/aom_config.h" + #define RTCD_C -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_ports/aom_once.h" void av1_rtcd() { diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index 203426e59..6aa925515 100755 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## sub av1_common_forward_decls() { print < 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, + INT32_MAX); + } + } + } +} + +const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { + { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, + { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, + { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, + { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, + TXFM_TYPE_IDENTITY32 }, + { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } +}; + +const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { + 4, // TXFM_TYPE_DCT4 + 6, // TXFM_TYPE_DCT8 + 8, // TXFM_TYPE_DCT16 + 10, // TXFM_TYPE_DCT32 + 12, // TXFM_TYPE_DCT64 + 7, // TXFM_TYPE_ADST4 + 8, // TXFM_TYPE_ADST8 + 10, // TXFM_TYPE_ADST16 + 1, // TXFM_TYPE_IDENTITY4 + 1, // TXFM_TYPE_IDENTITY8 + 1, // TXFM_TYPE_IDENTITY16 + 1, // TXFM_TYPE_IDENTITY32 +}; diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h index bd365de59..5db3233f5 100644 --- a/third_party/aom/av1/common/av1_txfm.h +++ b/third_party/aom/av1/common/av1_txfm.h @@ -16,6 +16,8 @@ #include #include +#include "config/aom_config.h" + #include "av1/common/enums.h" #include "av1/common/blockd.h" #include "aom/aom_integer.h" @@ -25,100 +27,73 @@ extern "C" { #endif +#if !defined(DO_RANGE_CHECK_CLAMP) +#define DO_RANGE_CHECK_CLAMP 0 +#endif + +extern const int32_t av1_cospi_arr_data[7][64]; +extern const int32_t 
av1_sinpi_arr_data[7][5]; + #define MAX_TXFM_STAGE_NUM 12 static const int cos_bit_min = 10; static const int cos_bit_max = 16; -// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i))); -static const int32_t cospi_arr_data[7][64] = { - { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980, - 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837, - 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610, - 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321, - 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 }, - { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987, - 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782, - 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448, - 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009, - 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498, - 449, 400, 350, 301, 251, 201, 151, 100, 50 }, - { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, - 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, - 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, - 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, - 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995, - 897, 799, 700, 601, 501, 401, 301, 201, 101 }, - { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946, - 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128, - 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793, - 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038, - 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990, - 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 }, - { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893, - 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256, - 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585, - 
11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076, - 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981, - 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 }, - { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786, - 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511, - 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170, - 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151, - 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962, - 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 }, - { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572, - 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022, - 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341, - 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303, - 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924, - 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 } -}; +static const int NewSqrt2Bits = 12; +// 2^12 * sqrt(2) +static const int32_t NewSqrt2 = 5793; +// 2^12 / sqrt(2) +static const int32_t NewInvSqrt2 = 2896; static INLINE const int32_t *cospi_arr(int n) { - return cospi_arr_data[n - cos_bit_min]; + return av1_cospi_arr_data[n - cos_bit_min]; } -static INLINE int32_t round_shift(int32_t value, int bit) { - assert(bit >= 1); - return (value + (1 << (bit - 1))) >> bit; +static INLINE const int32_t *sinpi_arr(int n) { + return av1_sinpi_arr_data[n - cos_bit_min]; } -static INLINE void round_shift_array(int32_t *arr, int size, int bit) { - int i; - if (bit == 0) { - return; - } else { - if (bit > 0) { - for (i = 0; i < size; i++) { - arr[i] = round_shift(arr[i], bit); - } - } else { - for (i = 0; i < size; i++) { - arr[i] = arr[i] * (1 << (-bit)); - } - } +static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +#if 
CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + if (value < min_value || value > max_value) { + fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); + assert(0); } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#if DO_RANGE_CHECK_CLAMP + bit = AOMMIN(bit, 31); + return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1))); +#endif // DO_RANGE_CHECK_CLAMP + (void)bit; + return value; +} + +static INLINE int32_t round_shift(int64_t value, int bit) { + assert(bit >= 1); + return (int32_t)((value + (1ll << (bit - 1))) >> bit); } static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) { - int32_t result_32 = w0 * in0 + w1 * in1; + int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); #if CONFIG_COEFFICIENT_RANGE_CHECKING - int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1; - if (result_64 < INT32_MIN || result_64 > INT32_MAX) { - printf("%s %d overflow result_32: %d result_64: %" PRId64 - " w0: %d in0: %d w1: %d in1: " - "%d\n", - __FILE__, __LINE__, result_32, result_64, w0, in0, w1, in1); - assert(0 && "half_btf overflow"); - } + assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX); #endif - return round_shift(result_32, bit); + return round_shift(result_64, bit); } -typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, - const int8_t *cos_bit, const int8_t *stage_range); +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + return clip_pixel_highbd(dest + (int)trans, bd); +} + +typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); typedef enum TXFM_TYPE { TXFM_TYPE_DCT4, @@ -129,88 +104,82 @@ typedef enum TXFM_TYPE { TXFM_TYPE_ADST4, TXFM_TYPE_ADST8, TXFM_TYPE_ADST16, - 
TXFM_TYPE_ADST32, TXFM_TYPE_IDENTITY4, TXFM_TYPE_IDENTITY8, TXFM_TYPE_IDENTITY16, TXFM_TYPE_IDENTITY32, - TXFM_TYPE_IDENTITY64, + TXFM_TYPES, + TXFM_TYPE_INVALID, } TXFM_TYPE; -typedef struct TXFM_1D_CFG { - const int txfm_size; - const int stage_num; - - const int8_t *shift; - const int8_t *stage_range; - const int8_t *cos_bit; - const TXFM_TYPE txfm_type; -} TXFM_1D_CFG; - typedef struct TXFM_2D_FLIP_CFG { + TX_SIZE tx_size; int ud_flip; // flip upside down int lr_flip; // flip left to right - const TXFM_1D_CFG *col_cfg; - const TXFM_1D_CFG *row_cfg; + const int8_t *shift; + int8_t cos_bit_col; + int8_t cos_bit_row; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + TXFM_TYPE txfm_type_col; + TXFM_TYPE txfm_type_row; + int stage_num_col; + int stage_num_row; } TXFM_2D_FLIP_CFG; -static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; break; -#if CONFIG_EXT_TX case IDTX: case V_DCT: case H_DCT: case V_ADST: case H_ADST: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; break; case FLIPADST_DCT: case FLIPADST_ADST: case V_FLIPADST: - cfg->ud_flip = 1; - cfg->lr_flip = 0; + *ud_flip = 1; + *lr_flip = 0; break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: - cfg->ud_flip = 0; - cfg->lr_flip = 1; + *ud_flip = 0; + *lr_flip = 1; break; case FLIPADST_FLIPADST: - cfg->ud_flip = 1; - cfg->lr_flip = 1; + *ud_flip = 1; + *lr_flip = 1; break; -#endif // CONFIG_EXT_TX default: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; assert(0); } } -#if CONFIG_TXMG +static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { + get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); +} + static INLINE TX_SIZE 
av1_rotate_tx_size(TX_SIZE tx_size) { switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return TX_2X2; -#endif case TX_4X4: return TX_4X4; case TX_8X8: return TX_8X8; case TX_16X16: return TX_16X16; case TX_32X32: return TX_32X32; -#if CONFIG_TX64X64 case TX_64X64: return TX_64X64; case TX_32X64: return TX_64X32; case TX_64X32: return TX_32X64; -#endif case TX_4X8: return TX_8X4; case TX_8X4: return TX_4X8; case TX_8X16: return TX_16X8; @@ -221,6 +190,8 @@ static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) { case TX_16X4: return TX_4X16; case TX_8X32: return TX_32X8; case TX_32X8: return TX_8X32; + case TX_16X64: return TX_64X16; + case TX_64X16: return TX_16X64; default: assert(0); return TX_INVALID; } } @@ -231,7 +202,6 @@ static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) { case ADST_DCT: return DCT_ADST; case DCT_ADST: return ADST_DCT; case ADST_ADST: return ADST_ADST; -#if CONFIG_EXT_TX case FLIPADST_DCT: return DCT_FLIPADST; case DCT_FLIPADST: return FLIPADST_DCT; case FLIPADST_FLIPADST: return FLIPADST_FLIPADST; @@ -244,123 +214,46 @@ static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) { case H_ADST: return V_ADST; case V_FLIPADST: return H_FLIPADST; case H_FLIPADST: return V_FLIPADST; -#endif // CONFIG_EXT_TX -#if CONFIG_MRC_TX - case MRC_DCT: return MRC_DCT; -#endif // CONFIG_MRC_TX default: assert(0); return TX_TYPES; } } -#endif // CONFIG_TXMG - -#if CONFIG_MRC_TX -static INLINE int get_mrc_diff_mask_inter(const int16_t *diff, int diff_stride, - uint8_t *mask, int mask_stride, - int width, int height) { - // placeholder mask generation function - assert(SIGNAL_MRC_MASK_INTER); - int n_masked_vals = 0; - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - mask[i * mask_stride + j] = diff[i * diff_stride + j] > 100 ? 
1 : 0; - n_masked_vals += mask[i * mask_stride + j]; - } - } - return n_masked_vals; -} - -static INLINE int get_mrc_pred_mask_inter(const uint8_t *pred, int pred_stride, - uint8_t *mask, int mask_stride, - int width, int height) { - // placeholder mask generation function - int n_masked_vals = 0; - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - mask[i * mask_stride + j] = pred[i * pred_stride + j] > 100 ? 1 : 0; - n_masked_vals += mask[i * mask_stride + j]; - } - } - return n_masked_vals; -} - -static INLINE int get_mrc_diff_mask_intra(const int16_t *diff, int diff_stride, - uint8_t *mask, int mask_stride, - int width, int height) { - // placeholder mask generation function - assert(SIGNAL_MRC_MASK_INTRA); - int n_masked_vals = 0; - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - mask[i * mask_stride + j] = diff[i * diff_stride + j] > 100 ? 1 : 0; - n_masked_vals += mask[i * mask_stride + j]; - } - } - return n_masked_vals; -} -static INLINE int get_mrc_pred_mask_intra(const uint8_t *pred, int pred_stride, - uint8_t *mask, int mask_stride, - int width, int height) { - // placeholder mask generation function - int n_masked_vals = 0; - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - mask[i * mask_stride + j] = pred[i * pred_stride + j] > 100 ? 1 : 0; - n_masked_vals += mask[i * mask_stride + j]; - } - } - return n_masked_vals; -} - -static INLINE int get_mrc_diff_mask(const int16_t *diff, int diff_stride, - uint8_t *mask, int mask_stride, int width, - int height, int is_inter) { - if (is_inter) { - assert(USE_MRC_INTER && "MRC invalid for inter blocks"); - assert(SIGNAL_MRC_MASK_INTER); - return get_mrc_diff_mask_inter(diff, diff_stride, mask, mask_stride, width, - height); +// Utility function that returns the log of the ratio of the col and row +// sizes. 
+static INLINE int get_rect_tx_log_ratio(int col, int row) { + if (col == row) return 0; + if (col > row) { + if (col == row * 2) return 1; + if (col == row * 4) return 2; + assert(0 && "Unsupported transform size"); } else { - assert(USE_MRC_INTRA && "MRC invalid for intra blocks"); - assert(SIGNAL_MRC_MASK_INTRA); - return get_mrc_diff_mask_intra(diff, diff_stride, mask, mask_stride, width, - height); + if (row == col * 2) return -1; + if (row == col * 4) return -2; + assert(0 && "Unsupported transform size"); } + return 0; // Invalid } -static INLINE int get_mrc_pred_mask(const uint8_t *pred, int pred_stride, - uint8_t *mask, int mask_stride, int width, - int height, int is_inter) { - if (is_inter) { - assert(USE_MRC_INTER && "MRC invalid for inter blocks"); - return get_mrc_pred_mask_inter(pred, pred_stride, mask, mask_stride, width, - height); - } else { - assert(USE_MRC_INTRA && "MRC invalid for intra blocks"); - return get_mrc_pred_mask_intra(pred, pred_stride, mask, mask_stride, width, - height); - } -} - -static INLINE int is_valid_mrc_mask(int n_masked_vals, int width, int height) { - return !(n_masked_vals == 0 || n_masked_vals == (width * height)); -} -#endif // CONFIG_MRC_TX - void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, const TXFM_2D_FLIP_CFG *cfg, int bd); void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, - const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, int bd); -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size); -#if CONFIG_TX64X64 -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(TX_TYPE tx_type); -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x32_cfg(TX_TYPE tx_type); -TXFM_2D_FLIP_CFG av1_get_fwd_txfm_32x64_cfg(TX_TYPE tx_type); -#endif // CONFIG_TX64X64 -TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size); +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +void 
av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; +extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; +static INLINE int get_txw_idx(TX_SIZE tx_size) { + return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; +} +static INLINE int get_txh_idx(TX_SIZE tx_size) { + return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; +} +#define MAX_TXWH_IDX 5 #ifdef __cplusplus } #endif // __cplusplus diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c index 7bada8bb1..86b4b5d6c 100644 --- a/third_party/aom/av1/common/blockd.c +++ b/third_party/aom/av1/common/blockd.c @@ -16,109 +16,17 @@ #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" -PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b) { - if (b == 0 || b == 2) { - if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED; - - return get_y_mode(left_mi, b + 1); - } else { - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; - } -} - -PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b) { - if (b == 0 || b == 1) { - if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED; - - return get_y_mode(above_mi, b + 2); - } else { - assert(b == 2 || b == 3); - return cur_mi->bmi[b - 2].as_mode; - } +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { + if (!left_mi) return DC_PRED; + assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi)); + return left_mi->mode; } -#if CONFIG_COEF_INTERLEAVE -void av1_foreach_transformed_block_interleave( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd_y = &xd->plane[0]; - const struct macroblockd_plane *const pd_c = &xd->plane[1]; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - - const TX_SIZE tx_log2_y = mbmi->tx_size; 
- const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c); - const int tx_sz_y = (1 << tx_log2_y); - const int tx_sz_c = (1 << tx_log2_c); - - const BLOCK_SIZE plane_bsize_y = get_plane_block_size(bsize, pd_y); - const BLOCK_SIZE plane_bsize_c = get_plane_block_size(bsize, pd_c); - - const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y]; - const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c]; - const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y]; - const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c]; - - const int step_y = 1 << (tx_log2_y << 1); - const int step_c = 1 << (tx_log2_c << 1); - - const int max_4x4_w_y = - get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, pd_y->subsampling_x); - const int max_4x4_h_y = - get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, pd_y->subsampling_y); - - const int extra_step_y = ((num_4x4_w_y - max_4x4_w_y) >> tx_log2_y) * step_y; - - const int max_4x4_w_c = - get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, pd_c->subsampling_x); - const int max_4x4_h_c = - get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, pd_c->subsampling_y); - - const int extra_step_c = ((num_4x4_w_c - max_4x4_w_c) >> tx_log2_c) * step_c; - - // The max_4x4_w/h may be smaller than tx_sz under some corner cases, - // i.e. when the SB is splitted by tile boundaries. 
- const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_c = tu_num_w_c * tu_num_h_c; - - int tu_idx_c = 0; - int offset_y, row_y, col_y; - int offset_c, row_c, col_c; - - for (row_y = 0; row_y < tu_num_h_y; row_y++) { - for (col_y = 0; col_y < tu_num_w_y; col_y++) { - // luma - offset_y = (row_y * tu_num_w_y + col_y) * step_y + row_y * extra_step_y; - visit(0, offset_y, row_y * tx_sz_y, col_y * tx_sz_y, plane_bsize_y, - tx_log2_y, arg); - // chroma - if (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c; - visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg); - visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg); - tu_idx_c++; - } - } - } - - // In 422 case, it's possible that Chroma has more TUs than Luma - while (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - offset_c = tu_idx_c * step_c + row_c * extra_step_c; - visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg); - visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg); - tu_idx_c++; - } +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { + if (!above_mi) return DC_PRED; + assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi)); + return above_mi->mode; } -#endif void av1_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, @@ -128,12 +36,8 @@ void av1_foreach_transformed_block_in_plane( // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. 
const TX_SIZE tx_size = av1_get_tx_size(plane, xd); -#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const uint8_t txw_unit = tx_size_wide_unit[tx_size]; const uint8_t txh_unit = tx_size_high_unit[tx_size]; const int step = txw_unit * txh_unit; @@ -147,7 +51,8 @@ void av1_foreach_transformed_block_in_plane( int blk_row, blk_col; - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); @@ -170,95 +75,60 @@ void av1_foreach_transformed_block_in_plane( } } -#if CONFIG_LV_MAP void av1_foreach_transformed_block(const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, foreach_transformed_block_visitor visit, - void *arg) { - int plane; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 + void *arg, const int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) continue; -#else - (void)mi_row; - (void)mi_col; -#endif av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } } -#endif -#if !CONFIG_PVQ || CONFIG_VAR_TX void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, TX_SIZE tx_size, int has_eob, int aoff, - int loff) { + int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { ENTROPY_CONTEXT *const a = pd->above_context + aoff; ENTROPY_CONTEXT *const l = pd->left_context + loff; const 
int txs_wide = tx_size_wide_unit[tx_size]; const int txs_high = tx_size_high_unit[tx_size]; -#if CONFIG_CB4X4 - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; -#else - const BLOCK_SIZE bsize = AOMMAX(xd->mi[0]->mbmi.sb_type, BLOCK_8X8); -#endif - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); // above if (has_eob && xd->mb_to_right_edge < 0) { - int i; const int blocks_wide = max_block_wide(xd, plane_bsize, plane); - int above_contexts = txs_wide; - if (above_contexts + aoff > blocks_wide) - above_contexts = blocks_wide - aoff; - - for (i = 0; i < above_contexts; ++i) a[i] = has_eob; - for (i = above_contexts; i < txs_wide; ++i) a[i] = 0; + const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff); + memset(a, has_eob, sizeof(*a) * above_contexts); + memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts)); } else { - memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(a, has_eob, sizeof(*a) * txs_wide); } // left if (has_eob && xd->mb_to_bottom_edge < 0) { - int i; const int blocks_high = max_block_high(xd, plane_bsize, plane); - int left_contexts = txs_high; - if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff; - - for (i = 0; i < left_contexts; ++i) l[i] = has_eob; - for (i = left_contexts; i < txs_high; ++i) l[i] = 0; + const int left_contexts = AOMMIN(txs_high, blocks_high - loff); + memset(l, has_eob, sizeof(*l) * left_contexts); + memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts)); } else { - memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * txs_high); + memset(l, has_eob, sizeof(*l) * txs_high); } } -#endif - void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, const int num_planes) { int i; int nplanes; -#if CONFIG_CB4X4 int chroma_ref; chroma_ref = is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); - nplanes = 1 + (MAX_MB_PLANE - 1) * chroma_ref; -#else - 
(void)mi_row; - (void)mi_col; - nplanes = MAX_MB_PLANE; -#endif + nplanes = 1 + (num_planes - 1) * chroma_ref; for (i = 0; i < nplanes; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; -#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0]; memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); @@ -266,38 +136,61 @@ void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, } } -void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) { + xd->delta_lf_from_base = 0; + const int frame_lf_count = + num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0; +} + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) { + for (int p = 0; p < num_planes; ++p) { + set_default_wiener(xd->wiener_info + p); + set_default_sgrproj(xd->sgrproj_info + p); + } +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes) { int i; - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < num_planes; i++) { xd->plane[i].plane_type = get_plane_type(i); xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? 
ss_y : 0; } } -#if CONFIG_EXT_INTRA const int16_t dr_intra_derivative[90] = { - 1, 14666, 7330, 4884, 3660, 2926, 2435, 2084, 1821, 1616, 1451, 1317, 1204, - 1108, 1026, 955, 892, 837, 787, 743, 703, 666, 633, 603, 574, 548, - 524, 502, 481, 461, 443, 426, 409, 394, 379, 365, 352, 339, 327, - 316, 305, 294, 284, 274, 265, 256, 247, 238, 230, 222, 214, 207, - 200, 192, 185, 179, 172, 166, 159, 153, 147, 141, 136, 130, 124, - 119, 113, 108, 103, 98, 93, 88, 83, 78, 73, 68, 63, 59, - 54, 49, 45, 40, 35, 31, 26, 22, 17, 13, 8, 4, + // More evenly spread out angles and limited to 10-bit + // Values that are 0 will never be used + // Approx angle + 0, 0, 0, // + 1023, 0, 0, // 3, ... + 547, 0, 0, // 6, ... + 372, 0, 0, 0, 0, // 9, ... + 273, 0, 0, // 14, ... + 215, 0, 0, // 17, ... + 178, 0, 0, // 20, ... + 151, 0, 0, // 23, ... (113 & 203 are base angles) + 132, 0, 0, // 26, ... + 116, 0, 0, // 29, ... + 102, 0, 0, 0, // 32, ... + 90, 0, 0, // 36, ... + 80, 0, 0, // 39, ... + 71, 0, 0, // 42, ... + 64, 0, 0, // 45, ... (45 & 135 are base angles) + 57, 0, 0, // 48, ... + 51, 0, 0, // 51, ... + 45, 0, 0, 0, // 54, ... + 40, 0, 0, // 58, ... + 35, 0, 0, // 61, ... + 31, 0, 0, // 64, ... + 27, 0, 0, // 67, ... (67 & 157 are base angles) + 23, 0, 0, // 70, ... + 19, 0, 0, // 73, ... + 15, 0, 0, 0, 0, // 76, ... + 11, 0, 0, // 81, ... + 7, 0, 0, // 84, ... + 3, 0, 0, // 87, ... }; - -#if CONFIG_INTRA_INTERP -int av1_is_intra_filter_switchable(int angle) { - assert(angle > 0 && angle < 270); - if (angle % 45 == 0) return 0; - if (angle > 90 && angle < 180) { - return 1; - } else { - return ((angle < 90 ? 
dr_intra_derivative[angle] - : dr_intra_derivative[270 - angle]) & - 0xFF) > 0; - } -} -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h index 01a449a1c..3e8d1d6c6 100644 --- a/third_party/aom/av1/common/blockd.h +++ b/third_party/aom/av1/common/blockd.h @@ -12,7 +12,7 @@ #ifndef AV1_COMMON_BLOCKD_H_ #define AV1_COMMON_BLOCKD_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" @@ -26,104 +26,40 @@ #include "av1/common/scale.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" -#if CONFIG_PVQ -#include "av1/common/pvq.h" -#include "av1/common/pvq_state.h" -#include "av1/decoder/decint.h" -#endif + #ifdef __cplusplus extern "C" { #endif -#if (CONFIG_CHROMA_SUB8X8 || CONFIG_CHROMA_2X2) -#define SUB8X8_COMP_REF 0 -#else -#define SUB8X8_COMP_REF 1 -#endif +#define USE_B_QUANT_NO_TRELLIS 1 #define MAX_MB_PLANE 3 -#if CONFIG_COMPOUND_SEGMENT -// Set COMPOUND_SEGMENT_TYPE to one of the three -// 0: Uniform -// 1: Difference weighted -#define COMPOUND_SEGMENT_TYPE 1 -#define MAX_SEG_MASK_BITS 1 +#define MAX_DIFFWTD_MASK_BITS 1 -// SEG_MASK_TYPES should not surpass 1 << MAX_SEG_MASK_BITS +// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS typedef enum { -#if COMPOUND_SEGMENT_TYPE == 0 - UNIFORM_45 = 0, - UNIFORM_45_INV, -#elif COMPOUND_SEGMENT_TYPE == 1 DIFFWTD_38 = 0, DIFFWTD_38_INV, -#endif // COMPOUND_SEGMENT_TYPE - SEG_MASK_TYPES, -} SEG_MASK_TYPE; - -#endif // CONFIG_COMPOUND_SEGMENT + DIFFWTD_MASK_TYPES, +} DIFFWTD_MASK_TYPE; typedef enum { KEY_FRAME = 0, INTER_FRAME = 1, -#if CONFIG_OBU INTRA_ONLY_FRAME = 2, // replaces intra-only S_FRAME = 3, -#endif FRAME_TYPES, } FRAME_TYPE; static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { - (void)bsize; -#if SUB8X8_COMP_REF - return 1; -#else return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; -#endif 
// SUB8X8_COMP_REF } static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEW_NEWMV; } -#if CONFIG_PVQ -typedef struct PVQ_INFO { - int theta[PVQ_MAX_PARTITIONS]; - int qg[PVQ_MAX_PARTITIONS]; - int k[PVQ_MAX_PARTITIONS]; - od_coeff y[OD_TXSIZE_MAX * OD_TXSIZE_MAX]; - int nb_bands; - int off[PVQ_MAX_PARTITIONS]; - int size[PVQ_MAX_PARTITIONS]; - int skip_rest; - int skip_dir; - int bs; // log of the block size minus two, - // i.e. equivalent to aom's TX_SIZE - // Block skip info, indicating whether DC/AC, is coded. - PVQ_SKIP_TYPE ac_dc_coded; // bit0: DC coded, bit1 : AC coded (1 means coded) - tran_low_t dq_dc_residue; -} PVQ_INFO; - -typedef struct PVQ_QUEUE { - PVQ_INFO *buf; // buffer for pvq info, stored in encoding order - int curr_pos; // curr position to write PVQ_INFO - int buf_len; // allocated buffer length - int last_pos; // last written position of PVQ_INFO in a tile -} PVQ_QUEUE; -#endif - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -typedef struct superblock_mi_boundaries { - int mi_row_begin; - int mi_col_begin; - int mi_row_end; - int mi_col_end; -} SB_MI_BD; - -typedef struct { int16_t KERNEL[4][MAX_SB_SIZE][MAX_SB_SIZE]; } NCOBMC_KERNELS; -#endif - typedef struct { uint8_t *plane[MAX_MB_PLANE]; int stride[MAX_MB_PLANE]; @@ -135,14 +71,6 @@ static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV; } -#if CONFIG_COMPOUND_SINGLEREF -static INLINE int is_inter_singleref_comp_mode(PREDICTION_MODE mode) { - return mode >= SR_NEAREST_NEARMV && mode <= SR_NEW_NEWMV; -} -static INLINE int is_inter_anyref_comp_mode(PREDICTION_MODE mode) { - return is_inter_compound_mode(mode) || is_inter_singleref_comp_mode(mode); -} -#endif // CONFIG_COMPOUND_SINGLEREF static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { static PREDICTION_MODE lut[] = { @@ -151,42 +79,29 @@ static INLINE 
PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { MB_MODE_COUNT, // H_PRED MB_MODE_COUNT, // D45_PRED MB_MODE_COUNT, // D135_PRED - MB_MODE_COUNT, // D117_PRED - MB_MODE_COUNT, // D153_PRED - MB_MODE_COUNT, // D207_PRED - MB_MODE_COUNT, // D63_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + MB_MODE_COUNT, // D67_PRED MB_MODE_COUNT, // SMOOTH_PRED -#if CONFIG_SMOOTH_HV MB_MODE_COUNT, // SMOOTH_V_PRED MB_MODE_COUNT, // SMOOTH_H_PRED -#endif // CONFIG_SMOOTH_HV - MB_MODE_COUNT, // TM_PRED + MB_MODE_COUNT, // PAETH_PRED MB_MODE_COUNT, // NEARESTMV MB_MODE_COUNT, // NEARMV - MB_MODE_COUNT, // ZEROMV + MB_MODE_COUNT, // GLOBALMV MB_MODE_COUNT, // NEWMV -#if CONFIG_COMPOUND_SINGLEREF - NEARESTMV, // SR_NEAREST_NEARMV - // NEARESTMV, // SR_NEAREST_NEWMV - NEARMV, // SR_NEAR_NEWMV - ZEROMV, // SR_ZERO_NEWMV - NEWMV, // SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - NEARESTMV, // NEAREST_NEARESTMV - NEARMV, // NEAR_NEARMV - NEARESTMV, // NEAREST_NEWMV - NEWMV, // NEW_NEARESTMV - NEARMV, // NEAR_NEWMV - NEWMV, // NEW_NEARMV - ZEROMV, // ZERO_ZEROMV - NEWMV, // NEW_NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEARESTMV, // NEAREST_NEWMV + NEWMV, // NEW_NEARESTMV + NEARMV, // NEAR_NEWMV + NEWMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); -#if CONFIG_COMPOUND_SINGLEREF - assert(is_inter_anyref_comp_mode(mode)); -#else // !CONFIG_COMPOUND_SINGLEREF assert(is_inter_compound_mode(mode)); -#endif // CONFIG_COMPOUND_SINGLEREF return lut[mode]; } @@ -197,94 +112,54 @@ static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { MB_MODE_COUNT, // H_PRED MB_MODE_COUNT, // D45_PRED MB_MODE_COUNT, // D135_PRED - MB_MODE_COUNT, // D117_PRED - MB_MODE_COUNT, // D153_PRED - MB_MODE_COUNT, // D207_PRED - MB_MODE_COUNT, // D63_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + 
MB_MODE_COUNT, // D67_PRED MB_MODE_COUNT, // SMOOTH_PRED -#if CONFIG_SMOOTH_HV MB_MODE_COUNT, // SMOOTH_V_PRED MB_MODE_COUNT, // SMOOTH_H_PRED -#endif // CONFIG_SMOOTH_HV - MB_MODE_COUNT, // TM_PRED + MB_MODE_COUNT, // PAETH_PRED MB_MODE_COUNT, // NEARESTMV MB_MODE_COUNT, // NEARMV - MB_MODE_COUNT, // ZEROMV + MB_MODE_COUNT, // GLOBALMV MB_MODE_COUNT, // NEWMV -#if CONFIG_COMPOUND_SINGLEREF - NEARMV, // SR_NEAREST_NEARMV - // NEWMV, // SR_NEAREST_NEWMV - NEWMV, // SR_NEAR_NEWMV - NEWMV, // SR_ZERO_NEWMV - NEWMV, // SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - NEARESTMV, // NEAREST_NEARESTMV - NEARMV, // NEAR_NEARMV - NEWMV, // NEAREST_NEWMV - NEARESTMV, // NEW_NEARESTMV - NEWMV, // NEAR_NEWMV - NEARMV, // NEW_NEARMV - ZEROMV, // ZERO_ZEROMV - NEWMV, // NEW_NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEWMV, // NEAREST_NEWMV + NEARESTMV, // NEW_NEARESTMV + NEWMV, // NEAR_NEWMV + NEARMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); -#if CONFIG_COMPOUND_SINGLEREF - assert(is_inter_anyref_comp_mode(mode)); -#else // !CONFIG_COMPOUND_SINGLEREF assert(is_inter_compound_mode(mode)); -#endif // CONFIG_COMPOUND_SINGLEREF return lut[mode]; } static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - mode == SR_NEAREST_NEARMV || mode == SR_NEAR_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF mode == NEW_NEARMV); } static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - /* mode == SR_NEAREST_NEWMV || */ mode == SR_NEAR_NEWMV || - mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } static INLINE int use_masked_motion_search(COMPOUND_TYPE type) { -#if 
CONFIG_WEDGE return (type == COMPOUND_WEDGE); -#else - (void)type; - return 0; -#endif } static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { -#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE - return (type == COMPOUND_WEDGE || type == COMPOUND_SEG); -#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE - return (type == COMPOUND_WEDGE); -#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE - return (type == COMPOUND_SEG); -#endif // CONFIG_COMPOUND_SEGMENT - (void)type; - return 0; + return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); } /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ -typedef struct { - PREDICTION_MODE as_mode; - int_mv as_mv[2]; // first, second inter predictor motion vectors - int_mv pred_mv[2]; - int_mv ref_mv[2]; -} b_mode_info; - typedef int8_t MV_REFERENCE_FRAME; typedef struct { @@ -294,19 +169,17 @@ typedef struct { uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; } PALETTE_MODE_INFO; -#if CONFIG_FILTER_INTRA -#define USE_3TAP_INTRA_FILTER 1 // 0: 4-tap; 1: 3-tap typedef struct { - // 1: an ext intra mode is used; 0: otherwise. 
- uint8_t use_filter_intra_mode[PLANE_TYPES]; - FILTER_INTRA_MODE filter_intra_mode[PLANE_TYPES]; + uint8_t use_filter_intra; + FILTER_INTRA_MODE filter_intra_mode; } FILTER_INTRA_MODE_INFO; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_VAR_TX +static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED +}; + #if CONFIG_RD_DEBUG -#define TXB_COEFF_COST_MAP_SIZE (2 * MAX_MIB_SIZE) -#endif +#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) #endif typedef struct RD_STATS { @@ -325,213 +198,122 @@ typedef struct RD_STATS { uint8_t invalid_rate; #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; -#if CONFIG_VAR_TX int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] [TXB_COEFF_COST_MAP_SIZE]; -#endif // CONFIG_VAR_TX #endif // CONFIG_RD_DEBUG } RD_STATS; // This struct is used to group function args that are commonly // sent together in functions related to interinter compound modes typedef struct { -#if CONFIG_WEDGE int wedge_index; int wedge_sign; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - SEG_MASK_TYPE mask_type; + DIFFWTD_MASK_TYPE mask_type; uint8_t *seg_mask; -#endif // CONFIG_COMPOUND_SEGMENT - COMPOUND_TYPE interinter_compound_type; + COMPOUND_TYPE type; } INTERINTER_COMPOUND_DATA; -// This structure now relates to 8x8 block regions. +#define INTER_TX_SIZE_BUF_LEN 16 +#define TXK_TYPE_BUF_LEN 64 +// This structure now relates to 4x4 block regions. typedef struct MB_MODE_INFO { // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; PREDICTION_MODE mode; TX_SIZE tx_size; -#if CONFIG_VAR_TX - // TODO(jingning): This effectively assigned a separate entry for each - // 8x8 block. Apparently it takes much more space than needed. 
- TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE]; - TX_SIZE min_tx_size; -#endif + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; int8_t skip; + int8_t skip_mode; int8_t segment_id; -#if CONFIG_SUPERTX - // Minimum of all segment IDs under the current supertx block. - int8_t segment_id_supertx; -#endif // CONFIG_SUPERTX int8_t seg_id_predicted; // valid only when temporal_update is enabled -#if CONFIG_MRC_TX - int valid_mrc_mask; -#endif // CONFIG_MRC_TX - // Only for INTRA blocks UV_PREDICTION_MODE uv_mode; PALETTE_MODE_INFO palette_mode_info; -#if CONFIG_INTRABC uint8_t use_intrabc; -#endif // CONFIG_INTRABC // Only for INTER blocks InterpFilters interp_filters; MV_REFERENCE_FRAME ref_frame[2]; - TX_TYPE tx_type; -#if CONFIG_TXK_SEL - TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; -#endif -#if CONFIG_LGT_FROM_PRED - int use_lgt; -#endif -#if CONFIG_FILTER_INTRA + TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA + // The actual prediction angle is the base angle + (angle_delta * step). 
- int8_t angle_delta[2]; -#if CONFIG_INTRA_INTERP - // To-Do (huisu): this may be replaced by interp_filter - INTRA_FILTER intra_filter; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA - -#if CONFIG_INTERINTRA + int8_t angle_delta[PLANE_TYPES]; + // interintra members INTERINTRA_MODE interintra_mode; -#endif // TODO(debargha): Consolidate these flags int use_wedge_interintra; int interintra_wedge_index; int interintra_wedge_sign; // interinter members - COMPOUND_TYPE interinter_compound_type; -#if CONFIG_WEDGE - int wedge_index; - int wedge_sign; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - SEG_MASK_TYPE mask_type; -#endif // CONFIG_COMPOUND_SEGMENT + INTERINTER_COMPOUND_DATA interinter_comp; MOTION_MODE motion_mode; -#if CONFIG_MOTION_VAR int overlappable_neighbors[2]; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - // Applying different weighting kernels in ncobmc - // In current implementation, interpolation modes only defined for squared - // blocks. A rectangular block is divided into two squared blocks and each - // squared block has an interpolation mode. 
- NCOBMC_MODE ncobmc_mode[2]; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR int_mv mv[2]; - int_mv pred_mv[2]; uint8_t ref_mv_idx; -#if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition; -#endif -#if CONFIG_NEW_QUANT - int dq_off_index; - int send_dq_bit; -#endif // CONFIG_NEW_QUANT /* deringing gain *per-superblock* */ int8_t cdef_strength; - int current_q_index; -#if CONFIG_EXT_DELTA_Q - int current_delta_lf_from_base; -#if CONFIG_LOOPFILTER_LEVEL - int curr_delta_lf[FRAME_LF_COUNT]; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif + int current_qindex; + int delta_lf_from_base; + int delta_lf[FRAME_LF_COUNT]; #if CONFIG_RD_DEBUG RD_STATS rd_stats; int mi_row; int mi_col; #endif -#if CONFIG_WARPED_MOTION int num_proj_ref[2]; WarpedMotionParams wm_params[2]; -#endif // CONFIG_WARPED_MOTION -#if CONFIG_CFL // Index of the alpha Cb and alpha Cr combination int cfl_alpha_idx; // Joint sign of alpha Cb and alpha Cr int cfl_alpha_signs; -#endif - BOUNDARY_TYPE boundary_info; -#if CONFIG_LPF_SB - uint8_t filt_lvl; - int reuse_sb_lvl; - int sign; - int delta; -#endif + int compound_idx; + int comp_group_idx; } MB_MODE_INFO; -typedef struct MODE_INFO { - MB_MODE_INFO mbmi; - b_mode_info bmi[4]; -} MODE_INFO; - -#if CONFIG_INTRABC static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { return mbmi->use_intrabc; } -#endif - -static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { -#if CONFIG_CB4X4 - (void)block; - return mi->mbmi.mode; -#else - return mi->mbmi.sb_type < BLOCK_8X8 ? 
mi->bmi[block].as_mode : mi->mbmi.mode; -#endif -} -#if CONFIG_CFL static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { - static const PREDICTION_MODE uv2y[UV_INTRA_MODES] = { - DC_PRED, // UV_DC_PRED - V_PRED, // UV_V_PRED - H_PRED, // UV_H_PRED - D45_PRED, // UV_D45_PRED - D135_PRED, // UV_D135_PRED - D117_PRED, // UV_D117_PRED - D153_PRED, // UV_D153_PRED - D207_PRED, // UV_D207_PRED - D63_PRED, // UV_D63_PRED - SMOOTH_PRED, // UV_SMOOTH_PRED -#if CONFIG_SMOOTH_HV + assert(mode < UV_INTRA_MODES); + static const PREDICTION_MODE uv2y[] = { + DC_PRED, // UV_DC_PRED + V_PRED, // UV_V_PRED + H_PRED, // UV_H_PRED + D45_PRED, // UV_D45_PRED + D135_PRED, // UV_D135_PRED + D113_PRED, // UV_D113_PRED + D157_PRED, // UV_D157_PRED + D203_PRED, // UV_D203_PRED + D67_PRED, // UV_D67_PRED + SMOOTH_PRED, // UV_SMOOTH_PRED SMOOTH_V_PRED, // UV_SMOOTH_V_PRED SMOOTH_H_PRED, // UV_SMOOTH_H_PRED -#endif // CONFIG_SMOOTH_HV - TM_PRED, // UV_TM_PRED - DC_PRED, // CFL_PRED + PAETH_PRED, // UV_PAETH_PRED + DC_PRED, // UV_CFL_PRED + INTRA_INVALID, // UV_INTRA_MODES + INTRA_INVALID, // UV_MODE_INVALID }; return uv2y[mode]; } -#else -static INLINE PREDICTION_MODE get_uv_mode(PREDICTION_MODE mode) { return mode; } -#endif // CONFIG_CFL static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { -#if CONFIG_INTRABC - if (is_intrabc_block(mbmi)) return 1; -#endif - return mbmi->ref_frame[0] > INTRA_FRAME; + return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; } static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } -#if CONFIG_EXT_COMP_REFS static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ (mbmi->ref_frame[1] >= BWDREF_FRAME))); @@ -539,48 +321,60 @@ static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { - LAST_FRAME, // 
LAST_LAST2_FRAMES, - LAST_FRAME, // LAST_LAST3_FRAMES, - LAST_FRAME, // LAST_GOLDEN_FRAMES, - BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST_FRAME, // LAST_LAST2_FRAMES, + LAST_FRAME, // LAST_LAST3_FRAMES, + LAST_FRAME, // LAST_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST2_FRAME, // LAST2_LAST3_FRAMES + LAST2_FRAME, // LAST2_GOLDEN_FRAMES, + LAST3_FRAME, // LAST3_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES, }; - assert(NELEMENTS(lut) == UNIDIR_COMP_REFS); + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); return lut[ref_idx]; } static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { - LAST2_FRAME, // LAST_LAST2_FRAMES, - LAST3_FRAME, // LAST_LAST3_FRAMES, - GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, - ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST2_FRAME, // LAST_LAST2_FRAMES, + LAST3_FRAME, // LAST_LAST3_FRAMES, + GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, + ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST3_FRAME, // LAST2_LAST3_FRAMES + GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, + GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, + ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, }; - assert(NELEMENTS(lut) == UNIDIR_COMP_REFS); + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); return lut[ref_idx]; } -#endif // CONFIG_EXT_COMP_REFS -PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b); +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); -PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b); +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); -#if CONFIG_GLOBAL_MOTION -static INLINE int is_global_mv_block(const MODE_INFO *mi, int block, +static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, TransformationType type) { - PREDICTION_MODE mode = get_y_mode(mi, block); -#if GLOBAL_SUB8X8_USED - const int 
block_size_allowed = 1; -#else - const BLOCK_SIZE bsize = mi->mbmi.sb_type; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; const int block_size_allowed = AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; -#endif // GLOBAL_SUB8X8_USED - return (mode == ZEROMV || mode == ZERO_ZEROMV) && type > TRANSLATION && + return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && block_size_allowed; } -#endif // CONFIG_GLOBAL_MOTION + +#if CONFIG_MISMATCH_DEBUG +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + + (tx_blk_col << tx_size_wide_log2[0]); + *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + + (tx_blk_row << tx_size_high_log2[0]); +} +#endif enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; @@ -592,8 +386,22 @@ struct buf_2d { int stride; }; +typedef struct eob_info { + uint16_t eob; + uint16_t max_scan_line; +} eob_info; + +typedef struct { + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); + eob_info eob_data[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); +} CB_BUFFER; + typedef struct macroblockd_plane { tran_low_t *dqcoeff; + tran_low_t *dqcoeff_block; + eob_info *eob_data; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; @@ -601,56 +409,36 @@ typedef struct macroblockd_plane { struct buf_2d pre[2]; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; - int16_t seg_dequant[MAX_SEGMENTS][2]; -#if CONFIG_NEW_QUANT - dequant_val_type_nuq seg_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES] - [COEF_BANDS]; -#endif + + // The dequantizers below are true dequntizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. 
+ int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; uint8_t *color_index_map; - // number of 4x4s in current block - uint16_t n4_w, n4_h; - // log2 of n4_w, n4_h - uint8_t n4_wl, n4_hl; // block size in pixels uint8_t width, height; -#if CONFIG_AOM_QM - qm_val_t *seg_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; - qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; -#endif - // encoder - const int16_t *dequant; -#if CONFIG_NEW_QUANT - const dequant_val_type_nuq *dequant_val_nuq[QUANT_PROFILES]; -#endif // CONFIG_NEW_QUANT - -#if CONFIG_PVQ || CONFIG_DIST_8X8 - DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]); -#endif -#if CONFIG_PVQ - // PVQ: forward transformed predicted image, a reference for PVQ. - tran_low_t *pvq_ref_coeff; -#endif + qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + + // the 'dequantizers' below are not literal dequantizer values. + // They're used by encoder RDO to generate ad-hoc lambda values. + // They use a hardwired Q3 coeff shift and do not necessarily match + // the TX scale in use. 
+ const int16_t *dequant_Q3; } MACROBLOCKD_PLANE; #define BLOCK_OFFSET(x, i) \ ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0]))) typedef struct RefBuffer { - int idx; + int idx; // frame buf idx + int map_idx; // frame map idx YV12_BUFFER_CONFIG *buf; struct scale_factors sf; -#if CONFIG_VAR_REFS - int is_valid; -#endif // CONFIG_VAR_REFS } RefBuffer; -#if CONFIG_ADAPT_SCAN -typedef int16_t EobThresholdMD[TX_TYPES][EOB_THRESHOLD_NUM]; -#endif - -#if CONFIG_LOOP_RESTORATION typedef struct { DECLARE_ALIGNED(16, InterpKernel, vfilter); DECLARE_ALIGNED(16, InterpKernel, hfilter); @@ -660,77 +448,75 @@ typedef struct { int ep; int xqd[2]; } SgrprojInfo; -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_CFL -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG +#if CONFIG_DEBUG #define CFL_SUB8X8_VAL_MI_SIZE (4) #define CFL_SUB8X8_VAL_MI_SQUARE \ (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE) -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG +#endif // CONFIG_DEBUG +#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32) +#define CFL_BUF_LINE (32) +#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3) +#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4) +#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE) typedef struct cfl_ctx { - // The CfL prediction buffer is used in two steps: - // 1. Stores Q3 reconstructed luma pixels - // (only Q2 is required, but Q3 is used to avoid shifts) - // 2. 
Stores Q3 AC contributions (step1 - tx block avg) - int16_t pred_buf_q3[MAX_SB_SQUARE]; + // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid + // shifts) + uint16_t recon_buf_q3[CFL_BUF_SQUARE]; + // Q3 AC contributions (reconstructed luma pixels - tx block avg) + int16_t ac_buf_q3[CFL_BUF_SQUARE]; + + // Cache the DC_PRED when performing RDO, so it does not have to be recomputed + // for every scaling parameter + int dc_pred_is_cached[CFL_PRED_PLANES]; + // The DC_PRED cache is disable when decoding + int use_dc_pred_cache; + // Only cache the first row of the DC_PRED + int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE]; // Height and width currently used in the CfL prediction buffer. int buf_height, buf_width; - // Height and width of the chroma prediction block currently associated with - // this context - int uv_height, uv_width; - int are_parameters_computed; // Chroma subsampling int subsampling_x, subsampling_y; - // Block level DC_PRED for each chromatic plane - int dc_pred[CFL_PRED_PLANES]; - int mi_row, mi_col; // Whether the reconstructed luma pixels need to be stored int store_y; -#if CONFIG_CB4X4 +#if CONFIG_DEBUG + int rate; +#endif // CONFIG_DEBUG + int is_chroma_reference; -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - // The prediction used for sub8x8 blocks originates from multiple luma blocks, - // this array is used to validate that cfl_store() is called only once for - // each luma block - uint8_t sub8x8_val[CFL_SUB8X8_VAL_MI_SQUARE]; -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG -#endif // CONFIG_CB4X4 } CFL_CTX; -#endif // CONFIG_CFL + +typedef struct jnt_comp_params { + int use_jnt_comp_avg; + int fwd_offset; + int bck_offset; +} JNT_COMP_PARAMS; typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; - uint8_t bmode_blocks_wl; - uint8_t bmode_blocks_hl; - FRAME_COUNTS *counts; TileInfo tile; int mi_stride; - MODE_INFO **mi; - MODE_INFO *left_mi; - MODE_INFO *above_mi; + MB_MODE_INFO **mi; 
MB_MODE_INFO *left_mbmi; MB_MODE_INFO *above_mbmi; + MB_MODE_INFO *chroma_left_mbmi; + MB_MODE_INFO *chroma_above_mbmi; int up_available; int left_available; -#if CONFIG_CHROMA_SUB8X8 int chroma_up_available; int chroma_left_available; -#endif - - const aom_prob (*partition_probs)[PARTITION_TYPES - 1]; /* Distance of MB away from frame edges in subpixels (1/8th pixel) */ int mb_to_left_edge; @@ -738,40 +524,24 @@ typedef struct macroblockd { int mb_to_top_edge; int mb_to_bottom_edge; - FRAME_CONTEXT *fc; - /* pointers to reference frames */ const RefBuffer *block_refs[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; -#if CONFIG_INTRABC - /* Scale of the current frame with respect to itself */ - struct scale_factors sf_identity; -#endif - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE]; PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE]; -#if CONFIG_VAR_TX TXFM_CONTEXT *above_txfm_context; TXFM_CONTEXT *left_txfm_context; - TXFM_CONTEXT left_txfm_context_buffer[2 * MAX_MIB_SIZE]; - - TX_SIZE max_tx_size; -#if CONFIG_SUPERTX - TX_SIZE supertx_size; -#endif -#endif + TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE]; -#if CONFIG_LOOP_RESTORATION WienerInfo wiener_info[MAX_MB_PLANE]; SgrprojInfo sgrproj_info[MAX_MB_PLANE]; -#endif // CONFIG_LOOP_RESTORATION // block dimension in the unit of mode_info. uint8_t n8_w, n8_h; @@ -780,9 +550,10 @@ typedef struct macroblockd { CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; uint8_t is_sec_rect; -#if CONFIG_PVQ - daala_dec_ctx daala_dec; -#endif + // Counts of each reference frame in the above and left neighboring blocks. + // NOTE: Take into account both single and comp references. 
+ uint8_t neighbors_ref_counts[REF_FRAMES]; + FRAME_CONTEXT *tile_ctx; /* Bit depth: 8, 10, 12 */ int bd; @@ -790,27 +561,19 @@ typedef struct macroblockd { int qindex[MAX_SEGMENTS]; int lossless[MAX_SEGMENTS]; int corrupted; -#if CONFIG_AMVR - int cur_frame_mv_precision_level; -// same with that in AV1_COMMON -#endif + int cur_frame_force_integer_mv; + // same with that in AV1_COMMON struct aom_internal_error_info *error_info; -#if CONFIG_GLOBAL_MOTION - WarpedMotionParams *global_motion; -#endif // CONFIG_GLOBAL_MOTION - int prev_qindex; + const WarpedMotionParams *global_motion; int delta_qindex; int current_qindex; -#if CONFIG_EXT_DELTA_Q // Since actual frame level loop filtering level value is not available // at the beginning of the tile (only available during actual filtering) // at encoder side.we record the delta_lf (against the frame level loop // filtering level) and code the delta between previous superblock's delta // lf and current delta lf. It is equivalent to the delta between previous // superblock's actual lf and current lf. - int prev_delta_lf_from_base; - int current_delta_lf_from_base; -#if CONFIG_LOOPFILTER_LEVEL + int delta_lf_from_base; // For this experiment, we have four frame filter levels for different plane // and direction. So, to support the per superblock update, we need to add // a few more params as below. 
@@ -824,420 +587,151 @@ typedef struct macroblockd { // SEG_LVL_ALT_LF_Y_H = 2; // SEG_LVL_ALT_LF_U = 3; // SEG_LVL_ALT_LF_V = 4; - int prev_delta_lf[FRAME_LF_COUNT]; - int curr_delta_lf[FRAME_LF_COUNT]; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif -#if CONFIG_ADAPT_SCAN - const EobThresholdMD *eob_threshold_md; -#endif + int delta_lf[FRAME_LF_COUNT]; + int cdef_preset[4]; -#if CONFIG_COMPOUND_SEGMENT DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); -#endif // CONFIG_COMPOUND_SEGMENT + uint8_t *mc_buf[2]; + CFL_CTX cfl; -#if CONFIG_MRC_TX - uint8_t *mrc_mask; -#endif // CONFIG_MRC_TX + JNT_COMP_PARAMS jcp_param; -#if CONFIG_CFL - CFL_CTX *cfl; -#endif - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - uint8_t *ncobmc_pred_buf[MAX_MB_PLANE]; - int ncobmc_pred_buf_stride[MAX_MB_PLANE]; - SB_MI_BD sb_mi_bd; -#endif + uint16_t cb_offset[MAX_MB_PLANE]; + uint16_t txb_offset[MAX_MB_PLANE]; + uint16_t color_index_map_offset[2]; } MACROBLOCKD; static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 
1 : 0; } -static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, - PARTITION_TYPE partition) { - if (partition == PARTITION_INVALID) - return BLOCK_INVALID; - else - return subsize_lookup[partition][bsize]; -} - -static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = { - DCT_DCT, // DC - ADST_DCT, // V - DCT_ADST, // H - DCT_DCT, // D45 - ADST_ADST, // D135 - ADST_DCT, // D117 - DCT_ADST, // D153 - DCT_ADST, // D207 - ADST_DCT, // D63 - ADST_ADST, // SMOOTH -#if CONFIG_SMOOTH_HV - ADST_DCT, // SMOOTH_V - DCT_ADST, // SMOOTH_H -#endif // CONFIG_SMOOTH_HV - ADST_ADST, // TM -}; +static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_4X4: return 0; + case BLOCK_8X8: return 1; + case BLOCK_16X16: return 2; + case BLOCK_32X32: return 3; + case BLOCK_64X64: return 4; + case BLOCK_128X128: return 5; + default: return SQR_BLOCK_SIZES; + } +} -#if CONFIG_SUPERTX -static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) { - TX_SIZE max_tx_size = txsize_sqr_map[mbmi->tx_size]; - return tx_size_wide[max_tx_size] > - AOMMIN(block_size_wide[mbmi->sb_type], block_size_high[mbmi->sb_type]); +// Note: the input block size should be square. +// Otherwise it's considered invalid. +static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (partition == PARTITION_INVALID) { + return BLOCK_INVALID; + } else { + const int sqr_bsize_idx = get_sqr_bsize_idx(bsize); + return sqr_bsize_idx >= SQR_BLOCK_SIZES + ? 
BLOCK_INVALID + : subsize_lookup[partition][sqr_bsize_idx]; + } } -#endif // CONFIG_SUPERTX -#define USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 1 +static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, + PLANE_TYPE plane_type) { + static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D207 + ADST_DCT, // D63 + ADST_ADST, // SMOOTH + ADST_DCT, // SMOOTH_V + DCT_ADST, // SMOOTH_H + ADST_ADST, // PAETH + }; + const PREDICTION_MODE mode = + (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + assert(mode < INTRA_MODES); + return _intra_mode_to_tx_type[mode]; +} -#if CONFIG_RECT_TX static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } -#endif // CONFIG_RECT_TX static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { -#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX return bsize > BLOCK_4X4; -#else - return bsize >= BLOCK_8X8; -#endif } -#if CONFIG_MRC_TX -#define USE_MRC_INTRA 0 -#define USE_MRC_INTER 1 -#define SIGNAL_MRC_MASK_INTRA (USE_MRC_INTRA && 0) -#define SIGNAL_MRC_MASK_INTER (USE_MRC_INTER && 1) -#define SIGNAL_ANY_MRC_MASK (SIGNAL_MRC_MASK_INTRA || SIGNAL_MRC_MASK_INTER) -#endif // CONFIG_MRC_TX - -#if CONFIG_EXT_TX -#define ALLOW_INTRA_EXT_TX 1 - // Number of transform types in each set type static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { - 1, 2, -#if CONFIG_MRC_TX - 2, 3, -#endif // CONFIG_MRC_TX - 5, 7, 12, 16, -}; - -static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA, - EXT_TX_SETS_INTER)] = { - { - // Intra - EXT_TX_SET_DCTONLY, EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX, -#if CONFIG_MRC_TX - EXT_TX_SET_MRC_DCT, -#endif // CONFIG_MRC_TX - }, - { - // Inter - EXT_TX_SET_DCTONLY, EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT, - EXT_TX_SET_DCT_IDTX, -#if CONFIG_MRC_TX - EXT_TX_SET_MRC_DCT_IDTX, -#endif 
// CONFIG_MRC_TX - } + 1, 2, 5, 7, 12, 16, }; -#if CONFIG_MRC_TX static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, - }, - { - 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, - }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, }; -#else // CONFIG_MRC_TX -static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, - }, - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - }, -}; -#endif // CONFIG_MRC_TX -static INLINE TxSetType get_ext_tx_set_type(TX_SIZE tx_size, BLOCK_SIZE bs, - int is_inter, int use_reduced_set) { +static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; - const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; -#if CONFIG_CB4X4 && USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - (void)bs; if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; -#else - if (tx_size_sqr_up > TX_32X32 || bs < BLOCK_8X8) return EXT_TX_SET_DCTONLY; -#endif - if 
(use_reduced_set) - return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; -#if CONFIG_MRC_TX - if (tx_size == TX_32X32) { - if (is_inter && USE_MRC_INTER) - return EXT_TX_SET_MRC_DCT_IDTX; - else if (!is_inter && USE_MRC_INTRA) - return EXT_TX_SET_MRC_DCT; - } -#endif // CONFIG_MRC_TX if (tx_size_sqr_up == TX_32X32) return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY; - if (is_inter) + if (use_reduced_set) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; + const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; + if (is_inter) { return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT : EXT_TX_SET_ALL16); - else + } else { return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX : EXT_TX_SET_DTT4_IDTX_1DDCT); + } } // Maps tx set types to the indices. static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { - { - // Intra - 0, -1, -#if CONFIG_MRC_TX - 3, -1, -#endif // CONFIG_MRC_TX - 2, 1, -1, -1, - }, - { - // Inter - 0, 3, -#if CONFIG_MRC_TX - -1, 4, -#endif // CONFIG_MRC_TX - -1, -1, 2, 1, - }, + { // Intra + 0, -1, 2, 1, -1, -1 }, + { // Inter + 0, 3, -1, -1, 2, 1 }, }; -static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter, +static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TxSetType set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set); + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); return ext_tx_set_index[is_inter][set_type]; } -static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter, +static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const int set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set); + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); return av1_num_ext_tx_set[set_type]; } -#if CONFIG_LGT_FROM_PRED -static INLINE int is_lgt_allowed(PREDICTION_MODE mode, TX_SIZE tx_size) { - if (!LGT_FROM_PRED_INTRA && 
!is_inter_mode(mode)) return 0; - if (!LGT_FROM_PRED_INTER && is_inter_mode(mode)) return 0; - - switch (mode) { - case D45_PRED: - case D63_PRED: - case D117_PRED: - case V_PRED: -#if CONFIG_SMOOTH_HV - case SMOOTH_V_PRED: -#endif - return tx_size_wide[tx_size] <= 8; - case D135_PRED: - case D153_PRED: - case D207_PRED: - case H_PRED: -#if CONFIG_SMOOTH_HV - case SMOOTH_H_PRED: -#endif - return tx_size_high[tx_size] <= 8; - case DC_PRED: - case SMOOTH_PRED: return 0; - case TM_PRED: - default: return tx_size_wide[tx_size] <= 8 || tx_size_high[tx_size] <= 8; - } -} -#endif // CONFIG_LGT_FROM_PRED - -#if CONFIG_RECT_TX -static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { - static const char LUT[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, // BLOCK_2X2 - 0, // BLOCK_2X4 - 0, // BLOCK_4X2 -#endif - 0, // BLOCK_4X4 - 1, // BLOCK_4X8 - 1, // BLOCK_8X4 - 0, // BLOCK_8X8 - 1, // BLOCK_8X16 - 1, // BLOCK_16X8 - 0, // BLOCK_16X16 - 1, // BLOCK_16X32 - 1, // BLOCK_32X16 - 0, // BLOCK_32X32 - 1, // BLOCK_32X64 - 1, // BLOCK_64X32 - 0, // BLOCK_64X64 -#if CONFIG_EXT_PARTITION - 0, // BLOCK_64X128 - 0, // BLOCK_128X64 - 0, // BLOCK_128X128 -#endif // CONFIG_EXT_PARTITION - 0, // BLOCK_4X16 - 0, // BLOCK_16X4 - 0, // BLOCK_8X32 - 0, // BLOCK_32X8 - 0, // BLOCK_16X64 - 0, // BLOCK_64X16 -#if CONFIG_EXT_PARTITION - 0, // BLOCK_32X128 - 0, // BLOCK_128X32 -#endif // CONFIG_EXT_PARTITION - }; - - return LUT[bsize]; -} +#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) +#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? 
(t1) : (t2)) -static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi) { - return is_rect_tx_allowed_bsize(mbmi->sb_type) && - !xd->lossless[mbmi->segment_id]; -} -#endif // CONFIG_RECT_TX -#endif // CONFIG_EXT_TX - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) -static INLINE int is_quarter_tx_allowed_bsize(BLOCK_SIZE bsize) { - static const char LUT_QTTX[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, // BLOCK_2X2 - 0, // BLOCK_2X4 - 0, // BLOCK_4X2 -#endif - 0, // BLOCK_4X4 - 0, // BLOCK_4X8 - 0, // BLOCK_8X4 - 0, // BLOCK_8X8 - 1, // BLOCK_8X16 - 1, // BLOCK_16X8 - 0, // BLOCK_16X16 - 0, // BLOCK_16X32 - 0, // BLOCK_32X16 - 0, // BLOCK_32X32 - 0, // BLOCK_32X64 - 0, // BLOCK_64X32 - 0, // BLOCK_64X64 -#if CONFIG_EXT_PARTITION - 0, // BLOCK_64X128 - 0, // BLOCK_128X64 - 0, // BLOCK_128X128 -#endif // CONFIG_EXT_PARTITION - 0, // BLOCK_4X16 - 0, // BLOCK_16X4 - 0, // BLOCK_8X32 - 0, // BLOCK_32X8 - 0, // BLOCK_16X64 - 0, // BLOCK_64X16 -#if CONFIG_EXT_PARTITION - 0, // BLOCK_32X128 - 0, // BLOCK_128X32 -#endif // CONFIG_EXT_PARTITION - }; - - return LUT_QTTX[bsize]; -} - -static INLINE int is_quarter_tx_allowed(const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, - int is_inter) { - return is_quarter_tx_allowed_bsize(mbmi->sb_type) && is_inter && - !xd->lossless[mbmi->segment_id]; -} -#endif - -static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode, - int is_inter) { +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; -#if (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; -#else - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; -#endif // (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX - (void)is_inter; -#if CONFIG_VAR_TX && CONFIG_RECT_TX -#if CONFIG_CB4X4 if (bsize == BLOCK_4X4) return 
AOMMIN(max_txsize_lookup[bsize], largest_tx_size); -#else - if (bsize < BLOCK_8X8) - return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); -#endif if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) return max_rect_tx_size; else return largest_tx_size; -#elif CONFIG_EXT_TX && CONFIG_RECT_TX - if (txsize_sqr_up_map[max_rect_tx_size] <= largest_tx_size) { - return max_rect_tx_size; - } else { - return largest_tx_size; - } -#else - return AOMMIN(max_tx_size, largest_tx_size); -#endif // CONFIG_VAR_TX && CONFIG_RECT_TX } -#if CONFIG_EXT_INTRA -#define MAX_ANGLE_DELTA 3 -#define ANGLE_STEP 3 extern const int16_t dr_intra_derivative[90]; static const uint8_t mode_to_angle_map[] = { - 0, 90, 180, 45, 135, 111, 157, 203, 67, 0, 0, -#if CONFIG_SMOOTH_HV - 0, 0, -#endif // CONFIG_SMOOTH_HV + 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, }; -#if CONFIG_INTRA_INTERP -// Returns whether filter selection is needed for a given -// intra prediction angle. -int av1_is_intra_filter_switchable(int angle); -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA - -#if CONFIG_DCT_ONLY -#define FIXED_TX_TYPE 1 -#else -#define FIXED_TX_TYPE 0 -#endif // Converts block_index for given transform size to index of the block in raster // order. @@ -1261,168 +755,182 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, } static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, - const MACROBLOCKD *xd, int block_idx, + const MACROBLOCKD *xd, TX_SIZE tx_size) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; - if (CONFIG_DCT_ONLY || is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || + if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) return DCT_DCT; - return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y - ? 
get_y_mode(xd->mi[0], block_idx) - : get_uv_mode(mbmi->uv_mode)]; + return intra_mode_to_tx_type(mbmi, plane_type); +} + +static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + int subsampling_x, + int subsampling_y) { + if (bsize == BLOCK_INVALID) return BLOCK_INVALID; + return ss_size_lookup[bsize][subsampling_x][subsampling_y]; +} + +static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + TX_SIZE txs = max_txsize_rect_lookup[bsize]; + for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + txs = sub_tx_size_map[txs]; + const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + const int bw_log2 = mi_size_wide_log2[bsize]; + const int stride_log2 = bw_log2 - tx_w_log2; + const int index = + ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + assert(index < INTER_TX_SIZE_BUF_LEN); + return index; +} + +static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + TX_SIZE txs = max_txsize_rect_lookup[bsize]; + for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + txs = sub_tx_size_map[txs]; + const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + const int bw_uint_log2 = mi_size_wide_log2[bsize]; + const int stride_log2 = bw_uint_log2 - tx_w_log2; + const int index = + ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + assert(index < TXK_TYPE_BUF_LEN); + return index; +} + +static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize, + int blk_row, int blk_col, TX_SIZE tx_size, + TX_TYPE tx_type) { + const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col); + txk_type[txk_type_idx] = tx_type; + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma 
as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. + if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + const int this_index = + av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx); + txk_type[this_index] = tx_type; + } + } + } } static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int blk_row, - int blk_col, int block, TX_SIZE tx_size) { - const MODE_INFO *const mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - (void)blk_row; - (void)blk_col; -#if CONFIG_INTRABC && (!CONFIG_EXT_TX || CONFIG_TXK_SEL) - // TODO(aconverse@google.com): Handle INTRABC + EXT_TX + TXK_SEL - if (is_intrabc_block(mbmi)) return DCT_DCT; -#endif // CONFIG_INTRABC && (!CONFIG_EXT_TX || CONFIG_TXK_SEL) - -#if CONFIG_TXK_SEL + int blk_col, TX_SIZE tx_size, + int reduced_tx_set) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + TX_TYPE tx_type; - if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] >= TX_32X32) { + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { tx_type = DCT_DCT; } else { - if (plane_type == PLANE_TYPE_Y) - tx_type = mbmi->txk_type[(blk_row << 4) + blk_col]; - else if (is_inter_block(mbmi)) - tx_type = mbmi->txk_type[(blk_row << 5) + (blk_col << 1)]; - else - tx_type = intra_mode_to_tx_type_context[mbmi->uv_mode]; - } - assert(tx_type >= DCT_DCT && tx_type < TX_TYPES); - return tx_type; -#endif // CONFIG_TXK_SEL - -#if FIXED_TX_TYPE - const int block_raster_idx = 
av1_block_index_to_raster_order(tx_size, block); - return get_default_tx_type(plane_type, xd, block_raster_idx, tx_size); -#endif // FIXED_TX_TYPE - -#if CONFIG_EXT_TX -#if CONFIG_MRC_TX - if (mbmi->tx_type == MRC_DCT) { - assert(((is_inter_block(mbmi) && USE_MRC_INTER) || - (!is_inter_block(mbmi) && USE_MRC_INTRA)) && - "INVALID BLOCK TYPE FOR MRC_DCT"); if (plane_type == PLANE_TYPE_Y) { - assert(tx_size == TX_32X32); - return mbmi->tx_type; + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + tx_type = mbmi->txk_type[txk_type_idx]; + } else if (is_inter_block(mbmi)) { + // scale back to y plane's coordinate + blk_row <<= pd->subsampling_y; + blk_col <<= pd->subsampling_x; + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + tx_type = mbmi->txk_type[txk_type_idx]; + } else { + // In intra mode, uv planes don't share the same prediction mode as y + // plane, so the tx_type should not be shared + tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); } - return DCT_DCT; } -#endif // CONFIG_MRC_TX - if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 || - (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi))) - return DCT_DCT; - if (mbmi->sb_type >= BLOCK_8X8 || CONFIG_CB4X4) { - if (plane_type == PLANE_TYPE_Y) { -#if !ALLOW_INTRA_EXT_TX - if (is_inter_block(mbmi)) -#endif // ALLOW_INTRA_EXT_TX - return mbmi->tx_type; - } + assert(tx_type < TX_TYPES); + if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT; + return tx_type; +} - if (is_inter_block(mbmi)) { -// UV Inter only -#if CONFIG_CHROMA_2X2 - if (tx_size < TX_4X4) return DCT_DCT; -#endif - return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] >= TX_32X32) - ? 
DCT_DCT - : mbmi->tx_type; - } - } +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes); -#if CONFIG_CB4X4 - (void)block; -#if CONFIG_CHROMA_2X2 - if (tx_size < TX_4X4) - return DCT_DCT; - else -#endif // CONFIG_CHROMA_2X2 - return intra_mode_to_tx_type_context[get_uv_mode(mbmi->uv_mode)]; -#else // CONFIG_CB4X4 - // Sub8x8-Inter/Intra OR UV-Intra - if (is_inter_block(mbmi)) { // Sub8x8-Inter - return DCT_DCT; - } else { // Sub8x8 Intra OR UV-Intra - const int block_raster_idx = - av1_block_index_to_raster_order(tx_size, block); - return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y - ? get_y_mode(mi, block_raster_idx) - : get_uv_mode(mbmi->uv_mode)]; - } -#endif // CONFIG_CB4X4 -#else // CONFIG_EXT_TX - (void)block; -#if CONFIG_MRC_TX - if (mbmi->tx_type == MRC_DCT) { - if (plane_type == PLANE_TYPE_Y && !xd->lossless[mbmi->segment_id]) { - assert(tx_size == TX_32X32); - return mbmi->tx_type; - } - return DCT_DCT; +static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { + TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + depth++; + tx_size = sub_tx_size_map[tx_size]; } -#endif // CONFIG_MRC_TX - if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] || - txsize_sqr_map[tx_size] >= TX_32X32) - return DCT_DCT; - return mbmi->tx_type; -#endif // CONFIG_EXT_TX + return depth; } -void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); - -static INLINE int tx_size_to_depth(TX_SIZE tx_size) { - return (int)(tx_size - TX_SIZE_LUMA_MIN); +static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { + TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + assert(tx_size != TX_4X4); + int depth = 0; + while (tx_size != TX_4X4) { + depth++; + tx_size = sub_tx_size_map[tx_size]; + assert(depth < 10); + } + assert(depth <= MAX_TX_CATS); + return depth - 1; } -static INLINE TX_SIZE depth_to_tx_size(int depth) { - return (TX_SIZE)(depth + 
TX_SIZE_LUMA_MIN); +static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { + TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + TX_SIZE tx_size = max_tx_size; + for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; + return tx_size; } -static INLINE TX_SIZE av1_get_uv_tx_size(const MB_MODE_INFO *mbmi, - const struct macroblockd_plane *pd) { -#if CONFIG_CHROMA_2X2 - assert(mbmi->tx_size > TX_2X2); -#endif // CONFIG_CHROMA_2X2 - -#if CONFIG_SUPERTX - if (supertx_enabled(mbmi)) - return uvsupertx_size_lookup[txsize_sqr_map[mbmi->tx_size]] - [pd->subsampling_x][pd->subsampling_y]; -#endif // CONFIG_SUPERTX +static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { + switch (tx_size) { + case TX_64X64: + case TX_64X32: + case TX_32X64: return TX_32X32; + case TX_64X16: return TX_32X16; + case TX_16X64: return TX_16X32; + default: return tx_size; + } +} - const TX_SIZE uv_txsize = - uv_txsize_lookup[mbmi->sb_type][mbmi->tx_size][pd->subsampling_x] - [pd->subsampling_y]; - assert(uv_txsize != TX_INVALID); - return uv_txsize; +static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize]; + return av1_get_adjusted_tx_size(uv_tx); } static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id]) return TX_4X4; if (plane == 0) return mbmi->tx_size; const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; - return av1_get_uv_tx_size(mbmi, pd); -} - -static INLINE BLOCK_SIZE -get_plane_block_size(BLOCK_SIZE bsize, const struct macroblockd_plane *pd) { - return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; + return av1_get_max_uv_txsize(mbmi->sb_type, 
pd->subsampling_x, + pd->subsampling_y); } void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize); + BLOCK_SIZE bsize, const int num_planes); + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes); typedef void (*foreach_transformed_block_visitor)(int plane, int block, int blk_row, int blk_col, @@ -1433,54 +941,31 @@ void av1_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, foreach_transformed_block_visitor visit, void *arg); -#if CONFIG_LV_MAP void av1_foreach_transformed_block(const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, foreach_transformed_block_visitor visit, - void *arg); -#endif - -#if CONFIG_COEF_INTERLEAVE -static INLINE int get_max_4x4_size(int num_4x4, int mb_to_edge, - int subsampling) { - return num_4x4 + (mb_to_edge >= 0 ? 0 : mb_to_edge >> (5 + subsampling)); -} - -void av1_foreach_transformed_block_interleave( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg); -#endif + void *arg, const int num_planes); void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, TX_SIZE tx_size, int has_eob, int aoff, - int loff); + int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff); + +#define MAX_INTERINTRA_SB_SQUARE 32 * 32 +static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { + return (mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME); +} static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { -#if CONFIG_INTERINTRA - // TODO(debargha): Should this be bsize < BLOCK_LARGEST? 
- return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64); -#else - (void)bsize; - return 0; -#endif // CONFIG_INTERINTRA + return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); } static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { -#if CONFIG_INTERINTRA return (mode >= NEARESTMV) && (mode <= NEWMV); -#else - (void)mode; - return 0; -#endif // CONFIG_INTERINTRA } static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { -#if CONFIG_INTERINTRA return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME); -#else - (void)rf; - return 0; -#endif // CONFIG_INTERINTRA } static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) { @@ -1501,54 +986,30 @@ static INLINE int is_interintra_allowed_bsize_group(int group) { } static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) { - return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi); -} - -#if CONFIG_VAR_TX -static INLINE int get_vartx_max_txsize(const MB_MODE_INFO *const mbmi, - BLOCK_SIZE bsize, int subsampled) { -#if CONFIG_CB4X4 - (void)mbmi; - TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; -#else - TX_SIZE max_txsize = mbmi->sb_type < BLOCK_8X8 - ? max_txsize_rect_lookup[mbmi->sb_type] - : max_txsize_rect_lookup[bsize]; -#endif // CONFIG_C4X4 - -#if CONFIG_EXT_PARTITION && CONFIG_TX64X64 - // The decoder is designed so that it can process 64x64 luma pixels at a - // time. If this is a chroma plane with subsampling and bsize corresponds to - // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That - // mustn't be used for the subsampled plane (because it would be bigger than - // a 64x64 luma block) so we round down to TX_32X32. 
- if (subsampled && max_txsize == TX_64X64) max_txsize = TX_32X32; -#else - (void)subsampled; -#endif + return mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi); +} - return max_txsize; +static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; + if (plane == 0) return max_txsize; // luma + return av1_get_adjusted_tx_size(max_txsize); // chroma } -#endif // CONFIG_VAR_TX -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } static INLINE int is_motion_variation_allowed_compound( const MB_MODE_INFO *mbmi) { -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi) && !is_inter_singleref_comp_mode(mbmi->mode)) -#else if (!has_second_ref(mbmi)) -#endif // CONFIG_COMPOUND_SINGLEREF return 1; else return 0; } -#if CONFIG_MOTION_VAR // input: log2 of length, 0(4), 1(8), ... 
static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; @@ -1556,102 +1017,53 @@ static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { return !(mbmi->overlappable_neighbors[0] == 0 && mbmi->overlappable_neighbors[1] == 0); } -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static INLINE NCOBMC_MODE ncobmc_mode_allowed_bsize(BLOCK_SIZE bsize) { - if (bsize < BLOCK_8X8 || bsize >= BLOCK_64X64) - return NO_OVERLAP; - else - return MAX_NCOBMC_MODES; -} -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR -static INLINE MOTION_MODE motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - int block, const WarpedMotionParams *gm_params, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - const MACROBLOCKD *xd, -#endif - const MODE_INFO *mi) { - const MB_MODE_INFO *mbmi = &mi->mbmi; -#if CONFIG_AMVR - if (xd->cur_frame_mv_precision_level == 0) { -#endif -#if CONFIG_GLOBAL_MOTION +static INLINE MOTION_MODE +motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { + if (xd->cur_frame_force_integer_mv == 0) { const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; - if (is_global_mv_block(mi, block, gm_type)) return SIMPLE_TRANSLATION; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_AMVR + if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; } -#endif if (is_motion_variation_allowed_bsize(mbmi->sb_type) && is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && is_motion_variation_allowed_compound(mbmi)) { -#if CONFIG_MOTION_VAR if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; -#endif -#if CONFIG_WARPED_MOTION - if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 1 && - !av1_is_scaled(&(xd->block_refs[0]->sf))) { -#if CONFIG_AMVR - if (xd->cur_frame_mv_precision_level) { + assert(!has_second_ref(mbmi)); + if (mbmi->num_proj_ref[0] >= 1 && + (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { + 
if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; } -#endif return WARPED_CAUSAL; } - -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (ncobmc_mode_allowed_bsize(mbmi->sb_type) < NO_OVERLAP) - return NCOBMC_ADAPT_WEIGHT; - else -#endif - return OBMC_CAUSAL; -#else - return SIMPLE_TRANSLATION; -#endif // CONFIG_MOTION_VAR + return OBMC_CAUSAL; } else { return SIMPLE_TRANSLATION; } } static INLINE void assert_motion_mode_valid(MOTION_MODE mode, -#if CONFIG_GLOBAL_MOTION - int block, const WarpedMotionParams *gm_params, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION const MACROBLOCKD *xd, -#endif - const MODE_INFO *mi) { - const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - block, gm_params, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); + const MB_MODE_INFO *mbmi, + int allow_warped_motion) { + const MOTION_MODE last_motion_mode_allowed = + motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion); // Check that the input mode is not illegal if (last_motion_mode_allowed < mode) assert(0 && "Illegal motion mode selected"); } -#if CONFIG_MOTION_VAR static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { return (is_inter_block(mbmi)); } -#endif // CONFIG_MOTION_VAR -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION static INLINE int av1_allow_palette(int allow_screen_content_tools, BLOCK_SIZE sb_type) { - return allow_screen_content_tools && sb_type >= BLOCK_8X8 && - sb_type <= BLOCK_LARGEST; + return allow_screen_content_tools && block_size_wide[sb_type] <= 64 && + block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8; } // Returns sub-sampled dimensions of the given block. 
@@ -1677,10 +1089,21 @@ static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); assert(block_width >= block_cols); assert(block_height >= block_rows); - if (width) *width = block_width >> pd->subsampling_x; - if (height) *height = block_height >> pd->subsampling_y; - if (rows_within_bounds) *rows_within_bounds = block_rows >> pd->subsampling_y; - if (cols_within_bounds) *cols_within_bounds = block_cols >> pd->subsampling_x; + const int plane_block_width = block_width >> pd->subsampling_x; + const int plane_block_height = block_height >> pd->subsampling_y; + // Special handling for chroma sub8x8. + const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; + const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; + if (width) *width = plane_block_width + 2 * is_chroma_sub8_x; + if (height) *height = plane_block_height + 2 * is_chroma_sub8_y; + if (rows_within_bounds) { + *rows_within_bounds = + (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; + } + if (cols_within_bounds) { + *cols_within_bounds = + (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; + } } /* clang-format off */ @@ -1701,39 +1124,22 @@ typedef struct { ColorCost color_cost; } Av1ColorMapParam; -#if CONFIG_GLOBAL_MOTION -static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd) { - const MODE_INFO *mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; +static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { int ref; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - // First check if all modes are ZEROMV - if (mbmi->sb_type >= BLOCK_8X8 || unify_bsize) { - if (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV) return 0; - } else { - if ((mi->bmi[0].as_mode != ZEROMV && mi->bmi[0].as_mode != ZERO_ZEROMV) || - (mi->bmi[1].as_mode != ZEROMV && mi->bmi[1].as_mode != ZERO_ZEROMV) || - 
(mi->bmi[2].as_mode != ZEROMV && mi->bmi[2].as_mode != ZERO_ZEROMV) || - (mi->bmi[3].as_mode != ZEROMV && mi->bmi[3].as_mode != ZERO_ZEROMV)) - return 0; - } + // First check if all modes are GLOBALMV + if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; -#if !GLOBAL_SUB8X8_USED - if (mbmi->sb_type < BLOCK_8X8) return 0; -#endif + if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2) + return 0; // Now check if all global motion is non translational for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - if (xd->global_motion[mbmi->ref_frame[ref]].wmtype <= TRANSLATION) return 0; + if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; } return 1; } -#endif // CONFIG_GLOBAL_MOTION static INLINE PLANE_TYPE get_plane_type(int plane) { return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; @@ -1771,6 +1177,16 @@ static INLINE void transpose_int32(int32_t *dst, int dst_stride, for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; } +static INLINE int av1_get_max_eob(TX_SIZE tx_size) { + if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { + return 1024; + } + if (tx_size == TX_16X64 || tx_size == TX_64X16) { + return 512; + } + return tx_size_2d[tx_size]; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c index 397a14845..c9b974900 100644 --- a/third_party/aom/av1/common/cdef.c +++ b/third_party/aom/av1/common/cdef.c @@ -13,7 +13,8 @@ #include #include -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" + #include "aom/aom_integer.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" @@ -21,7 +22,6 @@ #include "av1/common/reconinter.h" int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) { - int r, c; int maxc, maxr; int skip = 1; maxc = cm->mi_cols - mi_col; @@ -30,38 +30,40 @@ int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) { 
maxr = AOMMIN(maxr, MI_SIZE_64X64); maxc = AOMMIN(maxc, MI_SIZE_64X64); - for (r = 0; r < maxr; r++) { - for (c = 0; c < maxc; c++) { - skip = skip && - cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c] - ->mbmi.skip; + for (int r = 0; r < maxr; r++) { + for (int c = 0; c < maxc; c++) { + skip = + skip && + cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip; } } return skip; } -static int is_8x8_block_skip(MODE_INFO **grid, int mi_row, int mi_col, +static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { int is_skip = 1; for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r) for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) - is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->mbmi.skip; + is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip; return is_skip; } int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, int filter_skip) { - int r, c; - int maxc, maxr; - MODE_INFO **grid; - int count = 0; - grid = cm->mi_grid_visible; - maxc = cm->mi_cols - mi_col; - maxr = cm->mi_rows - mi_row; + cdef_list *dlist, BLOCK_SIZE bs) { + MB_MODE_INFO **grid = cm->mi_grid_visible; + int maxc = cm->mi_cols - mi_col; + int maxr = cm->mi_rows - mi_row; - maxr = AOMMIN(maxr, MI_SIZE_64X64); - maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) + maxc = AOMMIN(maxc, MI_SIZE_128X128); + else + maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) + maxr = AOMMIN(maxr, MI_SIZE_128X128); + else + maxr = AOMMIN(maxr, MI_SIZE_64X64); const int r_step = mi_size_high[BLOCK_8X8]; const int c_step = mi_size_wide[BLOCK_8X8]; @@ -71,36 +73,25 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, assert(r_step == 1 || r_step == 2); assert(c_step == 1 || c_step == 2); - if (filter_skip) { - for (r = 0; r < maxr; r += r_step) { - for (c = 0; c < maxc; c += c_step) { + int count = 0; + + for 
(int r = 0; r < maxr; r += r_step) { + for (int c = 0; c < maxc; c += c_step) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) { dlist[count].by = r >> r_shift; dlist[count].bx = c >> c_shift; - dlist[count].skip = - is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride); + dlist[count].skip = 0; count++; } } - } else { - for (r = 0; r < maxr; r += r_step) { - for (c = 0; c < maxc; c += c_step) { - if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) { - dlist[count].by = r >> r_shift; - dlist[count].bx = c >> c_shift; - dlist[count].skip = 0; - count++; - } - } - } } return count; } void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h) { - int i, j; - for (i = 0; i < v; i++) { - for (j = 0; j < h; j++) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } @@ -109,36 +100,30 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h) { - int i, j; - for (i = 0; i < v; i++) { - for (j = 0; j < h; j++) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } -static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, +static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); } else { -#endif const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); -#if CONFIG_HIGHBITDEPTH } -#endif } static INLINE void 
fill_rect(uint16_t *dst, int dstride, int v, int h, uint16_t x) { - int i, j; - for (i = 0; i < v; i++) { - for (j = 0; j < h; j++) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { dst[i * dstride + j] = x; } } @@ -146,9 +131,8 @@ static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h) { - int i, j; - for (i = 0; i < v; i++) { - for (j = 0; j < h; j++) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } @@ -156,9 +140,8 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd) { - int fbr, fbc; - int nhfb, nvfb; - uint16_t src[CDEF_INBUF_SIZE]; + const int num_planes = av1_num_planes(cm); + DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); uint16_t *linebuf[3]; uint16_t *colbuf[3]; cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; @@ -166,48 +149,42 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int cdef_count; int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; - int stride; int mi_wide_l2[3]; int mi_high_l2[3]; int xdec[3]; int ydec[3]; - int pli; - int cdef_left; int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); - int nplanes = MAX_MB_PLANE; - int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && - xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; - nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0); + const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + num_planes); row_cdef = 
aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2); memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2); prev_row_cdef = row_cdef + 1; curr_row_cdef = prev_row_cdef + nhfb + 2; - for (pli = 0; pli < nplanes; pli++) { + for (int pli = 0; pli < num_planes; pli++) { xdec[pli] = xd->plane[pli].subsampling_x; ydec[pli] = xd->plane[pli].subsampling_y; mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; - if (xdec[pli] != ydec[pli]) nplanes = 1; } - stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; - for (pli = 0; pli < nplanes; pli++) { + const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; + for (int pli = 0; pli < num_planes; pli++) { linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride); colbuf[pli] = aom_malloc(sizeof(*colbuf) * ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) * CDEF_HBORDER); } - for (fbr = 0; fbr < nvfb; fbr++) { - for (pli = 0; pli < nplanes; pli++) { + for (int fbr = 0; fbr < nvfb; fbr++) { + for (int pli = 0; pli < num_planes; pli++) { const int block_height = (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER; fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER, CDEF_VERY_LARGE); } - cdef_left = 1; - for (fbc = 0; fbc < nhfb; fbc++) { + int cdef_left = 1; + for (int fbc = 0; fbc < nhfb; fbc++) { int level, sec_strength; int uv_level, uv_sec_strength; int nhb, nvb; @@ -217,38 +194,43 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MI_SIZE_64X64 * fbc] == NULL || cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc] - ->mbmi.cdef_strength == -1) { + ->cdef_strength == -1) { cdef_left = 0; continue; } if (!cdef_left) cstart = -CDEF_HBORDER; nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc); nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr); - int tile_top, tile_left, tile_bottom, tile_right; - int mi_idx = MI_SIZE_64X64 * fbr * cm->mi_stride + 
MI_SIZE_64X64 * fbc; - MODE_INFO *const mi_tl = cm->mi + mi_idx; - BOUNDARY_TYPE boundary_tl = mi_tl->mbmi.boundary_info; - tile_top = boundary_tl & TILE_ABOVE_BOUNDARY; - tile_left = boundary_tl & TILE_LEFT_BOUNDARY; + int frame_top, frame_left, frame_bottom, frame_right; + + int mi_row = MI_SIZE_64X64 * fbr; + int mi_col = MI_SIZE_64X64 * fbc; + // for the current filter block, it's top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - eg. if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + frame_top = (mi_row == 0) ? 1 : 0; + frame_left = (mi_col == 0) ? 1 : 0; - if (fbr != nvfb - 1 && - (&cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride])) - tile_bottom = cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride] - .mbmi.boundary_info & - TILE_BOTTOM_BOUNDARY; + if (fbr != nvfb - 1) + frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0; else - tile_bottom = 1; + frame_bottom = 1; - if (fbc != nhfb - 1 && (&cm->mi[mi_idx + MI_SIZE_64X64 - 1])) - tile_right = cm->mi[mi_idx + MI_SIZE_64X64 - 1].mbmi.boundary_info & - TILE_RIGHT_BOUNDARY; + if (fbc != nhfb - 1) + frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 
1 : 0; else - tile_right = 1; + frame_right = 1; const int mbmi_cdef_strength = cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc] - ->mbmi.cdef_strength; + ->cdef_strength; level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; sec_strength = cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; @@ -259,23 +241,15 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, uv_sec_strength += uv_sec_strength == 3; if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) || - (cdef_count = sb_compute_cdef_list( - cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, -#if CONFIG_CDEF_SINGLEPASS - (level & 1) || (uv_level & 1))) == 0) -#else - get_filter_skip(level) || get_filter_skip(uv_level))) == 0) -#endif - { + (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, dlist, + BLOCK_64X64)) == 0) { cdef_left = 0; continue; } curr_row_cdef[fbc] = 1; - for (pli = 0; pli < nplanes; pli++) { -#if !CONFIG_CDEF_SINGLEPASS - uint16_t dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]; -#endif + for (int pli = 0; pli < num_planes; pli++) { int coffset; int rend, cend; int pri_damping = cm->cdef_pri_damping; @@ -284,10 +258,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int vsize = nvb << mi_high_l2[pli]; if (pli) { - if (chroma_cdef) - level = uv_level; - else - level = 0; + level = uv_level; sec_strength = uv_sec_strength; } @@ -375,81 +346,57 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER, coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize); - if (tile_top) { + if (frame_top) { fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); } - if (tile_left) { + if (frame_left) { fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (tile_bottom) { + if (frame_bottom) { fill_rect(&src[(vsize + CDEF_VBORDER) * 
CDEF_BSTRIDE], CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); } - if (tile_right) { + if (frame_right) { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } -#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { cdef_filter_fb( -#if CONFIG_CDEF_SINGLEPASS NULL, - &CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf) -#else - (uint8_t *)&CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf) -#endif - [xd->plane[pli].dst.stride * - (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + - (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], -#if CONFIG_CDEF_SINGLEPASS + &CONVERT_TO_SHORTPTR( + xd->plane[pli] + .dst.buf)[xd->plane[pli].dst.stride * + (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], xd->plane[pli].dst.stride, -#else - xd->plane[pli].dst.stride, dst, -#endif &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, -#if CONFIG_CDEF_SINGLEPASS sec_strength, pri_damping, sec_damping, coeff_shift); -#else - sec_strength, sec_damping, pri_damping, coeff_shift, 0, 1); -#endif } else { -#endif cdef_filter_fb( &xd->plane[pli] .dst.buf[xd->plane[pli].dst.stride * (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], -#if CONFIG_CDEF_SINGLEPASS NULL, xd->plane[pli].dst.stride, -#else - xd->plane[pli].dst.stride, dst, -#endif &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, -#if CONFIG_CDEF_SINGLEPASS sec_strength, pri_damping, sec_damping, coeff_shift); -#else - sec_strength, sec_damping, pri_damping, coeff_shift, 0, 0); -#endif - -#if CONFIG_HIGHBITDEPTH } -#endif } cdef_left = 1; } { - unsigned char *tmp; - tmp = prev_row_cdef; + unsigned char *tmp = prev_row_cdef; prev_row_cdef = curr_row_cdef; curr_row_cdef = tmp; } } aom_free(row_cdef); - for (pli = 0; pli < nplanes; pli++) { + for (int pli = 0; pli < num_planes; 
pli++) { aom_free(linebuf[pli]); aom_free(colbuf[pli]); } diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h index 9de24bf92..092230de9 100644 --- a/third_party/aom/av1/common/cdef.h +++ b/third_party/aom/av1/common/cdef.h @@ -11,12 +11,13 @@ #ifndef AV1_COMMON_CDEF_H_ #define AV1_COMMON_CDEF_H_ -#define CDEF_STRENGTH_BITS 7 +#define CDEF_STRENGTH_BITS 6 -#define CDEF_PRI_STRENGTHS 32 +#define CDEF_PRI_STRENGTHS 16 #define CDEF_SEC_STRENGTHS 4 -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "av1/common/cdef_block.h" @@ -38,7 +39,7 @@ extern "C" { int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col); int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, int filter_skip); + cdef_list *dlist, BLOCK_SIZE bsize); void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd); void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c index aaa32c950..df1de89be 100644 --- a/third_party/aom/av1/common/cdef_block.c +++ b/third_party/aom/av1/common/cdef_block.c @@ -12,28 +12,13 @@ #include #include -#ifdef HAVE_CONFIG_H -#include "./config.h" -#endif +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "./cdef.h" +#include "av1/common/cdef.h" /* Generated from gen_filter_tables.c. 
*/ -#if !CONFIG_CDEF_SINGLEPASS || CDEF_FULL -const int cdef_directions[8][3] = { - { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 }, - { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 }, - { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2, 0 * CDEF_BSTRIDE + 3 }, - { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2, 1 * CDEF_BSTRIDE + 3 }, - { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2, 3 * CDEF_BSTRIDE + 3 }, - { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1, 3 * CDEF_BSTRIDE + 1 }, - { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0, 3 * CDEF_BSTRIDE + 0 }, - { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1, 3 * CDEF_BSTRIDE - 1 } -}; -#else -const int cdef_directions[8][2] = { +DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = { { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 }, @@ -43,7 +28,6 @@ const int cdef_directions[8][2] = { { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 } }; -#endif /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. The search minimizes the weighted variance along all the lines in a @@ -123,65 +107,38 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, return best_dir; } -#if CONFIG_CDEF_SINGLEPASS -#if CDEF_FULL -const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } }; -const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } }; -#else const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } }; -#endif /* Smooth in the direction detected. 
*/ -#if CDEF_CAP -void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, - const uint16_t *in, int pri_strength, int sec_strength, - int dir, int pri_damping, int sec_damping, int bsize, - UNUSED int max_unused) -#else void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, - int max) -#endif -{ + AOM_UNUSED int max_unused, int coeff_shift) { int i, j, k; const int s = CDEF_BSTRIDE; - const int *pri_taps = cdef_pri_taps[pri_strength & 1]; - const int *sec_taps = cdef_sec_taps[pri_strength & 1]; - for (i = 0; i < 4 << (bsize == BLOCK_8X8); i++) { - for (j = 0; j < 4 << (bsize == BLOCK_8X8); j++) { + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) { + for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) { int16_t sum = 0; int16_t y; int16_t x = in[i * s + j]; -#if CDEF_CAP int max = x; int min = x; -#endif -#if CDEF_FULL - for (k = 0; k < 3; k++) -#else - for (k = 0; k < 2; k++) -#endif - { + for (k = 0; k < 2; k++) { int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); -#if CDEF_CAP if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); min = AOMMIN(p0, min); min = AOMMIN(p1, min); -#endif -#if CDEF_FULL - if (k == 2) continue; -#endif int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]]; int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]]; int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]]; int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]]; -#if CDEF_CAP if (s0 != 
CDEF_VERY_LARGE) max = AOMMAX(s0, max); if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); @@ -190,17 +147,12 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, min = AOMMIN(s1, min); min = AOMMIN(s2, min); min = AOMMIN(s3, min); -#endif sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); } -#if CDEF_CAP y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max); -#else - y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), 0, max); -#endif if (dst8) dst8[i * dstride + j] = (uint8_t)y; else @@ -209,67 +161,6 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, } } -#else - -/* Smooth in the direction detected. */ -void cdef_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir, int damping) { - int i; - int j; - int k; - static const int taps[3] = { 3, 2, 1 }; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - int16_t sum; - int16_t xx; - int16_t yy; - xx = in[i * CDEF_BSTRIDE + j]; - sum = 0; - for (k = 0; k < 3; k++) { - int16_t p0; - int16_t p1; - p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx; - p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx; - sum += taps[k] * constrain(p0, threshold, damping); - sum += taps[k] * constrain(p1, threshold, damping); - } - sum = (sum + 8) >> 4; - yy = xx + sum; - y[i * ystride + j] = yy; - } - } -} - -/* Smooth in the direction detected. 
*/ -void cdef_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir, int damping) { - int i; - int j; - int k; - static const int taps[2] = { 4, 1 }; - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - int16_t sum; - int16_t xx; - int16_t yy; - xx = in[i * CDEF_BSTRIDE + j]; - sum = 0; - for (k = 0; k < 2; k++) { - int16_t p0; - int16_t p1; - p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx; - p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx; - sum += taps[k] * constrain(p0, threshold, damping); - sum += taps[k] * constrain(p1, threshold, damping); - } - sum = (sum + 8) >> 4; - yy = xx + sum; - y[i * ystride + j] = yy; - } - } -} -#endif - /* Compute the primary filter strength for an 8x8 block based on the directional variance difference. A high variance difference means that we have a highly directional pattern (e.g. a high contrast @@ -282,172 +173,26 @@ static INLINE int adjust_strength(int strength, int32_t var) { return var ? 
(strength * (4 + i) + 8) >> 4 : 0; } -#if !CONFIG_CDEF_SINGLEPASS -void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, - int sstride) { - int i, j; - for (i = 0; i < 8; i++) - for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j]; -} - -void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, - int sstride) { - int i, j; - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j]; -} - -static void copy_block_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, - cdef_list *dlist, int cdef_count, - int bsize) { - int bi, bx, by; - - if (bsize == BLOCK_8X8) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_8x8_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, - &src[bi << (3 + 3)], 8); - } - } else if (bsize == BLOCK_4X8) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride, - &src[bi << (3 + 2)], 4); - copy_4x4_16bit_to_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], - dstride, &src[(bi << (3 + 2)) + 4 * 4], 4); - } - } else if (bsize == BLOCK_8X4) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride, - &src[bi << (2 + 3)], 8); - copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], - dstride, &src[(bi << (2 + 3)) + 4], 8); - } - } else { - assert(bsize == BLOCK_4X4); - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride, - &src[bi << (2 + 2)], 4); - } - } -} - -void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, - int sstride) { - int i, j; - for (i = 0; i < 8; i++) - for (j = 0; j < 8; j++) - dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; -} - -void 
copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, - int sstride) { - int i, j; - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) - dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; -} - -static void copy_block_16bit_to_8bit(uint8_t *dst, int dstride, - const uint16_t *src, cdef_list *dlist, - int cdef_count, int bsize) { - int bi, bx, by; - if (bsize == BLOCK_8X8) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, - &src[bi << (3 + 3)], 8); - } - } else if (bsize == BLOCK_4X8) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 2)], dstride, - &src[bi << (3 + 2)], 4); - copy_4x4_16bit_to_8bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], - dstride, &src[(bi << (3 + 2)) + 4 * 4], 4); - } - } else if (bsize == BLOCK_8X4) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3)], dstride, - &src[bi << (2 + 3)], 8); - copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride, - &src[(bi << (2 + 3)) + 4], 8); - } - } else { - assert(bsize == BLOCK_4X4); - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride, - &src[bi << (2 * 2)], 4); - } - } -} - -int get_filter_skip(int level) { - int filter_skip = level & 1; - if (level == 1) filter_skip = 0; - return filter_skip; -} - -void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int sec_damping, int pri_damping, - int coeff_shift, int skip_dering, int hbd) { -#else - void cdef_filter_fb(uint8_t 
*dst8, uint16_t *dst16, int dstride, uint16_t *in, int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, cdef_list *dlist, int cdef_count, int level, int sec_strength, int pri_damping, int sec_damping, int coeff_shift) { -#endif int bi; int bx; int by; int bsize, bsizex, bsizey; -#if CONFIG_CDEF_SINGLEPASS - int pri_strength = (level >> 1) << coeff_shift; - int filter_skip = level & 1; - if (!pri_strength && !sec_strength && filter_skip) { - pri_strength = 19 << coeff_shift; - sec_strength = 7 << coeff_shift; - } -#else - int threshold = (level >> 1) << coeff_shift; - int filter_skip = get_filter_skip(level); - if (level == 1) threshold = 31 << coeff_shift; - - cdef_direction_func cdef_direction[] = { cdef_direction_4x4, - cdef_direction_8x8 }; -#endif + int pri_strength = level << coeff_shift; + sec_strength <<= coeff_shift; sec_damping += coeff_shift - (pli != AOM_PLANE_Y); pri_damping += coeff_shift - (pli != AOM_PLANE_Y); bsize = ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); bsizex = 3 - xdec; bsizey = 3 - ydec; -#if CONFIG_CDEF_SINGLEPASS - if (dirinit && pri_strength == 0 && sec_strength == 0) -#else - if (!skip_dering) -#endif - { -#if CONFIG_CDEF_SINGLEPASS + if (dirinit && pri_strength == 0 && sec_strength == 0) { // If we're here, both primary and secondary strengths are 0, and // we still haven't written anything to y[] yet, so we just copy // the input to y[]. 
This is necessary only for av1_cdef_search() @@ -455,97 +200,16 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; -#else - if (pli == 0) { - if (!dirinit || !*dirinit) { - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], - CDEF_BSTRIDE, &var[by][bx], coeff_shift); - } - if (dirinit) *dirinit = 1; - } - } - // Only run dering for non-zero threshold (which is always the case for - // 4:2:2 or 4:4:0). If we don't dering, we still need to eventually write - // something out in y[] later. - if (threshold != 0) { - assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4); - for (bi = 0; bi < cdef_count; bi++) { - int t = !filter_skip && dlist[bi].skip ? 0 : threshold; - by = dlist[bi].by; - bx = dlist[bi].bx; - (cdef_direction[bsize == BLOCK_8X8])( - &y[bi << (bsizex + bsizey)], 1 << bsizex, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], - pli ? t : adjust_strength(t, var[by][bx]), dir[by][bx], - pri_damping); - } - } - } - - if (sec_strength) { - if (threshold && !skip_dering) - copy_block_16bit_to_16bit(in, CDEF_BSTRIDE, y, dlist, cdef_count, bsize); - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - int py = by << bsizey; - int px = bx << bsizex; - - if (!filter_skip && dlist[bi].skip) continue; - if (!dst || hbd) { - // 16 bit destination if high bitdepth or 8 bit destination not given - (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd - : aom_clpf_hblock_hbd)( - dst ? (uint16_t *)dst + py * dstride + px - : &y[bi << (bsizex + bsizey)], - in + py * CDEF_BSTRIDE + px, dst && hbd ? dstride : 1 << bsizex, - CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift, - sec_damping); - } else { - // Do clpf and write the result to an 8 bit destination - (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? 
aom_clpf_block - : aom_clpf_hblock)( - dst + py * dstride + px, in + py * CDEF_BSTRIDE + px, dstride, - CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift, - sec_damping); - } - } - } else if (threshold != 0) { - // No clpf, so copy instead - if (hbd) { - copy_block_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist, cdef_count, - bsize); - } else { - copy_block_16bit_to_8bit(dst, dstride, y, dlist, cdef_count, bsize); - } - } else if (dirinit) { - // If we're here, both dering and clpf are off, and we still haven't written - // anything to y[] yet, so we just copy the input to y[]. This is necessary - // only for av1_cdef_search() and only av1_cdef_search() sets dirinit. - for (bi = 0; bi < cdef_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; -#endif int iy, ix; // TODO(stemidts/jmvalin): SIMD optimisations for (iy = 0; iy < 1 << bsizey; iy++) for (ix = 0; ix < 1 << bsizex; ix++) -#if CONFIG_CDEF_SINGLEPASS dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] = -#else - y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] = -#endif in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix]; } -#if CONFIG_CDEF_SINGLEPASS return; -#endif } -#if CONFIG_CDEF_SINGLEPASS if (pli == 0) { if (!dirinit || !*dirinit) { for (bi = 0; bi < cdef_count; bi++) { @@ -557,19 +221,28 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, if (dirinit) *dirinit = 1; } } + if (pli == 1 && xdec != ydec) { + for (bi = 0; bi < cdef_count; bi++) { + static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; + static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; + } + } - assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4); for (bi = 0; bi < cdef_count; bi++) { - int t = !filter_skip && dlist[bi].skip ? 0 : pri_strength; - int s = !filter_skip && dlist[bi].skip ? 0 : sec_strength; + int t = dlist[bi].skip ? 
0 : pri_strength; + int s = dlist[bi].skip ? 0 : sec_strength; by = dlist[bi].by; bx = dlist[bi].bx; if (dst8) - cdef_filter_block( - &dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, dstride, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], - (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, - pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1); + cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, + dstride, + &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], + (pli ? t : adjust_strength(t, var[by][bx])), s, + t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize, + (256 << coeff_shift) - 1, coeff_shift); else cdef_filter_block( NULL, @@ -578,7 +251,7 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, dirinit ? 1 << bsizex : dstride, &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, - pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1); + pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1, + coeff_shift); } -#endif } diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h index bf277faad..81c6da077 100644 --- a/third_party/aom/av1/common/cdef_block.h +++ b/third_party/aom/av1/common/cdef_block.h @@ -12,43 +12,28 @@ #if !defined(_CDEF_BLOCK_H) #define _CDEF_BLOCK_H (1) -#include "./odintrin.h" +#include "av1/common/odintrin.h" #define CDEF_BLOCKSIZE 64 #define CDEF_BLOCKSIZE_LOG2 6 -#define CDEF_NBLOCKS (CDEF_BLOCKSIZE / 8) -#if CONFIG_CDEF_SINGLEPASS +#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) -#endif /* We need to buffer three vertical lines. */ #define CDEF_VBORDER (3) /* We only need to buffer three horizontal pixels too, but let's align to 16 bytes (8 x 16 bits) to make vectorization easier. 
*/ #define CDEF_HBORDER (8) -#define CDEF_BSTRIDE ALIGN_POWER_OF_TWO(CDEF_BLOCKSIZE + 2 * CDEF_HBORDER, 3) +#define CDEF_BSTRIDE \ + ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) #define CDEF_VERY_LARGE (30000) -#define CDEF_INBUF_SIZE (CDEF_BSTRIDE * (CDEF_BLOCKSIZE + 2 * CDEF_VBORDER)) - -#if CONFIG_CDEF_SINGLEPASS -// Filter configuration -#define CDEF_CAP 1 // 1 = Cap change to largest diff -#define CDEF_FULL 0 // 1 = 7x7 filter, 0 = 5x5 filter +#define CDEF_INBUF_SIZE \ + (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) -#if CDEF_FULL -extern const int cdef_pri_taps[2][3]; -extern const int cdef_sec_taps[2][2]; -extern const int cdef_directions[8][3]; -#else extern const int cdef_pri_taps[2][2]; extern const int cdef_sec_taps[2][2]; -extern const int cdef_directions[8][2]; -#endif - -#else // CONFIG_CDEF_SINGLEPASS -extern const int cdef_directions[8][3]; -#endif +DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); typedef struct { uint8_t by; @@ -56,35 +41,19 @@ typedef struct { uint8_t skip; } cdef_list; -#if CONFIG_CDEF_SINGLEPASS typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max); + int sec_damping, int bsize, int max, + int coeff_shift); void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, int bsize); -#else -typedef void (*cdef_direction_func)(uint16_t *y, int ystride, - const uint16_t *in, int threshold, int dir, - int damping); -int get_filter_skip(int level); -#endif - -#if CONFIG_CDEF_SINGLEPASS void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, cdef_list *dlist, int cdef_count, int level, int sec_strength, int pri_damping, int sec_damping, int coeff_shift); 
-#else -void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int sec_damping, int pri_damping, - int coeff_shift, int skip_dering, int hbd); -#endif #endif diff --git a/third_party/aom/av1/common/cdef_block_avx2.c b/third_party/aom/av1/common/cdef_block_avx2.c index 5e48045c0..e2b85b3e2 100644 --- a/third_party/aom/av1/common/cdef_block_avx2.c +++ b/third_party/aom/av1/common/cdef_block_avx2.c @@ -11,4 +11,4 @@ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_avx2 -#include "./cdef_block_simd.h" +#include "av1/common/cdef_block_simd.h" diff --git a/third_party/aom/av1/common/cdef_block_neon.c b/third_party/aom/av1/common/cdef_block_neon.c index 030b32531..2d6bc65e3 100644 --- a/third_party/aom/av1/common/cdef_block_neon.c +++ b/third_party/aom/av1/common/cdef_block_neon.c @@ -11,4 +11,4 @@ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_neon -#include "./cdef_block_simd.h" +#include "av1/common/cdef_block_simd.h" diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h index aa7d3c3ca..d24a7c0fa 100644 --- a/third_party/aom/av1/common/cdef_block_simd.h +++ b/third_party/aom/av1/common/cdef_block_simd.h @@ -9,8 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" -#include "./cdef_block.h" +#include "config/av1_rtcd.h" + +#include "av1/common/cdef_block.h" /* partial A is a 16-bit vector of the form: [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: @@ -167,39 +168,22 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); } -#if defined(__SSE4_1__) /* Compute "mostly vertical" directions. 
*/ - __m128i dir47 = compute_directions(lines, cost + 4); + v128 dir47 = compute_directions(lines, cost + 4); array_reverse_transpose_8x8(lines, lines); /* Compute "mostly horizontal" directions. */ - __m128i dir03 = compute_directions(lines, cost); - - __m128i max = _mm_max_epi32(dir03, dir47); - max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2))); - max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1))); - best_cost = _mm_cvtsi128_si32(max); - __m128i t = - _mm_packs_epi32(_mm_cmpeq_epi32(max, dir03), _mm_cmpeq_epi32(max, dir47)); - best_dir = _mm_movemask_epi8(_mm_packs_epi16(t, t)); + v128 dir03 = compute_directions(lines, cost); + + v128 max = v128_max_s32(dir03, dir47); + max = v128_max_s32(max, v128_align(max, max, 8)); + max = v128_max_s32(max, v128_align(max, max, 4)); + best_cost = v128_low_u32(max); + v128 t = + v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); + best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros -#else - /* Compute "mostly vertical" directions. */ - compute_directions(lines, cost + 4); - - array_reverse_transpose_8x8(lines, lines); - - /* Compute "mostly horizontal" directions. */ - compute_directions(lines, cost); - - for (i = 0; i < 8; i++) { - if (cost[i] > best_cost) { - best_cost = cost[i]; - best_dir = i; - } - } -#endif /* Difference between the optimal variance and the variance along the orthogonal direction. Again, the sum(x^2) terms cancel out. 
*/ @@ -211,17 +195,16 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, } // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) -SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold, +SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, unsigned int adjdamp) { - v128 diff = v128_sub_16(a, b); - const v128 sign = v128_shr_n_s16(diff, 15); - diff = v128_abs_s16(diff); - const v128 s = - v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp)); - return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign); + v256 diff = v256_sub_16(a, b); + const v256 sign = v256_shr_n_s16(diff, 15); + diff = v256_abs_s16(diff); + const v256 s = + v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); + return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); } -#if CONFIG_CDEF_SINGLEPASS // sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, unsigned int adjdamp) { @@ -236,37 +219,24 @@ SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, sign); } -#if CDEF_CAP -void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, - int pri_damping, int sec_damping, - UNUSED int max_unused) -#else void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - int max) -#endif -{ + AOM_UNUSED int max_unused, + int coeff_shift) { v128 p0, p1, p2, p3; v256 sum, row, tap, res; -#if CDEF_CAP v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); -#endif int po1 = cdef_directions[dir][0]; int po2 = cdef_directions[dir][1]; -#if CDEF_FULL - int po3 = cdef_directions[dir][2]; -#endif int s1o1 = cdef_directions[(dir + 2) & 7][0]; int s1o2 = cdef_directions[(dir + 2) & 7][1]; int s2o1 = cdef_directions[(dir + 6) & 
7][0]; int s2o2 = cdef_directions[(dir + 6) & 7][1]; - const int *pri_taps = cdef_pri_taps[pri_strength & 1]; - const int *sec_taps = cdef_sec_taps[pri_strength & 1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -278,9 +248,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, v64_load_aligned(&in[1 * CDEF_BSTRIDE]), v64_load_aligned(&in[2 * CDEF_BSTRIDE]), v64_load_aligned(&in[3 * CDEF_BSTRIDE])); -#if CDEF_CAP max = min = row; -#endif if (pri_strength) { // Primary near taps @@ -288,19 +256,15 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, pri_strength, pri_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) @@ -313,52 +277,21 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, pri_strength, pri_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]), 
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]), v256_from_v128(v128_ziphi_8(p0, p1), v128_ziplo_8(p0, p1)))); - -#if CDEF_FULL - // Primary extra taps - tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po3]), - v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po3]), - v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po3]), - v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po3])); -#if CDEF_CAP - max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); - min = v256_min_s16(min, tap); -#endif - p0 = constrain(tap, row, pri_strength, pri_damping); - tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po3]), - v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po3]), - v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po3]), - v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po3])); -#if CDEF_CAP - max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); - min = v256_min_s16(min, tap); -#endif - p1 = constrain(tap, row, pri_strength, pri_damping); - - // sum += pri_taps[2] * (p0 + p1) - sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]), - v256_from_v128(v128_ziphi_8(p0, p1), - v128_ziplo_8(p0, p1)))); -#endif } if (sec_strength) { @@ -367,37 +300,29 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE 
- s1o1]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p2 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p3 = constrain(tap, row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) @@ -412,37 +337,29 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, sec_strength, sec_damping); tap = 
v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p2 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p3 = constrain(tap, row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) @@ -459,11 +376,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, res = v256_add_16(sum, v256_dup_16(8)); res = v256_shr_n_s16(res, 4); res = v256_add_16(row, res); -#if CDEF_CAP res = v256_min_s16(v256_max_s16(res, min), max); -#else - res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max)); -#endif res = v256_pack_s16_u8(res, res); p0 = v256_low_v128(res); @@ -473,38 +386,25 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0))); } -#if CDEF_CAP void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - UNUSED int max_unused) -#else -void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, - int pri_damping, int sec_damping, - int max) -#endif -{ + AOM_UNUSED int max_unused, + int coeff_shift) { int i; v128 p0, p1, p2, p3; v256 sum, row, res, tap; -#if CDEF_CAP v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); -#endif int po1 = 
cdef_directions[dir][0]; int po2 = cdef_directions[dir][1]; -#if CDEF_FULL - int po3 = cdef_directions[dir][2]; -#endif int s1o1 = cdef_directions[(dir + 2) & 7][0]; int s1o2 = cdef_directions[(dir + 2) & 7][1]; int s2o1 = cdef_directions[(dir + 6) & 7][0]; int s2o2 = cdef_directions[(dir + 6) & 7][1]; - const int *pri_taps = cdef_pri_taps[pri_strength & 1]; - const int *sec_taps = cdef_sec_taps[pri_strength & 1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -515,25 +415,19 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); -#if CDEF_CAP max = min = row; -#endif // Primary near taps tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, pri_strength, pri_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) @@ -545,18 +439,14 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, pri_strength, pri_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - 
po2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) @@ -564,63 +454,30 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, v256_from_v128(v128_ziphi_8(p0, p1), v128_ziplo_8(p0, p1)))); -#if CDEF_FULL - // Primary extra taps - tap = - v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]), - v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3])); -#if CDEF_CAP - max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); - min = v256_min_s16(min, tap); -#endif - p0 = constrain(tap, row, pri_strength, pri_damping); - tap = - v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]), - v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3])); -#if CDEF_CAP - max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); - min = v256_min_s16(min, tap); -#endif - p1 = constrain(tap, row, pri_strength, pri_damping); - - // sum += pri_taps[2] * (p0 + p1) - sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]), - v256_from_v128(v128_ziphi_8(p0, p1), - v128_ziplo_8(p0, p1)))); -#endif - // Secondary near taps tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), v128_load_unaligned(&in[(i + 1) * 
CDEF_BSTRIDE + s2o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p2 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p3 = constrain(tap, row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) @@ -634,34 +491,26 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p0 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p1 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p2 = constrain(tap, row, sec_strength, sec_damping); tap = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); -#if CDEF_CAP max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); min = v256_min_s16(min, tap); -#endif p3 = constrain(tap, row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) @@ -676,11 +525,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, res = v256_add_16(sum, 
v256_dup_16(8)); res = v256_shr_n_s16(res, 4); res = v256_add_16(row, res); -#if CDEF_CAP res = v256_min_s16(v256_max_s16(res, min), max); -#else - res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max)); -#endif res = v256_pack_s16_u8(res, res); p0 = v256_low_v128(res); @@ -689,499 +534,355 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, } } -#if CDEF_CAP void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - UNUSED int max_unused) -#else -void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, - int pri_damping, int sec_damping, - int max) -#endif -{ + AOM_UNUSED int max_unused, + int coeff_shift) { int i; - v128 p0, p1, p2, p3, sum, row, res; -#if CDEF_CAP - v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE); -#endif + v256 p0, p1, p2, p3, sum, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); int po1 = cdef_directions[dir][0]; int po2 = cdef_directions[dir][1]; -#if CDEF_FULL - int po3 = cdef_directions[dir][2]; -#endif int s1o1 = cdef_directions[(dir + 2) & 7][0]; int s1o2 = cdef_directions[(dir + 2) & 7][1]; int s2o1 = cdef_directions[(dir + 6) & 7][0]; int s2o2 = cdef_directions[(dir + 6) & 7][1]; - const int *pri_taps = cdef_pri_taps[pri_strength & 1]; - const int *sec_taps = cdef_sec_taps[pri_strength & 1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); if (sec_strength) sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); - for (i = 0; i < 4; i += 2) { - sum = v128_zero(); - row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]), - v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); -#if CDEF_CAP + for (i = 0; i < 4; i += 4) 
{ + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); min = max = row; -#endif // Primary near taps - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); -#if CDEF_CAP + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); p0 = constrain16(p0, row, pri_strength, pri_damping); p1 = constrain16(p1, row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1))); + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); // Primary far taps - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); -#if CDEF_CAP 
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); p0 = constrain16(p0, row, pri_strength, pri_damping); p1 = constrain16(p1, row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1))); - -#if CDEF_FULL - // Primary extra taps - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po3]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3])); - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po3]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3])); -#if CDEF_CAP - max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif - p0 = constrain16(p0, row, pri_strength, pri_damping); - p1 = constrain16(p1, row, pri_strength, pri_damping); - - // sum += pri_taps[2] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1))); -#endif + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); // Secondary near taps - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), 
- v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); - p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); - p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); -#if CDEF_CAP + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); max = - v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))), - v128_andn(p3, v128_cmpeq_16(p3, large))); - min = v128_min_s16( - v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + 
v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); p0 = constrain16(p0, row, sec_strength, sec_damping); p1 = constrain16(p1, row, sec_strength, sec_damping); p2 = constrain16(p2, row, sec_strength, sec_damping); p3 = constrain16(p3, row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]), - v128_add_16(v128_add_16(p0, p1), - v128_add_16(p2, p3)))); + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); // Secondary far taps - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); - p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); - p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); -#if CDEF_CAP + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE 
- s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); max = - v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))), - v128_andn(p3, v128_cmpeq_16(p3, large))); - min = v128_min_s16( - v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); p0 = constrain16(p0, row, sec_strength, sec_damping); p1 = constrain16(p1, row, sec_strength, sec_damping); p2 = constrain16(p2, row, sec_strength, sec_damping); p3 = constrain16(p3, row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]), - v128_add_16(v128_add_16(p0, p1), - v128_add_16(p2, p3)))); + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); // res = row + ((sum - (sum < 0) + 8) >> 4) - sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero())); - res = v128_add_16(sum, v128_dup_16(8)); - res = v128_shr_n_s16(res, 4); - res = v128_add_16(row, res); -#if CDEF_CAP - res = v128_min_s16(v128_max_s16(res, min), max); -#else - res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max)); -#endif - v64_store_aligned(&dst[i * dstride], v128_high_v64(res)); - v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(res)); + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = 
v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + + v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 1) * dstride], + v128_low_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 2) * dstride], + v128_high_v64(v256_low_v128(res))); + v64_store_aligned(&dst[(i + 3) * dstride], + v128_low_v64(v256_low_v128(res))); } } -#if CDEF_CAP void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - UNUSED int max_unused) -#else -void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, - int pri_damping, int sec_damping, - int max) -#endif -{ + AOM_UNUSED int max_unused, + int coeff_shift) { int i; - v128 sum, p0, p1, p2, p3, row, res; -#if CDEF_CAP - v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE); -#endif + v256 sum, p0, p1, p2, p3, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); int po1 = cdef_directions[dir][0]; int po2 = cdef_directions[dir][1]; -#if CDEF_FULL - int po3 = cdef_directions[dir][2]; -#endif int s1o1 = cdef_directions[(dir + 2) & 7][0]; int s1o2 = cdef_directions[(dir + 2) & 7][1]; int s2o1 = cdef_directions[(dir + 6) & 7][0]; int s2o2 = cdef_directions[(dir + 6) & 7][1]; - const int *pri_taps = cdef_pri_taps[pri_strength & 1]; - const int *sec_taps = cdef_sec_taps[pri_strength & 1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); if (sec_strength) sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); - for (i = 0; i < 8; i++) { - sum = v128_zero(); - row = v128_load_aligned(&in[i * CDEF_BSTRIDE]); + for (i = 0; i < 8; i += 2) { + sum = v256_zero(); + row = 
v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); -#if CDEF_CAP min = max = row; -#endif // Primary near taps - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]); - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]); -#if CDEF_CAP + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); p0 = constrain16(p0, row, pri_strength, pri_damping); p1 = constrain16(p1, row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1))); + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); // Primary far taps - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]); - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]); -#if CDEF_CAP + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); p0 = 
constrain16(p0, row, pri_strength, pri_damping); p1 = constrain16(p1, row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1))); - -#if CDEF_FULL - // Primary extra taps - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]); - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]); -#if CDEF_CAP - max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); - min = v128_min_s16(v128_min_s16(min, p0), p1); -#endif - p0 = constrain16(p0, row, pri_strength, pri_damping); - p1 = constrain16(p1, row, pri_strength, pri_damping); - - // sum += pri_taps[2] * (p0 + p1) - sum = v128_add_16( - sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1))); -#endif + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); // Secondary near taps - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]); - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]); - p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]); - p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]); -#if CDEF_CAP + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); max = - 
v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))), - v128_andn(p3, v128_cmpeq_16(p3, large))); - min = v128_min_s16( - v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); p0 = constrain16(p0, row, sec_strength, sec_damping); p1 = constrain16(p1, row, sec_strength, sec_damping); p2 = constrain16(p2, row, sec_strength, sec_damping); p3 = constrain16(p3, row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]), - v128_add_16(v128_add_16(p0, p1), - v128_add_16(p2, p3)))); + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); // Secondary far taps - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]); - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]); - p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]); - p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]); -#if CDEF_CAP + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); max = - v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))), - v128_andn(p1, v128_cmpeq_16(p1, large))); + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); max = - 
v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))), - v128_andn(p3, v128_cmpeq_16(p3, large))); - min = v128_min_s16( - v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3); -#endif + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); p0 = constrain16(p0, row, sec_strength, sec_damping); p1 = constrain16(p1, row, sec_strength, sec_damping); p2 = constrain16(p2, row, sec_strength, sec_damping); p3 = constrain16(p3, row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]), - v128_add_16(v128_add_16(p0, p1), - v128_add_16(p2, p3)))); + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); // res = row + ((sum - (sum < 0) + 8) >> 4) - sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero())); - res = v128_add_16(sum, v128_dup_16(8)); - res = v128_shr_n_s16(res, 4); - res = v128_add_16(row, res); -#if CDEF_CAP - res = v128_min_s16(v128_max_s16(res, min), max); -#else - res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max)); -#endif - v128_store_unaligned(&dst[i * dstride], res); + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + v128_store_unaligned(&dst[i * dstride], v256_high_v128(res)); + v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res)); } } void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max) { - if (dst8) - (bsize == BLOCK_8X8 ? 
SIMD_FUNC(cdef_filter_block_8x8_8) - : SIMD_FUNC(cdef_filter_block_4x4_8))( - dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max); - else - (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_16) - : SIMD_FUNC(cdef_filter_block_4x4_16))( - dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max); -} - -#else - -void SIMD_FUNC(cdef_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir, int damping) { - int i; - v128 p0, p1, sum, row, res; - int o1 = cdef_directions[dir][0]; - int o2 = cdef_directions[dir][1]; - - if (threshold) damping -= get_msb(threshold); - for (i = 0; i < 4; i += 2) { - sum = v128_zero(); - row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]), - v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); - - // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping) - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + o1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + o1])); - p0 = constrain16(p0, row, threshold, damping); - - // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping) - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - o1]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - o1])); - p1 = constrain16(p1, row, threshold, damping); - - // sum += 4 * (p0 + p1) - sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2)); - - // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping) - p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + o2])); - p0 = constrain16(p0, row, threshold, damping); - - // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping) - p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - o2]), - v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - o2])); - p1 = constrain16(p1, row, threshold, damping); - - // sum += 1 * (p0 + p1) - sum = v128_add_16(sum, v128_add_16(p0, 
p1)); - - // res = row + ((sum + 8) >> 4) - res = v128_add_16(sum, v128_dup_16(8)); - res = v128_shr_n_s16(res, 4); - res = v128_add_16(row, res); - v64_store_aligned(&y[i * ystride], v128_high_v64(res)); - v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res)); - } -} - -void SIMD_FUNC(cdef_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir, int damping) { - int i; - v128 sum, p0, p1, row, res; - int o1 = cdef_directions[dir][0]; - int o2 = cdef_directions[dir][1]; - int o3 = cdef_directions[dir][2]; - - if (threshold) damping -= get_msb(threshold); - for (i = 0; i < 8; i++) { - sum = v128_zero(); - row = v128_load_aligned(&in[i * CDEF_BSTRIDE]); - - // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping) - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o1]); - p0 = constrain16(p0, row, threshold, damping); - - // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping) - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o1]); - p1 = constrain16(p1, row, threshold, damping); - - // sum += 3 * (p0 + p1) - p0 = v128_add_16(p0, p1); - p0 = v128_add_16(p0, v128_shl_n_16(p0, 1)); - sum = v128_add_16(sum, p0); - - // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping) - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o2]); - p0 = constrain16(p0, row, threshold, damping); - - // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping) - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o2]); - p1 = constrain16(p1, row, threshold, damping); - - // sum += 2 * (p0 + p1) - p0 = v128_shl_n_16(v128_add_16(p0, p1), 1); - sum = v128_add_16(sum, p0); - - // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping) - p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o3]); - p0 = constrain16(p0, row, threshold, damping); - - // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping) - p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o3]); - p1 = 
constrain16(p1, row, threshold, damping); - - // sum += (p0 + p1) - p0 = v128_add_16(p0, p1); - sum = v128_add_16(sum, p0); - - // res = row + ((sum + 8) >> 4) - res = v128_add_16(sum, v128_dup_16(8)); - res = v128_shr_n_s16(res, 4); - res = v128_add_16(row, res); - v128_store_unaligned(&y[i * ystride], res); - } -} - -void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, - const uint16_t *src, int sstride) { - int i; - for (i = 0; i < 8; i++) { - v128 row = v128_load_unaligned(&src[i * sstride]); - row = v128_pack_s16_u8(row, row); - v64_store_unaligned(&dst[i * dstride], v128_low_v64(row)); - } -} - -void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, - const uint16_t *src, int sstride) { - int i; - for (i = 0; i < 4; i++) { - v128 row = v128_load_unaligned(&src[i * sstride]); - row = v128_pack_s16_u8(row, row); - u32_store_unaligned(&dst[i * dstride], v128_low_u32(row)); - } -} - -void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, - const uint16_t *src, int sstride) { - int i; - for (i = 0; i < 8; i++) { - v128 row = v128_load_unaligned(&src[i * sstride]); - v128_store_unaligned(&dst[i * dstride], row); - } -} - -void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, - const uint16_t *src, int sstride) { - int i; - for (i = 0; i < 4; i++) { - v64 row = v64_load_unaligned(&src[i * sstride]); - v64_store_unaligned(&dst[i * dstride], row); + int sec_damping, int bsize, int max, + int coeff_shift) { + if (dst8) { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, + sec_strength, dir, pri_damping, sec_damping, max, 
coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } else { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } + } else { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, + sec_strength, dir, pri_damping, sec_damping, max, coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } else { + assert(bsize == BLOCK_4X4); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, max, coeff_shift); + } } } -#endif void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, diff --git a/third_party/aom/av1/common/cdef_block_sse2.c b/third_party/aom/av1/common/cdef_block_sse2.c index f3de763fa..73f115d17 100644 --- a/third_party/aom/av1/common/cdef_block_sse2.c +++ b/third_party/aom/av1/common/cdef_block_sse2.c @@ -11,4 +11,4 @@ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_sse2 -#include 
"./cdef_block_simd.h" +#include "av1/common/cdef_block_simd.h" diff --git a/third_party/aom/av1/common/cdef_block_sse4.c b/third_party/aom/av1/common/cdef_block_sse4.c index 27e9ff32e..349329af6 100644 --- a/third_party/aom/av1/common/cdef_block_sse4.c +++ b/third_party/aom/av1/common/cdef_block_sse4.c @@ -11,4 +11,4 @@ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_sse4_1 -#include "./cdef_block_simd.h" +#include "av1/common/cdef_block_simd.h" diff --git a/third_party/aom/av1/common/cdef_block_ssse3.c b/third_party/aom/av1/common/cdef_block_ssse3.c index 863522199..3a93b150f 100644 --- a/third_party/aom/av1/common/cdef_block_ssse3.c +++ b/third_party/aom/av1/common/cdef_block_ssse3.c @@ -11,4 +11,4 @@ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_ssse3 -#include "./cdef_block_simd.h" +#include "av1/common/cdef_block_simd.h" diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c index f9acfcbc9..ee19f0bcf 100644 --- a/third_party/aom/av1/common/cfl.c +++ b/third_party/aom/av1/common/cfl.c @@ -13,20 +13,77 @@ #include "av1/common/common_data.h" #include "av1/common/onyxc_int.h" +#include "config/av1_rtcd.h" + void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) { - if (!((cm->subsampling_x == 0 && cm->subsampling_y == 0) || - (cm->subsampling_x == 1 && cm->subsampling_y == 1))) { + assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) && + !(cm->subsampling_x == 1 && cm->subsampling_y == 1) && + !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Only 4:4:4 and 4:2:0 are currently supported by CfL"); + "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by " + "CfL, %d %d subsampling is not supported.\n", + cm->subsampling_x, cm->subsampling_y); } - memset(&cfl->pred_buf_q3, 0, sizeof(cfl->pred_buf_q3)); + 
memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); + memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); cfl->subsampling_x = cm->subsampling_x; cfl->subsampling_y = cm->subsampling_y; cfl->are_parameters_computed = 0; cfl->store_y = 0; -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - cfl_clear_sub8x8_val(cfl); -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG + // The DC_PRED cache is disabled by default and is only enabled in + // cfl_rd_pick_alpha + cfl->use_dc_pred_cache = 0; + cfl->dc_pred_is_cached[CFL_PRED_U] = 0; + cfl->dc_pred_is_cached[CFL_PRED_V] = 0; +} + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width) { + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + + if (get_bitdepth_data_path_index(xd)) { + uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); + memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); + return; + } + + memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); +} + +static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, + int dst_stride, int width, int height) { + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, width); + dst += dst_stride; + } +} + +static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, + int dst_stride, int width, int height) { + const size_t num_bytes = width << 1; + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, num_bytes); + dst += dst_stride; + } +} +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + assert(height <= CFL_BUF_LINE); + if (get_bitdepth_data_path_index(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, + width, height); 
+ return; + } + cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, + width, height); } // Due to frame boundary issues, it is possible that the total area covered by @@ -38,217 +95,54 @@ static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { if (diff_width > 0) { const int min_height = height - diff_height; - int16_t *pred_buf_q3 = cfl->pred_buf_q3 + (width - diff_width); + uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); for (int j = 0; j < min_height; j++) { - const int last_pixel = pred_buf_q3[-1]; + const uint16_t last_pixel = recon_buf_q3[-1]; + assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); for (int i = 0; i < diff_width; i++) { - pred_buf_q3[i] = last_pixel; + recon_buf_q3[i] = last_pixel; } - pred_buf_q3 += MAX_SB_SIZE; + recon_buf_q3 += CFL_BUF_LINE; } cfl->buf_width = width; } if (diff_height > 0) { - int16_t *pred_buf_q3 = - cfl->pred_buf_q3 + ((height - diff_height) * MAX_SB_SIZE); + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); for (int j = 0; j < diff_height; j++) { - const int16_t *last_row_q3 = pred_buf_q3 - MAX_SB_SIZE; + const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; + assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); for (int i = 0; i < width; i++) { - pred_buf_q3[i] = last_row_q3[i]; + recon_buf_q3[i] = last_row_q3[i]; } - pred_buf_q3 += MAX_SB_SIZE; + recon_buf_q3 += CFL_BUF_LINE; } cfl->buf_height = height; } } -static void sum_above_row_lbd(const uint8_t *above_u, const uint8_t *above_v, - int width, int *out_sum_u, int *out_sum_v) { - int sum_u = 0; - int sum_v = 0; - for (int i = 0; i < width; i++) { - sum_u += above_u[i]; - sum_v += above_v[i]; - } - *out_sum_u += sum_u; - *out_sum_v += sum_v; -} -#if CONFIG_HIGHBITDEPTH -static void sum_above_row_hbd(const uint16_t *above_u, const uint16_t *above_v, - int width, int *out_sum_u, int *out_sum_v) { - int sum_u = 0; - int sum_v = 0; - for (int i 
= 0; i < width; i++) { - sum_u += above_u[i]; - sum_v += above_v[i]; - } - *out_sum_u += sum_u; - *out_sum_v += sum_v; -} -#endif // CONFIG_HIGHBITDEPTH - -static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u, - int *out_sum_v) { - const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U]; - const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V]; -#if CONFIG_HIGHBITDEPTH - if (get_bitdepth_data_path_index(xd)) { - const uint16_t *above_u_16 = - CONVERT_TO_SHORTPTR(pd_u->dst.buf) - pd_u->dst.stride; - const uint16_t *above_v_16 = - CONVERT_TO_SHORTPTR(pd_v->dst.buf) - pd_v->dst.stride; - sum_above_row_hbd(above_u_16, above_v_16, width, out_sum_u, out_sum_v); - return; - } -#endif // CONFIG_HIGHBITDEPTH - const uint8_t *above_u = pd_u->dst.buf - pd_u->dst.stride; - const uint8_t *above_v = pd_v->dst.buf - pd_v->dst.stride; - sum_above_row_lbd(above_u, above_v, width, out_sum_u, out_sum_v); -} - -static void sum_left_col_lbd(const uint8_t *left_u, int u_stride, - const uint8_t *left_v, int v_stride, int height, - int *out_sum_u, int *out_sum_v) { - int sum_u = 0; - int sum_v = 0; - for (int i = 0; i < height; i++) { - sum_u += left_u[i * u_stride]; - sum_v += left_v[i * v_stride]; - } - *out_sum_u += sum_u; - *out_sum_v += sum_v; -} -#if CONFIG_HIGHBITDEPTH -static void sum_left_col_hbd(const uint16_t *left_u, int u_stride, - const uint16_t *left_v, int v_stride, int height, - int *out_sum_u, int *out_sum_v) { - int sum_u = 0; - int sum_v = 0; - for (int i = 0; i < height; i++) { - sum_u += left_u[i * u_stride]; - sum_v += left_v[i * v_stride]; - } - *out_sum_u += sum_u; - *out_sum_v += sum_v; -} -#endif // CONFIG_HIGHBITDEPTH -static void sum_left_col(const MACROBLOCKD *xd, int height, int *out_sum_u, - int *out_sum_v) { - const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U]; - const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V]; - -#if CONFIG_HIGHBITDEPTH - if 
(get_bitdepth_data_path_index(xd)) { - const uint16_t *left_u_16 = CONVERT_TO_SHORTPTR(pd_u->dst.buf) - 1; - const uint16_t *left_v_16 = CONVERT_TO_SHORTPTR(pd_v->dst.buf) - 1; - sum_left_col_hbd(left_u_16, pd_u->dst.stride, left_v_16, pd_v->dst.stride, - height, out_sum_u, out_sum_v); - return; - } -#endif // CONFIG_HIGHBITDEPTH - const uint8_t *left_u = pd_u->dst.buf - 1; - const uint8_t *left_v = pd_v->dst.buf - 1; - sum_left_col_lbd(left_u, pd_u->dst.stride, left_v, pd_v->dst.stride, height, - out_sum_u, out_sum_v); -} - -// CfL computes its own block-level DC_PRED. This is required to compute both -// alpha_cb and alpha_cr before the prediction are computed. -static void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) { - CFL_CTX *const cfl = xd->cfl; - - // Compute DC_PRED until block boundary. We can't assume the neighbor will use - // the same transform size. - const int width = max_block_wide(xd, plane_bsize, AOM_PLANE_U) - << tx_size_wide_log2[0]; - const int height = max_block_high(xd, plane_bsize, AOM_PLANE_U) - << tx_size_high_log2[0]; - // Number of pixel on the top and left borders. - const int num_pel = width + height; - - int sum_u = 0; - int sum_v = 0; - -// Match behavior of build_intra_predictors_high (reconintra.c) at superblock -// boundaries: -// base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 -// base+1 A B .. Y Z -// base+1 C D .. W X -// base+1 E F .. U V -// base+1 G H .. S T T T T T -// .. 
- -#if CONFIG_CHROMA_SUB8X8 - if (xd->chroma_up_available && xd->mb_to_right_edge >= 0) { -#else - if (xd->up_available && xd->mb_to_right_edge >= 0) { -#endif - sum_above_row(xd, width, &sum_u, &sum_v); - } else { - const int base = 128 << (xd->bd - 8); - sum_u = width * (base - 1); - sum_v = width * (base - 1); - } - -#if CONFIG_CHROMA_SUB8X8 - if (xd->chroma_left_available && xd->mb_to_bottom_edge >= 0) { -#else - if (xd->left_available && xd->mb_to_bottom_edge >= 0) { -#endif - sum_left_col(xd, height, &sum_u, &sum_v); - } else { - const int base = 128 << (xd->bd - 8); - sum_u += height * (base + 1); - sum_v += height * (base + 1); +static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, + int height, int round_offset, int num_pel_log2) { + int sum = round_offset; + const uint16_t *recon = src; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + sum += recon[i]; + } + recon += CFL_BUF_LINE; } - - // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will - // not be a power of two. So these divisions will have to use a lookup table. 
- cfl->dc_pred[CFL_PRED_U] = (sum_u + (num_pel >> 1)) / num_pel; - cfl->dc_pred[CFL_PRED_V] = (sum_v + (num_pel >> 1)) / num_pel; -} - -static void cfl_subtract_averages(CFL_CTX *cfl, TX_SIZE tx_size) { - const int width = cfl->uv_width; - const int height = cfl->uv_height; - const int tx_height = tx_size_high[tx_size]; - const int tx_width = tx_size_wide[tx_size]; - const int block_row_stride = MAX_SB_SIZE << tx_size_high_log2[tx_size]; - const int num_pel_log2 = - (tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size]); - - int16_t *pred_buf_q3 = cfl->pred_buf_q3; - - cfl_pad(cfl, width, height); - - for (int b_j = 0; b_j < height; b_j += tx_height) { - for (int b_i = 0; b_i < width; b_i += tx_width) { - int sum_q3 = 0; - int16_t *tx_pred_buf_q3 = pred_buf_q3; - for (int t_j = 0; t_j < tx_height; t_j++) { - for (int t_i = b_i; t_i < b_i + tx_width; t_i++) { - sum_q3 += tx_pred_buf_q3[t_i]; - } - tx_pred_buf_q3 += MAX_SB_SIZE; - } - int avg_q3 = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2; - // Loss is never more than 1/2 (in Q3) - assert(fabs((double)avg_q3 - (sum_q3 / ((double)(1 << num_pel_log2)))) <= - 0.5); - - tx_pred_buf_q3 = pred_buf_q3; - for (int t_j = 0; t_j < tx_height; t_j++) { - for (int t_i = b_i; t_i < b_i + tx_width; t_i++) { - tx_pred_buf_q3[t_i] -= avg_q3; - } - - tx_pred_buf_q3 += MAX_SB_SIZE; - } + const int avg = sum >> num_pel_log2; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = src[i] - avg; } - pred_buf_q3 += block_row_stride; + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; } } +CFL_SUB_AVG_FN(c) + static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign, CFL_PRED_TYPE pred_type) { const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) @@ -259,159 +153,218 @@ static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign, return (alpha_sign == CFL_SIGN_POS) ? 
abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; } -static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int width, int height, - int alpha_q3, int dc_pred) { +static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3, int width, + int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { - dst[i] = - clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred); + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); } dst += dst_stride; - pred_buf_q3 += MAX_SB_SIZE; + ac_buf_q3 += CFL_BUF_LINE; } } -#if CONFIG_HIGHBITDEPTH -static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int width, int height, - int alpha_q3, int dc_pred, int bit_depth) { +// Null function used for invalid tx_sizes +void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3) { + (void)ac_buf_q3; + (void)dst; + (void)dst_stride; + (void)alpha_q3; + assert(0); +} + +CFL_PREDICT_FN(c, lbd) + +void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, + int alpha_q3, int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = clip_pixel_highbd( - get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred, bit_depth); + get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); } dst += dst_stride; - pred_buf_q3 += MAX_SB_SIZE; + ac_buf_q3 += CFL_BUF_LINE; } } -#endif // CONFIG_HIGHBITDEPTH -static void cfl_build_prediction(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int width, int height, - int alpha_q3, int dc_pred, int use_hbd, - int bit_depth) { -#if CONFIG_HIGHBITDEPTH - if (use_hbd) { - uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); - cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride, width, height, - alpha_q3, dc_pred, bit_depth); - return; - } -#endif // CONFIG_HIGHBITDEPTH - 
(void)use_hbd; - (void)bit_depth; - cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, width, height, - alpha_q3, dc_pred); +// Null function used for invalid tx_sizes +void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd) { + (void)ac_buf_q3; + (void)dst; + (void)dst_stride; + (void)alpha_q3; + (void)bd; + assert(0); +} + +CFL_PREDICT_FN(c, hbd) + +static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + // Do not call cfl_compute_parameters multiple time on the same values. + assert(cfl->are_parameters_computed == 0); + + cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); + get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl->are_parameters_computed = 1; } void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, - int row, int col, TX_SIZE tx_size, int plane) { - CFL_CTX *const cfl = xd->cfl; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + TX_SIZE tx_size, int plane) { + CFL_CTX *const cfl = &xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(is_cfl_allowed(xd)); - // CfL parameters must be computed before prediction can be done. 
- assert(cfl->are_parameters_computed == 1); + if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); - const int16_t *pred_buf_q3 = - cfl->pred_buf_q3 + ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]); const int alpha_q3 = cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); + assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= + CFL_BUF_SQUARE); + if (get_bitdepth_data_path_index(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3, + xd->bd); + return; + } + get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); +} - cfl_build_prediction(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size], - tx_size_high[tx_size], alpha_q3, cfl->dc_pred[plane - 1], - get_bitdepth_data_path_index(xd), xd->bd); +// Null function used for invalid tx_sizes +void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, + uint16_t *output_q3) { + (void)input; + (void)input_stride; + (void)output_q3; + assert(0); } -static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride, - int16_t *output_q3, int width, - int height) { - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - int top = i << 1; - int bot = top + input_stride; - output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1]) - << 1; +// Null function used for invalid tx_sizes +void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, + uint16_t *output_q3) { + (void)input; + (void)input_stride; + (void)output_q3; + assert(0); +} + +static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; } input += input_stride << 1; - 
output_q3 += MAX_SB_SIZE; + output_q3 += CFL_BUF_LINE; } } -static void cfl_luma_subsampling_444_lbd(const uint8_t *input, int input_stride, - int16_t *output_q3, int width, - int height) { +static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - output_q3[i] = input[i] << 3; + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; } input += input_stride; - output_q3 += MAX_SB_SIZE; + output_q3 += CFL_BUF_LINE; } } -#if CONFIG_HIGHBITDEPTH -static void cfl_luma_subsampling_420_hbd(const uint16_t *input, - int input_stride, int16_t *output_q3, - int width, int height) { +static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { - int top = i << 1; - int bot = top + input_stride; - output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1]) - << 1; + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; } input += input_stride << 1; - output_q3 += MAX_SB_SIZE; + output_q3 += CFL_BUF_LINE; } } -static void cfl_luma_subsampling_444_hbd(const uint16_t *input, - int input_stride, int16_t *output_q3, - int width, int height) { +static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, + int input_stride, + uint16_t 
*output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { output_q3[i] = input[i] << 3; } input += input_stride; - output_q3 += MAX_SB_SIZE; + output_q3 += CFL_BUF_LINE; } } -#endif // CONFIG_HIGHBITDEPTH -static void cfl_luma_subsampling_420(const uint8_t *input, int input_stride, - int16_t *output_q3, int width, int height, - int use_hbd) { -#if CONFIG_HIGHBITDEPTH - if (use_hbd) { - const uint16_t *input_16 = CONVERT_TO_SHORTPTR(input); - cfl_luma_subsampling_420_hbd(input_16, input_stride, output_q3, width, - height); - return; +CFL_GET_SUBSAMPLE_FUNCTION(c) + +static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, + int sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return cfl_get_luma_subsampling_420_hbd(tx_size); + } + return cfl_get_luma_subsampling_422_hbd(tx_size); } -#endif // CONFIG_HIGHBITDEPTH - (void)use_hbd; - cfl_luma_subsampling_420_lbd(input, input_stride, output_q3, width, height); + return cfl_get_luma_subsampling_444_hbd(tx_size); } -static void cfl_luma_subsampling_444(const uint8_t *input, int input_stride, - int16_t *output_q3, int width, int height, - int use_hbd) { -#if CONFIG_HIGHBITDEPTH - if (use_hbd) { - uint16_t *input_16 = CONVERT_TO_SHORTPTR(input); - cfl_luma_subsampling_444_hbd(input_16, input_stride, output_q3, width, - height); - return; +static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, + int sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return 
cfl_get_luma_subsampling_420_lbd(tx_size); + } + return cfl_get_luma_subsampling_422_lbd(tx_size); } -#endif // CONFIG_HIGHBITDEPTH - (void)use_hbd; - cfl_luma_subsampling_444_lbd(input, input_stride, output_q3, width, height); + return cfl_get_luma_subsampling_444_lbd(tx_size); } -static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input, - int input_stride, int row, int col, int width, - int height, int use_hbd) { +static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, + int row, int col, TX_SIZE tx_size, int use_hbd) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; const int tx_off_log2 = tx_size_wide_log2[0]; const int sub_x = cfl->subsampling_x; const int sub_y = cfl->subsampling_y; @@ -435,26 +388,22 @@ static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input, } // Check that we will remain inside the pixel buffer. - assert(store_row + store_height <= MAX_SB_SIZE); - assert(store_col + store_width <= MAX_SB_SIZE); + assert(store_row + store_height <= CFL_BUF_LINE); + assert(store_col + store_width <= CFL_BUF_LINE); // Store the input into the CfL pixel buffer - int16_t *pred_buf_q3 = - cfl->pred_buf_q3 + (store_row * MAX_SB_SIZE + store_col); - - if (sub_y == 0 && sub_x == 0) { - cfl_luma_subsampling_444(input, input_stride, pred_buf_q3, store_width, - store_height, use_hbd); - } else if (sub_y == 1 && sub_x == 1) { - cfl_luma_subsampling_420(input, input_stride, pred_buf_q3, store_width, - store_height, use_hbd); + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); + + if (use_hbd) { + cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), + input_stride, recon_buf_q3); } else { - // TODO(ltrudeau) add support for 4:2:2 - assert(0); // Unsupported chroma subsampling + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, + recon_buf_q3); } } -#if CONFIG_CHROMA_SUB8X8 // Adjust the row and column of blocks smaller than 8X8, 
as chroma-referenced // and non-chroma-referenced blocks are stored together in the CfL buffer. static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out, @@ -471,99 +420,36 @@ static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out, (*col_out)++; } } -#if CONFIG_DEBUG -static INLINE void sub8x8_set_val(CFL_CTX *cfl, int row, int col, int val_high, - int val_wide) { - for (int val_r = 0; val_r < val_high; val_r++) { - assert(row + val_r < CFL_SUB8X8_VAL_MI_SIZE); - int row_off = (row + val_r) * CFL_SUB8X8_VAL_MI_SIZE; - for (int val_c = 0; val_c < val_wide; val_c++) { - assert(col + val_c < CFL_SUB8X8_VAL_MI_SIZE); - assert(cfl->sub8x8_val[row_off + col + val_c] == 0); - cfl->sub8x8_val[row_off + col + val_c]++; - } - } -} -#endif // CONFIG_DEBUG -#endif // CONFIG_CHROMA_SUB8X8 void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize) { - CFL_CTX *const cfl = xd->cfl; + CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]]; - (void)bsize; -#if CONFIG_CHROMA_SUB8X8 if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { // Only dimensions of size 4 can have an odd offset. 
assert(!((col & 1) && tx_size_wide[tx_size] != 4)); assert(!((row & 1) && tx_size_high[tx_size] != 4)); sub8x8_adjust_offset(cfl, &row, &col); -#if CONFIG_DEBUG - sub8x8_set_val(cfl, row, col, tx_size_high_unit[tx_size], - tx_size_wide_unit[tx_size]); -#endif // CONFIG_DEBUG } -#endif - cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size], - tx_size_high[tx_size], get_bitdepth_data_path_index(xd)); + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, + get_bitdepth_data_path_index(xd)); } void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { - CFL_CTX *const cfl = xd->cfl; + CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; int row = 0; int col = 0; -#if CONFIG_CHROMA_SUB8X8 - bsize = AOMMAX(BLOCK_4X4, bsize); + if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { sub8x8_adjust_offset(cfl, &row, &col); -#if CONFIG_DEBUG - sub8x8_set_val(cfl, row, col, mi_size_high[bsize], mi_size_wide[bsize]); -#endif // CONFIG_DEBUG } -#endif // CONFIG_CHROMA_SUB8X8 const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); - cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height, + tx_size = get_tx_size(width, height); + cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, get_bitdepth_data_path_index(xd)); } - -void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { - CFL_CTX *const cfl = xd->cfl; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - - // Do not call cfl_compute_parameters multiple time on the same values. 
- assert(cfl->are_parameters_computed == 0); - -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = AOMMAX( - BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U])); -#if CONFIG_DEBUG - if (mbmi->sb_type < BLOCK_8X8) { - for (int val_r = 0; val_r < mi_size_high[mbmi->sb_type]; val_r++) { - for (int val_c = 0; val_c < mi_size_wide[mbmi->sb_type]; val_c++) { - assert(cfl->sub8x8_val[val_r * CFL_SUB8X8_VAL_MI_SIZE + val_c] == 1); - } - } - cfl_clear_sub8x8_val(cfl); - } -#endif // CONFIG_DEBUG -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]); -#endif - // AOM_PLANE_U is used, but both planes will have the same sizes. - cfl->uv_width = max_intra_block_width(xd, plane_bsize, AOM_PLANE_U, tx_size); - cfl->uv_height = - max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size); - - assert(cfl->buf_width <= cfl->uv_width); - assert(cfl->buf_height <= cfl->uv_height); - - cfl_dc_pred(xd, plane_bsize); - cfl_subtract_averages(cfl, tx_size); - cfl->are_parameters_computed = 1; -} diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h index 4ac0b401c..bc9fbce1b 100644 --- a/third_party/aom/av1/common/cfl.h +++ b/third_party/aom/av1/common/cfl.h @@ -13,20 +13,290 @@ #define AV1_COMMON_CFL_H_ #include "av1/common/blockd.h" +#include "av1/common/onyxc_int.h" + +// Can we use CfL for the current block? +static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize < BLOCK_SIZES_ALL); + if (xd->lossless[mbmi->segment_id]) { + // In lossless, CfL is available when the partition size is equal to the + // transform size. 
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; + const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; + const int plane_bsize = get_plane_block_size(bsize, ssx, ssy); + return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4); + } + // Spec: CfL is available to luma partitions lesser than or equal to 32x32 + return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 && + block_size_high[bsize] <= 32); +} + +// Do we need to save the luma pixels from the current block, +// for a possible future CfL prediction? +static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + + if (cm->seq_params.monochrome) return CFL_DISALLOWED; + + if (!xd->cfl.is_chroma_reference) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. 
+ return CFL_ALLOWED; + } + + // If this block has chroma information, we know whether we're + // actually going to perform a CfL prediction + return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) && + mbmi->uv_mode == UV_CFL_PRED); +} static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { int scaled_luma_q6 = alpha_q3 * pred_buf_q3; return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); } +static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) { + assert(plane > 0); + return (CFL_PRED_TYPE)(plane - 1); +} + void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, - int row, int col, TX_SIZE tx_size, int plane); + TX_SIZE tx_size, int plane); void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size); void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize); -void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size); +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width); + +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); + +// Null function used for invalid tx_sizes +void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, + uint16_t *output_q3); + +// Null function used for invalid tx_sizes +void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, + uint16_t *output_q3); + +// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. +#define CFL_lbd_TYPE uint8_t *cfl_type +#define CFL_hbd_TYPE uint16_t *cfl_type + +// Declare a size-specific wrapper for the size-generic function. The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. 
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ + void subsample_##bd##_##sub##_##width##x##height##_##arch( \ + const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ + cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ + output_q3, width, height); \ + } + +// Declare size-specific wrappers for all valid CfL sizes. +#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \ + TX_SIZE tx_size) { \ + CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + return subfn_##sub[tx_size]; \ + } + +// Declare an architecture-specific array of function pointers for size-specific +// wrappers. 
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ + subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ + subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ + }; + +// The RTCD script does not support passing in an array, so we wrap it in this +// function. +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) + +// Null function used for invalid tx_sizes +static INLINE void cfl_subtract_average_null(const uint16_t *src, + int16_t *dst) { + (void)dst; + (void)src; + assert(0); +} + +// Declare a size-specific wrapper for the size-generic function. 
The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ + } + +// Declare size-specific wrappers for all valid CfL sizes. +#define CFL_SUB_AVG_FN(arch) \ + CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ + CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ + CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ + CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ + CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ + CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ + CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ + CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ + cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ + subtract_average_4x4_##arch, /* 4x4 */ \ + subtract_average_8x8_##arch, /* 8x8 */ \ + subtract_average_16x16_##arch, /* 16x16 */ \ + subtract_average_32x32_##arch, /* 32x32 */ \ + cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \ + subtract_average_4x8_##arch, /* 4x8 */ \ + subtract_average_8x4_##arch, /* 8x4 */ \ + subtract_average_8x16_##arch, /* 8x16 */ \ + subtract_average_16x8_##arch, /* 16x8 */ \ + subtract_average_16x32_##arch, /* 16x32 */ \ + subtract_average_32x16_##arch, /* 32x16 */ \ + cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \ + cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \ + subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ + subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ + 
subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ + subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ + cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \ + cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. */ \ + return sub_avg[tx_size % TX_SIZES_ALL]; \ + } + +// For VSX SIMD optimization, the C versions of width == 4 subtract are +// faster than the VSX. As such, the VSX code calls the C versions. +void subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + +#define CFL_PREDICT_lbd(arch, width, height) \ + void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ + uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ + } + +#define CFL_PREDICT_hbd(arch, width, height) \ + void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ + uint16_t *dst, int dst_stride, \ + int alpha_q3, int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ + } + +// This wrapper exists because clang format does not like calling macros with +// lowercase letters. 
+#define CFL_PREDICT_X(arch, width, height, bd) \ + CFL_PREDICT_##bd(arch, width, height) + +// Null function used for invalid tx_sizes +void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +// Null function used for invalid tx_sizes +void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +#define CFL_PREDICT_FN(arch, bd) \ + CFL_PREDICT_X(arch, 4, 4, bd) \ + CFL_PREDICT_X(arch, 4, 8, bd) \ + CFL_PREDICT_X(arch, 4, 16, bd) \ + CFL_PREDICT_X(arch, 8, 4, bd) \ + CFL_PREDICT_X(arch, 8, 8, bd) \ + CFL_PREDICT_X(arch, 8, 16, bd) \ + CFL_PREDICT_X(arch, 8, 32, bd) \ + CFL_PREDICT_X(arch, 16, 4, bd) \ + CFL_PREDICT_X(arch, 16, 8, bd) \ + CFL_PREDICT_X(arch, 16, 16, bd) \ + CFL_PREDICT_X(arch, 16, 32, bd) \ + CFL_PREDICT_X(arch, 32, 8, bd) \ + CFL_PREDICT_X(arch, 32, 16, bd) \ + CFL_PREDICT_X(arch, 32, 32, bd) \ + cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ + predict_##bd##_4x4_##arch, /* 4x4 */ \ + predict_##bd##_8x8_##arch, /* 8x8 */ \ + predict_##bd##_16x16_##arch, /* 16x16 */ \ + predict_##bd##_32x32_##arch, /* 32x32 */ \ + cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \ + predict_##bd##_4x8_##arch, /* 4x8 */ \ + predict_##bd##_8x4_##arch, /* 8x4 */ \ + predict_##bd##_8x16_##arch, /* 8x16 */ \ + predict_##bd##_16x8_##arch, /* 16x8 */ \ + predict_##bd##_16x32_##arch, /* 16x32 */ \ + predict_##bd##_32x16_##arch, /* 32x16 */ \ + cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \ + cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \ + predict_##bd##_4x16_##arch, /* 4x16 */ \ + predict_##bd##_16x4_##arch, /* 16x4 */ \ + predict_##bd##_8x32_##arch, /* 8x32 */ \ + predict_##bd##_32x8_##arch, /* 32x8 */ \ + cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \ + cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to 
ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. */ \ + return pred[tx_size % TX_SIZES_ALL]; \ + } #endif // AV1_COMMON_CFL_H_ diff --git a/third_party/aom/av1/common/clpf.c b/third_party/aom/av1/common/clpf.c deleted file mode 100644 index d643236aa..000000000 --- a/third_party/aom/av1/common/clpf.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./av1_rtcd.h" -#include "./cdef.h" -#include "aom/aom_image.h" -#include "aom_dsp/aom_dsp_common.h" - -static int clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, - int H, int s, unsigned int dmp) { - int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) + - 1 * constrain(C - X, s, dmp) + 3 * constrain(D - X, s, dmp) + - 3 * constrain(E - X, s, dmp) + 1 * constrain(F - X, s, dmp) + - 3 * constrain(G - X, s, dmp) + 1 * constrain(H - X, s, dmp); - return (8 + delta - (delta < 0)) >> 4; -} - -static int clpf_hsample(int X, int A, int B, int C, int D, int s, - unsigned int dmp) { - int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) + - 3 * constrain(C - X, s, dmp) + 1 * constrain(D - X, s, dmp); - return (4 + delta - (delta < 0)) >> 3; -} - -void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, unsigned int strength, - unsigned int damping) { - int x, y; - - for (y = 0; y < sizey; y++) { - for (x = 0; x < sizex; x++) { - const int X = src[y * 
sstride + x]; - const int A = src[(y - 2) * sstride + x]; - const int B = src[(y - 1) * sstride + x]; - const int C = src[y * sstride + x - 2]; - const int D = src[y * sstride + x - 1]; - const int E = src[y * sstride + x + 1]; - const int F = src[y * sstride + x + 2]; - const int G = src[(y + 1) * sstride + x]; - const int H = src[(y + 2) * sstride + x]; - const int delta = - clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); - dst[y * dstride + x] = X + delta; - } - } -} - -// Identical to aom_clpf_block_c() apart from "dst". -void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, - unsigned int strength, unsigned int damping) { - int x, y; - - for (y = 0; y < sizey; y++) { - for (x = 0; x < sizex; x++) { - const int X = src[y * sstride + x]; - const int A = src[(y - 2) * sstride + x]; - const int B = src[(y - 1) * sstride + x]; - const int C = src[y * sstride + x - 2]; - const int D = src[y * sstride + x - 1]; - const int E = src[y * sstride + x + 1]; - const int F = src[y * sstride + x + 2]; - const int G = src[(y + 1) * sstride + x]; - const int H = src[(y + 2) * sstride + x]; - const int delta = - clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); - dst[y * dstride + x] = X + delta; - } - } -} - -// Vertically restricted filter -void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, unsigned int strength, - unsigned int damping) { - int x, y; - - for (y = 0; y < sizey; y++) { - for (x = 0; x < sizex; x++) { - const int X = src[y * sstride + x]; - const int A = src[y * sstride + x - 2]; - const int B = src[y * sstride + x - 1]; - const int C = src[y * sstride + x + 1]; - const int D = src[y * sstride + x + 2]; - const int delta = clpf_hsample(X, A, B, C, D, strength, damping); - dst[y * dstride + x] = X + delta; - } - } -} - -void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, 
- unsigned int strength, unsigned int damping) { - int x, y; - - for (y = 0; y < sizey; y++) { - for (x = 0; x < sizex; x++) { - const int X = src[y * sstride + x]; - const int A = src[y * sstride + x - 2]; - const int B = src[y * sstride + x - 1]; - const int C = src[y * sstride + x + 1]; - const int D = src[y * sstride + x + 2]; - const int delta = clpf_hsample(X, A, B, C, D, strength, damping); - dst[y * dstride + x] = X + delta; - } - } -} diff --git a/third_party/aom/av1/common/clpf_neon.c b/third_party/aom/av1/common/clpf_neon.c deleted file mode 100644 index f1a004c2c..000000000 --- a/third_party/aom/av1/common/clpf_neon.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_neon -#include "./clpf_simd.h" diff --git a/third_party/aom/av1/common/clpf_simd.h b/third_party/aom/av1/common/clpf_simd.h deleted file mode 100644 index c7ffc569a..000000000 --- a/third_party/aom/av1/common/clpf_simd.h +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./av1_rtcd.h" -#include "aom_ports/bitops.h" -#include "aom_ports/mem.h" - -// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) -SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold, - unsigned int adjdamp) { - v128 diff = v128_sub_16(a, b); - const v128 sign = v128_shr_n_s16(diff, 15); - diff = v128_abs_s16(diff); - const v128 s = - v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp)); - return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign); -} - -// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) -SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, - unsigned int adjdamp) { - const v256 diff16 = v256_sub_16(a, b); - v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16)); - const v128 sign = v128_cmplt_s8(diff, v128_zero()); - diff = v128_abs_s8(diff); - return v128_xor( - v128_add_8(sign, - v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength), - v128_shr_u8(diff, adjdamp)))), - sign); -} - -// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) + -// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) + -// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) + -// 3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d) -SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e, - v256 f, v256 g, v256 h, unsigned int s, - unsigned int dmp) { - const v128 bdeg = - v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)), - v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp))); - const v128 delta = v128_add_8( - v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)), - v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))), - v128_add_8(v128_add_8(bdeg, bdeg), 
bdeg)); - return v128_add_8( - v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)), - v128_shr_s8( - v128_add_8(v128_dup_8(8), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 4)); -} - -// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) + -// 3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d) + -SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d, - unsigned int s, unsigned int dmp) { - const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp)); - const v128 delta = - v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)), - v128_add_8(v128_add_8(bc, bc), bc)); - return v128_add_8( - v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)), - v128_shr_s8( - v128_add_8(v128_dup_8(4), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 3)); -} - -// Process blocks of width 8, two lines at a time, 8 bit. -static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 2) { - const v128 l1 = v128_load_aligned(src); - const v128 l2 = v128_load_aligned(src + sstride); - const v128 l3 = v128_load_aligned(src - sstride); - const v128 l4 = v128_load_aligned(src + 2 * sstride); - const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3); - const v256 b = v256_from_v128(l3, l1); - const v256 g = v256_from_v128(l2, l4); - const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride)); - const v256 c = v256_from_v128(v128_load_unaligned(src - 2), - v128_load_unaligned(src - 2 + sstride)); - const v256 d = v256_from_v128(v128_load_unaligned(src - 1), - v128_load_unaligned(src - 1 + sstride)); - const v256 e = v256_from_v128(v128_load_unaligned(src + 1), - v128_load_unaligned(src + 1 + sstride)); - const v256 f = v256_from_v128(v128_load_unaligned(src + 2), - v128_load_unaligned(src + 2 + sstride)); - const v128 o = 
calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h, - strength, adjdamp); - - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); - src += sstride * 2; - dst += dstride * 2; - } -} - -// Process blocks of width 4, four lines at a time, 8 bit. -static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 4) { - const v64 l0 = v64_load_aligned(src - 2 * sstride); - const v64 l1 = v64_load_aligned(src - sstride); - const v64 l2 = v64_load_aligned(src); - const v64 l3 = v64_load_aligned(src + sstride); - const v64 l4 = v64_load_aligned(src + 2 * sstride); - const v64 l5 = v64_load_aligned(src + 3 * sstride); - const v64 l6 = v64_load_aligned(src + 4 * sstride); - const v64 l7 = v64_load_aligned(src + 5 * sstride); - const v128 o = - calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3), - v256_from_v64(l1, l2, l3, l4), - v256_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src + sstride - 2), - v64_load_unaligned(src + 2 * sstride - 2), - v64_load_unaligned(src + 3 * sstride - 2)), - v256_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src + sstride - 1), - v64_load_unaligned(src + 2 * sstride - 1), - v64_load_unaligned(src + 3 * sstride - 1)), - v256_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + sstride + 1), - v64_load_unaligned(src + 2 * sstride + 1), - v64_load_unaligned(src + 3 * sstride + 1)), - v256_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + sstride + 2), - v64_load_unaligned(src + 2 * sstride + 2), - v64_load_unaligned(src + 3 * sstride + 2)), - v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7), - strength, adjdamp); - - u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); - u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); - u32_store_aligned(dst + 2 * 
dstride, v128_low_u32(v128_shr_n_byte(o, 4))); - u32_store_aligned(dst + 3 * dstride, v128_low_u32(o)); - - dst += 4 * dstride; - src += 4 * sstride; - } -} - -static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 2) { - const v256 x = v256_from_v128(v128_load_aligned(src), - v128_load_aligned(src + sstride)); - const v256 a = v256_from_v128(v128_load_unaligned(src - 2), - v128_load_unaligned(src - 2 + sstride)); - const v256 b = v256_from_v128(v128_load_unaligned(src - 1), - v128_load_unaligned(src - 1 + sstride)); - const v256 c = v256_from_v128(v128_load_unaligned(src + 1), - v128_load_unaligned(src + 1 + sstride)); - const v256 d = v256_from_v128(v128_load_unaligned(src + 2), - v128_load_unaligned(src + 2 + sstride)); - const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp); - - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); - src += sstride * 2; - dst += dstride * 2; - } -} - -// Process blocks of width 4, four lines at a time, 8 bit. 
-static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 4) { - const v256 a = v256_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src + sstride - 2), - v64_load_unaligned(src + 2 * sstride - 2), - v64_load_unaligned(src + 3 * sstride - 2)); - const v256 b = v256_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src + sstride - 1), - v64_load_unaligned(src + 2 * sstride - 1), - v64_load_unaligned(src + 3 * sstride - 1)); - const v256 c = v256_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + sstride + 1), - v64_load_unaligned(src + 2 * sstride + 1), - v64_load_unaligned(src + 3 * sstride + 1)); - const v256 d = v256_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + sstride + 2), - v64_load_unaligned(src + 2 * sstride + 2), - v64_load_unaligned(src + 3 * sstride + 2)); - - const v128 o = calc_hdelta( - v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride), - v64_load_aligned(src + 2 * sstride), - v64_load_aligned(src + 3 * sstride)), - a, b, c, d, strength, adjdamp); - - u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); - u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); - u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4))); - u32_store_aligned(dst + 3 * dstride, v128_low_u32(o)); - - dst += 4 * dstride; - src += 4 * sstride; - } -} - -void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, - unsigned int strength, unsigned int dmp) { - if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { - // Fallback to C for odd sizes: - // * block widths not 4 or 8 - // * block heights not a multiple of 4 if the block width is 4 - aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp); - } else { - (sizex == 4 ? 
SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))( - dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); - } -} - -void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, - int sstride, int sizex, int sizey, - unsigned int strength, unsigned int dmp) { - if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { - // Fallback to C for odd sizes: - // * block widths not 4 or 8 - // * block heights not a multiple of 4 if the block width is 4 - aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp); - } else { - (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))( - dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); - } -} - -// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) + -// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) + -// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) + -// 3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d) -SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, - v128 f, v128 g, v128 h, unsigned int s, - unsigned int dmp) { - const v128 bdeg = v128_add_16( - v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)), - v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp))); - const v128 delta = v128_add_16( - v128_add_16( - v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)), - v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))), - v128_add_16(v128_add_16(bdeg, bdeg), bdeg)); - return v128_add_16( - x, - v128_shr_s16( - v128_add_16(v128_dup_16(8), - v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))), - 4)); -} - -static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, - v128 f, v128 g, v128 h, uint16_t *dst, - unsigned int s, unsigned int dmp, int dstride) { - o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + 
dstride, v128_low_v64(o)); -} - -static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, - v128 f, v128 g, v128 h, uint16_t *dst, - unsigned int s, unsigned int adjdamp) { - v128_store_aligned(dst, - calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp)); -} - -// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) + -// 3/16 * constrain(c, x, s, dmp) + 1/16 * constrain(d, x, s, dmp) -SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, - unsigned int s, unsigned int dmp) { - const v128 bc = - v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp)); - const v128 delta = v128_add_16( - v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)), - v128_add_16(v128_add_16(bc, bc), bc)); - return v128_add_16( - x, - v128_shr_s16( - v128_add_16(v128_dup_16(4), - v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))), - 3)); -} - -static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, - uint16_t *dst, unsigned int s, - unsigned int adjdamp, int dstride) { - o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); -} - -static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, - uint16_t *dst, unsigned int s, - unsigned int adjdamp) { - v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp)); -} - -// Process blocks of width 4, two lines at time. 
-static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - const v64 l3 = v64_load_aligned(src - sstride); - const v64 l4 = v64_load_aligned(src + 2 * sstride); - const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3); - const v128 b = v128_from_v64(l3, l1); - const v128 g = v128_from_v64(l2, l4); - const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride)); - const v128 c = v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride)); - const v128 d = v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride)); - const v128 e = v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride)); - const v128 f = v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride)); - - calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst, - strength, adjdamp, dstride); - src += sstride * 2; - dst += dstride * 2; - } -} - -// The most simple case. Start here if you need to understand the functions. 
-static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y++) { - const v128 o = v128_load_aligned(src); - const v128 a = v128_load_aligned(src - 2 * sstride); - const v128 b = v128_load_aligned(src - 1 * sstride); - const v128 g = v128_load_aligned(src + sstride); - const v128 h = v128_load_aligned(src + 2 * sstride); - const v128 c = v128_load_unaligned(src - 2); - const v128 d = v128_load_unaligned(src - 1); - const v128 e = v128_load_unaligned(src + 1); - const v128 f = v128_load_unaligned(src + 2); - - calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp); - src += sstride; - dst += dstride; - } -} - -// Process blocks of width 4, horizontal filter, two lines at time. -static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y += 2) { - const v128 a = v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride)); - const v128 b = v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride)); - const v128 c = v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride)); - const v128 d = v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride)); - - calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src), - v64_load_unaligned(src + sstride)), - a, b, c, d, dst, strength, adjdamp, dstride); - src += sstride * 2; - dst += dstride * 2; - } -} - -// Process blocks of width 8, horizontal filter, two lines at time. 
-static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizey, - unsigned int strength, - unsigned int adjdamp) { - int y; - - for (y = 0; y < sizey; y++) { - const v128 o = v128_load_aligned(src); - const v128 a = v128_load_unaligned(src - 2); - const v128 b = v128_load_unaligned(src - 1); - const v128 c = v128_load_unaligned(src + 1); - const v128 d = v128_load_unaligned(src + 2); - - calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp); - src += sstride; - dst += dstride; - } -} - -void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizex, - int sizey, unsigned int strength, - unsigned int dmp) { - if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { - // Fallback to C for odd sizes: - // * block width not 4 or 8 - // * block heights not a multiple of 2 if the block width is 4 - aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength, - dmp); - } else { - (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))( - dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); - } -} - -void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, - int dstride, int sstride, int sizex, - int sizey, unsigned int strength, - unsigned int dmp) { - if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { - // Fallback to C for odd sizes: - // * block width not 4 or 8 - // * block heights not a multiple of 2 if the block width is 4 - aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength, - dmp); - } else { - (sizex == 4 ? 
SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))( - dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); - } -} diff --git a/third_party/aom/av1/common/clpf_sse2.c b/third_party/aom/av1/common/clpf_sse2.c deleted file mode 100644 index e29c2ab7e..000000000 --- a/third_party/aom/av1/common/clpf_sse2.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_sse2 -#include "./clpf_simd.h" diff --git a/third_party/aom/av1/common/clpf_sse4.c b/third_party/aom/av1/common/clpf_sse4.c deleted file mode 100644 index 537139f17..000000000 --- a/third_party/aom/av1/common/clpf_sse4.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_sse4_1 -#include "./clpf_simd.h" diff --git a/third_party/aom/av1/common/clpf_ssse3.c b/third_party/aom/av1/common/clpf_ssse3.c deleted file mode 100644 index d7ed8dec5..000000000 --- a/third_party/aom/av1/common/clpf_ssse3.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_ssse3 -#include "./clpf_simd.h" diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h index 8611b776f..72c6d3a1e 100644 --- a/third_party/aom/av1/common/common.h +++ b/third_party/aom/av1/common/common.h @@ -20,6 +20,7 @@ #include "aom_mem/aom_mem.h" #include "aom/aom_integer.h" #include "aom_ports/bitops.h" +#include "config/aom_config.h" #ifdef __cplusplus extern "C" { @@ -53,6 +54,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #define AOM_FRAME_MARKER 0x2 +#define AV1_MIN_TILE_SIZE_BYTES 1 + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h index 1a74fe76e..f521f10bf 100644 --- a/third_party/aom/av1/common/common_data.h +++ b/third_party/aom/av1/common/common_data.h @@ -20,600 +20,78 @@ extern "C" { #endif -#if CONFIG_EXT_PARTITION -#define IF_EXT_PARTITION(...) __VA_ARGS__, -#else -#define IF_EXT_PARTITION(...) 
-#endif - -// Log 2 conversion lookup tables for block width and height -static const uint8_t b_width_log2_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, 0, - 0, -#endif - 0, 0, - 1, 1, - 1, 2, - 2, 2, - 3, 3, - 3, 4, - 4, IF_EXT_PARTITION(4, 5, 5) 0, - 2, 1, - 3, 2, - 4, IF_EXT_PARTITION(3, 5) -}; -static const uint8_t b_height_log2_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, 0, - 0, -#endif - 0, 1, - 0, 1, - 2, 1, - 2, 3, - 2, 3, - 4, 3, - 4, IF_EXT_PARTITION(5, 4, 5) 2, - 0, 3, - 1, 4, - 2, IF_EXT_PARTITION(5, 3) +// Log 2 conversion lookup tables in units of mode info(4x4). +static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 }; -// Log 2 conversion lookup tables for modeinfo width and height -static const uint8_t mi_width_log2_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, 0, - 0, -#endif - 0, 0, - 1, 1, - 1, 2, - 2, 2, - 3, 3, - 3, 4, - 4, IF_EXT_PARTITION(4, 5, 5) 0, - 2, 1, - 3, 2, - 4, IF_EXT_PARTITION(3, 5) -#else // CONFIG_CB4X4 - 0, 0, - 0, 0, - 0, 1, - 1, 1, - 2, 2, - 2, 3, - 3, IF_EXT_PARTITION(3, 4, 4) 0, - 1, 0, - 2, 1, - 3, IF_EXT_PARTITION(2, 4) -#endif -}; -static const uint8_t mi_height_log2_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, 0, - 0, -#endif - 0, 1, - 0, 1, - 2, 1, - 2, 3, - 2, 3, - 4, 3, - 4, IF_EXT_PARTITION(5, 4, 5) 2, - 0, 3, - 1, 4, - 2, IF_EXT_PARTITION(5, 3) -#else // CONFIG_CB4X4 - 0, 0, - 0, 0, - 1, 0, - 1, 2, - 1, 2, - 3, 2, - 3, IF_EXT_PARTITION(4, 3, 4) 1, - 0, 2, - 0, 3, - 1, IF_EXT_PARTITION(2, 4) -#endif +static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { + 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 }; -/* clang-format off */ static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 
1, 1, 1, -#endif - 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, - IF_EXT_PARTITION(16, 32, 32) 1, 4, 2, 8, 4, 16, IF_EXT_PARTITION(8, 32) -#else // CONFIG_CB4X4 - 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16) 1, 2, 1, 4, - 2, 8, IF_EXT_PARTITION(4, 16) -#endif + 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 }; + static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, 1, -#endif - 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, - IF_EXT_PARTITION(32, 16, 32) 4, 1, 8, 2, 16, 4, IF_EXT_PARTITION(32, 8) -#else // CONFIG_CB4X4 - 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16) 2, 1, 4, 1, - 8, 2, IF_EXT_PARTITION(16, 4) -#endif + 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 }; -/* clang-format on */ // Width/height lookup tables in units of various block sizes static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2, 2, - 4, -#endif - 4, 4, - 8, 8, - 8, 16, - 16, 16, - 32, 32, - 32, 64, - 64, IF_EXT_PARTITION(64, 128, 128) 4, - 16, 8, - 32, 16, - 64, IF_EXT_PARTITION(32, 128) + 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, + 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 }; static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2, 4, - 2, -#endif - 4, 8, - 4, 8, - 16, 8, - 16, 32, - 16, 32, - 64, 32, - 64, IF_EXT_PARTITION(128, 64, 128) 16, - 4, 32, - 8, 64, - 16, IF_EXT_PARTITION(128, 32) -}; - -static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 1, - 2, 2, - 2, 4, - 4, 4, - 8, 8, - 8, 16, - 16, IF_EXT_PARTITION(16, 32, 32) 1, - 4, 2, - 8, 4, - 16, IF_EXT_PARTITION(8, 32) -}; -static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 2, - 1, 2, - 4, 2, - 4, 
8, - 4, 8, - 16, 8, - 16, IF_EXT_PARTITION(32, 16, 32) 4, - 1, 8, - 2, 16, - 4, IF_EXT_PARTITION(32, 8) -}; -static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 1, - 1, 1, - 1, 2, - 2, 2, - 4, 4, - 4, 8, - 8, IF_EXT_PARTITION(8, 16, 16) 1, - 2, 1, - 4, 2, - 8, IF_EXT_PARTITION(4, 16) -}; -static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 1, - 1, 1, - 2, 1, - 2, 4, - 2, 4, - 8, 4, - 8, IF_EXT_PARTITION(16, 8, 16) 2, - 1, 4, - 1, 8, - 2, IF_EXT_PARTITION(16, 4) -}; -static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 1, - 1, 1, - 1, 1, - 1, 1, - 2, 2, - 2, 4, - 4, IF_EXT_PARTITION(4, 8, 8) 1, - 1, 1, - 2, 2, - 4, IF_EXT_PARTITION(2, 8) -}; -static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 1, 1, - 1, -#endif - 1, 1, - 1, 1, - 1, 1, - 1, 2, - 1, 2, - 4, 2, - 4, IF_EXT_PARTITION(8, 4, 8) 1, - 1, 2, - 1, 4, - 2, IF_EXT_PARTITION(8, 2) + 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, + 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 }; // AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize))) static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 0, 0, - 0, -#endif - 0, 0, - 0, 1, - 1, 1, - 2, 2, - 2, 3, - 3, 3, - 3, IF_EXT_PARTITION(3, 3, 3) 0, - 0, 1, - 1, 2, - 2, IF_EXT_PARTITION(3, 3) + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 }; static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2, 3, - 3, -#endif - 4, 5, - 5, 6, - 7, 7, - 8, 9, - 9, 10, - 11, 11, - 12, IF_EXT_PARTITION(13, 13, 14) 6, - 6, 8, - 8, 10, - 10, IF_EXT_PARTITION(12, 12) + 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 
10, 10 }; /* clang-format off */ -#if CONFIG_EXT_PARTITION_TYPES -static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES_ALL] = -#else -static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES_ALL] = -#endif // CONFIG_EXT_PARTITION_TYPES -{ +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { { // PARTITION_NONE -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_2X2, BLOCK_2X4, BLOCK_4X2, -#endif - // 4X4 - BLOCK_4X4, - // 4X8, 8X4, 8X8 - BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, - // 8X16, 16X8, 16X16 - BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, - // 16X32, 32X16, 32X32 - BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, - // 32X64, 64X32, 64X64 - BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, - // 32X8, 16X64, 64X16 - BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_32X128, BLOCK_128X32 -#endif // CONFIG_EXT_PARTITION + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, + BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 }, { // PARTITION_HORZ -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 4X4 - BLOCK_4X2, -#else - // 4X4 - BLOCK_INVALID, -#endif - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, 
BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 4X4 - BLOCK_2X4, -#else - // 4X4 - BLOCK_INVALID, -#endif - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_SPLIT -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#endif - // 4X4 - BLOCK_INVALID, - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION -#if CONFIG_EXT_PARTITION_TYPES + BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, + BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 }, { // 
PARTITION_HORZ_A -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#endif - // 4X4 - BLOCK_INVALID, - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_HORZ_B -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#endif - // 4X4 - BLOCK_INVALID, - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT_A -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#endif - // 4X4 - BLOCK_INVALID, - // 4X8, 8X4, 8X8 - 
BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_VERT_B -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#endif - // 4X4 - BLOCK_INVALID, - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, - // 8X16, 16X8, 16X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_HORZ_4 -#if CONFIG_CB4X4 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 4X4 - BLOCK_INVALID, -#else - // 4X4 - BLOCK_INVALID, -#endif - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 8X16, 16X8, 16X16 BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8, - // 
32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X32, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION + BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID }, { // PARTITION_VERT_4 -#if CONFIG_CB4X4 - // 2X2, 2X4, 4X2, - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 4X4 - BLOCK_INVALID, -#else - // 4X4 - BLOCK_INVALID, -#endif - // 4X8, 8X4, 8X8 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 8X16, 16X8, 16X16 BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, - // 16X32, 32X16, 32X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32, - // 32X64, 64X32, 64X64 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X128, -#endif // CONFIG_EXT_PARTITION - // 4X16, 16X4, 8X32 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - // 32X8, 16X64, 64X16 - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - BLOCK_INVALID, BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES + BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID } }; static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { - // 2X2, 2X4, 4X2, -#if CONFIG_CHROMA_2X2 - TX_2X2, TX_2X2, TX_2X2, -#elif CONFIG_CHROMA_SUB8X8 - TX_4X4, TX_4X4, TX_4X4, -#endif // 4X4 TX_4X4, // 4X8, 8X4, 8X8 @@ -624,1436 +102,291 @@ static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { TX_16X16, TX_16X16, TX_32X32, // 32X64, 64X32, TX_32X32, TX_32X32, -#if CONFIG_TX64X64 // 64X64 TX_64X64, -#if CONFIG_EXT_PARTITION // 64x128, 128x64, 128x128 TX_64X64, TX_64X64, TX_64X64, -#endif // CONFIG_EXT_PARTITION -#else - // 64X64 - TX_32X32, -#if CONFIG_EXT_PARTITION - // 64x128, 
128x64, 128x128 - TX_32X32, TX_32X32, TX_32X32, -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_TX64X64 // 4x16, 16x4, 8x32 TX_4X4, TX_4X4, TX_8X8, // 32x8, 16x64 64x16 - TX_8X8, TX_16X16, TX_16X16, -#if CONFIG_EXT_PARTITION - // 32x128 128x32 - TX_32X32, TX_32X32 -#endif // CONFIG_EXT_PARTITION + TX_8X8, TX_16X16, TX_16X16 }; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { - // 2X2, 2X4, 4X2, -#if CONFIG_CHROMA_2X2 - TX_2X2, TX_2X2, TX_2X2, -#elif CONFIG_CHROMA_SUB8X8 - TX_4X4, TX_4X4, TX_4X4, -#endif // CONFIG_CHROMA_SUB8X8 - // 4X4 - TX_4X4, - // 4X8, 8X4, 8X8 - TX_4X8, TX_8X4, TX_8X8, - // 8X16, 16X8, 16X16 - TX_8X16, TX_16X8, TX_16X16, - // 16X32, 32X16, 32X32 - TX_16X32, TX_32X16, TX_32X32, -#if CONFIG_TX64X64 - // 32X64, 64X32, - TX_32X64, TX_64X32, - // 64X64 - TX_64X64, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_64X64, TX_64X64, TX_64X64, -#endif // CONFIG_EXT_PARTITION -#else - // 32X64, 64X32, - TX_32X32, TX_32X32, - // 64X64 - TX_32X32, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_32X32, TX_32X32, TX_32X32, -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_TX64X64 -#if CONFIG_RECT_TX_EXT - // 4x16, 16x4, 8x32 - TX_4X16, TX_16X4, TX_8X32, - // 32x8 - TX_32X8, -#else - // 4x16, 16x4, 8x32 - TX_4X8, TX_8X4, TX_8X16, - // 32x8 - TX_16X8, -#endif - // 16x64, 64x16 - TX_16X32, TX_32X16, -#if CONFIG_EXT_PARTITION - // 32x128 128x32 - TX_32X32, TX_32X32 -#endif // CONFIG_EXT_PARTITION -}; - -#if CONFIG_RECT_TX_EXT -static const TX_SIZE quarter_txsize_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - TX_INVALID, TX_INVALID, TX_INVALID, -#endif - // 4x4, - TX_INVALID, - // 4x8, 8x4, 8x8, - TX_INVALID, TX_INVALID, TX_INVALID, - // 8x16, 16x8, 16x16, - TX_4X16, TX_16X4, TX_INVALID, - // 16x32, 32x16, 32x32, - TX_8X32, TX_32X8, TX_INVALID, - // 32x64, 64x32, 64x64 - TX_INVALID, TX_INVALID, TX_INVALID, -#if 
CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_INVALID, TX_INVALID, TX_INVALID, -#endif - // 4x16, 16x4, 8x32 - TX_4X16, TX_16X4, TX_8X32, - // 32x8 16x64 64x16 - TX_32X8, TX_INVALID, TX_INVALID, -#if CONFIG_EXT_PARTITION - // 32x128 128x32 - TX_INVALID, TX_INVALID -#endif // CONFIG_EXT_PARTITION + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X16, TX_16X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X32, TX_32X16, TX_32X32, + // 32X64, 64X32, + TX_32X64, TX_64X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, + TX_4X16, TX_16X4, + // 8x32, 32x8 + TX_8X32, TX_32X8, + // 16x64, 64x16 + TX_16X64, TX_64X16 }; -#endif -#else -#define max_txsize_rect_lookup max_txsize_lookup -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) static const TX_TYPE_1D vtx_tab[TX_TYPES] = { DCT_1D, ADST_1D, DCT_1D, ADST_1D, -#if CONFIG_EXT_TX FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, -#endif // CONFIG_EXT_TX }; static const TX_TYPE_1D htx_tab[TX_TYPES] = { DCT_1D, DCT_1D, ADST_1D, ADST_1D, -#if CONFIG_EXT_TX DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, -#endif // CONFIG_EXT_TX -}; - -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) -// Same as "max_txsize_lookup[bsize] - TX_8X8", except for rectangular -// block which may use a rectangular transform, in which case it is -// "(max_txsize_lookup[bsize] + 1) - TX_8X8", invalid for bsize < 8X8 -static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - INT32_MIN, INT32_MIN, INT32_MIN, -#endif - // 4X4, - INT32_MIN, - // 4X8, 8X4, 8X8, - TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, -#else // CONFIG_CB4X4 - // 4X4 - INT32_MIN, - // 4X8, 8X4, 8X8 - INT32_MIN, INT32_MIN, 
TX_8X8 - TX_8X8, -#endif // CONFIG_CB4X4 - // 8X16, 16X8, 16X16 - TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, - // 16X32, 32X16, 32X32 - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, -#if CONFIG_TX64X64 - // 32X64, 64X32, - TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, - // 64X64 - TX_64X64 - TX_8X8, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, -#endif // CONFIG_EXT_PARTITION -#else - // 32X64, 64X32, - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, - // 64X64 - TX_32X32 - TX_8X8, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_TX64X64 - // TODO(david.barker): Change these if we support rectangular transforms - // for 4:1 shaped partitions - // 4x16, 16x4, 8x32 - TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, - // 32x8, 16x64, 64x16 - TX_8X8 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8 -#endif // CONFIG_EXT_PARTITION -}; -#else -// Same as "max_txsize_lookup[bsize] - TX_8X8", invalid for bsize < 8X8 -static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2, - INT32_MIN, INT32_MIN, INT32_MIN, -#endif - // 4X4 - INT32_MIN, - // 4X8, 8X4, 8X8 - INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8, - // 8X16, 16X8, 16X16 - TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_16X16 - TX_8X8, - // 16X32, 32X16, 32X32 - TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_32X32 - TX_8X8, -#if CONFIG_TX64X64 - // 32X64, 64X32, - TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, - // 64X64 - TX_64X64 - TX_8X8, -#if CONFIG_EXT_PARTITION - // 64x128, 128x64, 128x128 - TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, -#endif // CONFIG_EXT_PARTITION -#else - // 32X64, 64X32, - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, - // 64X64 - TX_32X32 - TX_8X8, -#if CONFIG_EXT_PARTITION 
- // 64x128, 128x64, 128x128 - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_TX64X64 - // 4x16, 16x4, 8x32 - TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, - // 32x8 16x64, 64x16 - TX_8X8 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, -#if CONFIG_EXT_PARTITION - // 32x128, 128x32 - TX_32X32 - TX_8X8, TX_32X32 - TX_8X8 -#endif // CONFIG_EXT_PARTITION }; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) -#define inter_tx_size_cat_lookup intra_tx_size_cat_lookup +#define TXSIZE_CAT_INVALID (-1) /* clang-format on */ static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - TX_2X2, // TX_2X2 -#endif TX_4X4, // TX_4X4 TX_4X4, // TX_8X8 TX_8X8, // TX_16X16 TX_16X16, // TX_32X32 -#if CONFIG_TX64X64 TX_32X32, // TX_64X64 -#endif // CONFIG_TX64X64 TX_4X4, // TX_4X8 TX_4X4, // TX_8X4 TX_8X8, // TX_8X16 TX_8X8, // TX_16X8 TX_16X16, // TX_16X32 TX_16X16, // TX_32X16 -#if CONFIG_TX64X64 TX_32X32, // TX_32X64 TX_32X32, // TX_64X32 -#endif // CONFIG_TX64X64 - TX_4X4, // TX_4X16 - TX_4X4, // TX_16X4 - TX_8X8, // TX_8X32 - TX_8X8, // TX_32X8 + TX_4X8, // TX_4X16 + TX_8X4, // TX_16X4 + TX_8X16, // TX_8X32 + TX_16X8, // TX_32X8 + TX_16X32, // TX_16X64 + TX_32X16, // TX_64X16 }; static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - TX_2X2, // TX_2X2 -#endif TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 -#if CONFIG_TX64X64 TX_64X64, // TX_64X64 -#endif // CONFIG_TX64X64 TX_4X4, // TX_4X8 TX_8X8, // TX_8X4 TX_8X8, // TX_8X16 TX_16X16, // TX_16X8 TX_16X16, // TX_16X32 TX_32X32, // TX_32X16 -#if CONFIG_TX64X64 TX_32X32, // TX_32X64 TX_64X64, // TX_64X32 -#endif // CONFIG_TX64X64 TX_4X4, // TX_4X16 TX_16X16, // TX_16X4 TX_8X8, // TX_8X32 TX_32X32, // TX_32X8 + TX_16X16, // TX_16X64 + TX_64X64, // TX_64X16 }; static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - TX_2X2, // TX_2X2 -#endif TX_4X4, // TX_4X4 
TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 -#if CONFIG_TX64X64 TX_64X64, // TX_64X64 -#endif // CONFIG_TX64X64 TX_8X8, // TX_4X8 TX_4X4, // TX_8X4 TX_16X16, // TX_8X16 TX_8X8, // TX_16X8 TX_32X32, // TX_16X32 TX_16X16, // TX_32X16 -#if CONFIG_TX64X64 TX_64X64, // TX_32X64 TX_32X32, // TX_64X32 -#endif // CONFIG_TX64X64 TX_16X16, // TX_4X16 TX_4X4, // TX_16X4 TX_32X32, // TX_8X32 TX_8X8, // TX_32X8 + TX_64X64, // TX_16X64 + TX_16X16, // TX_64X16 }; -#if CONFIG_CHROMA_2X2 -#define TX_SIZE_W_MIN 2 -#else #define TX_SIZE_W_MIN 4 -#endif // Transform block width in pixels static const int tx_size_wide[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 2, -#endif - 4, 8, 16, 32, -#if CONFIG_TX64X64 - 64, -#endif // CONFIG_TX64X64 - 4, 8, 8, 16, 16, 32, -#if CONFIG_TX64X64 - 32, 64, -#endif // CONFIG_TX64X64 - 4, 16, 8, 32 + 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, }; -#if CONFIG_CHROMA_2X2 -#define TX_SIZE_H_MIN 2 -#else #define TX_SIZE_H_MIN 4 -#endif // Transform block height in pixels static const int tx_size_high[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 2, -#endif - 4, 8, 16, 32, -#if CONFIG_TX64X64 - 64, -#endif // CONFIG_TX64X64 - 8, 4, 16, 8, 32, 16, -#if CONFIG_TX64X64 - 64, 32, -#endif // CONFIG_TX64X64 - 16, 4, 32, 8 + 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, }; // Transform block width in unit static const int tx_size_wide_unit[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 1, 2, 4, 8, 16, -#if CONFIG_TX64X64 - 32, -#endif // CONFIG_TX64X64 - 2, 4, 4, 8, 8, 16, -#if CONFIG_TX64X64 - 16, 32, -#endif // CONFIG_TX64X64 - 2, 8, 4, 16 -#else // CONFIG_CHROMA_2X2 - 1, 2, 4, 8, -#if CONFIG_TX64X64 - 16, -#endif // CONFIG_TX64X64 - 1, 2, 2, 4, 4, 8, -#if CONFIG_TX64X64 - 8, 16, -#endif // CONFIG_TX64X64 - 1, 4, 2, 8 -#endif // CONFIG_CHROMA_2X2 + 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, }; // Transform block height in unit static const int tx_size_high_unit[TX_SIZES_ALL] = { -#if 
CONFIG_CHROMA_2X2 - 1, 2, 4, 8, 16, -#if CONFIG_TX64X64 - 32, -#endif // CONFIG_TX64X64 - 4, 2, 8, 4, 16, 8, -#if CONFIG_TX64X64 - 32, 16, -#endif // CONFIG_TX64X64 - 8, 2, 16, 4 -#else // CONFIG_CHROMA_2X2 - 1, 2, 4, 8, -#if CONFIG_TX64X64 - 16, -#endif // CONFIG_TX64X64 - 2, 1, 4, 2, 8, 4, -#if CONFIG_TX64X64 - 16, 8, -#endif // CONFIG_TX64X64 - 4, 1, 8, 2 -#endif // CONFIG_CHROMA_2X2 + 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, }; // Transform block width in log2 static const int tx_size_wide_log2[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 1, -#endif - 2, 3, 4, 5, -#if CONFIG_TX64X64 - 6, -#endif // CONFIG_TX64X64 - 2, 3, 3, 4, 4, 5, -#if CONFIG_TX64X64 - 5, 6, -#endif // CONFIG_TX64X64 - 2, 4, 3, 5 + 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, }; // Transform block height in log2 static const int tx_size_high_log2[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 1, -#endif - 2, 3, 4, 5, -#if CONFIG_TX64X64 - 6, -#endif // CONFIG_TX64X64 - 3, 2, 4, 3, 5, 4, -#if CONFIG_TX64X64 - 6, 5, -#endif // CONFIG_TX64X64 - 4, 2, 5, 3 + 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, }; -#define TX_UNIT_WIDE_LOG2 (MI_SIZE_LOG2 - tx_size_wide_log2[0]) -#define TX_UNIT_HIGH_LOG2 (MI_SIZE_LOG2 - tx_size_high_log2[0]) - -static const int tx_size_2d[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - 4, -#endif - 16, 64, 256, 1024, -#if CONFIG_TX64X64 - 4096, -#endif // CONFIG_TX64X64 - 32, 32, 128, 128, 512, 512, -#if CONFIG_TX64X64 - 2048, 2048, -#endif // CONFIG_TX64X64 - 64, 64, 256, 256 +static const int tx_size_2d[TX_SIZES_ALL + 1] = { + 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, + 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, }; static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - BLOCK_2X2, // TX_2X2 -#endif BLOCK_4X4, // TX_4X4 BLOCK_8X8, // TX_8X8 BLOCK_16X16, // TX_16X16 BLOCK_32X32, // TX_32X32 -#if CONFIG_TX64X64 BLOCK_64X64, // TX_64X64 -#endif // CONFIG_TX64X64 BLOCK_4X8, // TX_4X8 BLOCK_8X4, // TX_8X4 
BLOCK_8X16, // TX_8X16 BLOCK_16X8, // TX_16X8 BLOCK_16X32, // TX_16X32 BLOCK_32X16, // TX_32X16 -#if CONFIG_TX64X64 BLOCK_32X64, // TX_32X64 BLOCK_64X32, // TX_64X32 -#endif // CONFIG_TX64X64 BLOCK_4X16, // TX_4X16 BLOCK_16X4, // TX_16X4 BLOCK_8X32, // TX_8X32 BLOCK_32X8, // TX_32X8 + BLOCK_16X64, // TX_16X64 + BLOCK_64X16, // TX_64X16 }; static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - TX_2X2, // TX_2X2 -#endif TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 -#if CONFIG_TX64X64 TX_64X64, // TX_64X64 -#endif // CONFIG_TX64X64 TX_4X4, // TX_4X8 TX_4X4, // TX_8X4 TX_8X8, // TX_8X16 TX_8X8, // TX_16X8 TX_16X16, // TX_16X32 TX_16X16, // TX_32X16 -#if CONFIG_TX64X64 TX_32X32, // TX_32X64 TX_32X32, // TX_64X32 -#endif // CONFIG_TX64X64 TX_4X4, // TX_4X16 TX_4X4, // TX_16X4 TX_8X8, // TX_8X32 TX_8X8, // TX_32X8 + TX_16X16, // TX_16X64 + TX_16X16, // TX_64X16 }; static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 - TX_2X2, // TX_2X2 -#endif TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 -#if CONFIG_TX64X64 TX_64X64, // TX_64X64 -#endif // CONFIG_TX64X64 TX_8X8, // TX_4X8 TX_8X8, // TX_8X4 TX_16X16, // TX_8X16 TX_16X16, // TX_16X8 TX_32X32, // TX_16X32 TX_32X32, // TX_32X16 -#if CONFIG_TX64X64 TX_64X64, // TX_32X64 TX_64X64, // TX_64X32 -#endif // CONFIG_TX64X64 TX_16X16, // TX_4X16 TX_16X16, // TX_16X4 TX_32X32, // TX_8X32 TX_32X32, // TX_32X8 + TX_64X64, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { + 0, // TX_4X4 + 2, // TX_8X8 + 4, // TX_16X16 + 6, // TX_32X32 + 6, // TX_64X64 + 1, // TX_4X8 + 1, // TX_8X4 + 3, // TX_8X16 + 3, // TX_16X8 + 5, // TX_16X32 + 5, // TX_32X16 + 6, // TX_32X64 + 6, // TX_64X32 + 2, // TX_4X16 + 2, // TX_16X4 + 4, // TX_8X32 + 4, // TX_32X8 + 5, // TX_16X64 + 5, // TX_64X16 }; /* clang-format off */ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, 
// ONLY_4X4 - TX_8X8, // ALLOW_8X8 - TX_16X16, // ALLOW_16X16 - TX_32X32, // ALLOW_32X32 -#if CONFIG_TX64X64 - TX_64X64, // ALLOW_64X64 + TX_64X64, // TX_MODE_LARGEST TX_64X64, // TX_MODE_SELECT -#else - TX_32X32, // TX_MODE_SELECT -#endif // CONFIG_TX64X64 }; /* clang-format on */ static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { -// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 -// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { { BLOCK_2X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } }, - { { BLOCK_2X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } }, - { { BLOCK_4X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } }, - { { BLOCK_4X4, BLOCK_4X2 }, { BLOCK_2X4, BLOCK_2X2 } }, - { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_2X4 } }, - { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X2 } }, -#elif CONFIG_CB4X4 + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, - { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, -#else - { { BLOCK_4X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } }, - { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_INVALID } }, - { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_INVALID } }, -#endif + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, - { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, - { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } }, { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, - { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, - { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { 
{ BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } }, { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, - { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, - { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } }, { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, -#if CONFIG_EXT_PARTITION { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, -#endif // CONFIG_EXT_PARTITION - { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, - { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } }, { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, - { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }, -#if CONFIG_EXT_PARTITION - { { BLOCK_32X128, BLOCK_32X64 }, { BLOCK_INVALID, BLOCK_16X64 } }, - { { BLOCK_128X32, BLOCK_INVALID }, { BLOCK_64X32, BLOCK_64X16 } }, -#endif // CONFIG_EXT_PARTITION + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } }; -static const TX_SIZE uv_txsize_lookup[BLOCK_SIZES_ALL][TX_SIZES_ALL][2][2] = { -// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 -// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 -#if CONFIG_CHROMA_2X2 - { - // BLOCK_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if 
CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - }, - { - // BLOCK_2X4 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - }, - { - // BLOCK_4X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, 
{ TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#if CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif // CONFIG_TX64X64 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - }, -#elif CONFIG_CHROMA_SUB8X8 - { - // BLOCK_2x2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, - { - // BLOCK_2X4 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, 
TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, - { - // BLOCK_4X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, -#endif - { -// BLOCK_4X4 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#else - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, 
TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, - { -// BLOCK_4X8 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#else - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 -#if CONFIG_CHROMA_2X2 - { { TX_4X8, TX_4X4 }, { TX_2X2, TX_2X2 } }, // used -#else - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used -#endif - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, - { -// BLOCK_8X4 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#else - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_CHROMA_2X2 - { { TX_8X4, 
TX_2X2 }, { TX_4X4, TX_2X2 } }, // used -#else - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used -#endif - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - }, - { -// BLOCK_8X8 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } }, - }, - { -// BLOCK_8X16 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { 
TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, // used - { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, -#if CONFIG_TX64X64 - { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X16, TX_8X8 }, { TX_4X16, TX_4X8 } }, - { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } }, - }, - { -// BLOCK_16X8 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, // used - { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, -#if CONFIG_TX64X64 - { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, - { { TX_16X8, TX_16X4 }, { TX_8X8, TX_8X4 } }, - }, - { -// BLOCK_16X16 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { 
TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } }, - { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - }, - { -// BLOCK_16X32 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } }, // used - { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } }, -#if CONFIG_TX64X64 - { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } }, - { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } }, - { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X32, TX_8X16 }, { TX_8X32, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - }, - { -// BLOCK_32X16 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, 
-#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } }, - { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } }, // used -#if CONFIG_TX64X64 - { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, - { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_32X32 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } }, - { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_32X64 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { 
TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X16, TX_16X16 } }, - { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_64X32 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } }, - { { TX_32X16, TX_16X16 }, { TX_32X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, - { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_64X64 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { 
TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#if CONFIG_TX64X64 - { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } }, - { { TX_32X16, TX_32X16 }, { TX_32X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } }, - { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, -#if CONFIG_EXT_PARTITION - { -// BLOCK_64X128 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#if CONFIG_TX64X64 - { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } }, - { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } }, -#if CONFIG_TX64X64 - { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } }, - { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } }, - { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } }, - { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } }, - { { TX_INVALID, TX_INVALID }, { TX_INVALID, 
TX_INVALID } }, - }, - { -// BLOCK_128X64 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#if CONFIG_TX64X64 - { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } }, - { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } }, -#if CONFIG_TX64X64 - { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } }, - { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_128X128 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#if CONFIG_TX64X64 - { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } }, - { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } }, -#if CONFIG_TX64X64 - { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } }, - { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 
}, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, -#endif // CONFIG_EXT_PARTITION - { -// BLOCK_4X16 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_4X4, TX_4X4 }, { TX_2X2, TX_2X2 } }, -#else - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X16, TX_4X8 }, { TX_4X4, TX_4X4 } }, - { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } }, - }, - { -// BLOCK_16X4 -#if CONFIG_CHROMA_2X2 - { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } }, - { { TX_4X4, TX_2X2 }, { TX_4X4, TX_2X2 } }, -#else - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 
} }, -#endif // CONFIG_TX64X64 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_16X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - { { TX_16X4, TX_4X4 }, { TX_8X4, TX_4X4 } }, - }, - { -// BLOCK_8X32 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } }, - { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } }, - { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } }, - { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X32, TX_8X16 }, { TX_4X16, TX_4X16 } }, - { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } }, - }, - { -// BLOCK_32X8 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif // CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } }, - { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } }, -#if CONFIG_TX64X64 - { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { 
TX_8X8, TX_4X4 } }, -#endif // CONFIG_TX64X64 - { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, - { { TX_32X8, TX_16X4 }, { TX_16X8, TX_16X4 } }, - }, - { -// BLOCK_16X64 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#endif - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X32, TX_16X32 }, { TX_8X16, TX_8X16 } }, - { { TX_16X16, TX_16X16 }, { TX_8X16, TX_8X16 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, -#endif - { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } }, - { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X32, TX_8X32 }, { TX_8X32, TX_8X32 } }, - { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } }, - }, - { -// BLOCK_64X16 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, -#endif - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X16, TX_16X8 }, { TX_16X16, TX_16X8 } }, - { { TX_32X16, TX_16X8 }, { TX_32X16, TX_16X8 } }, -#if CONFIG_TX64X64 - { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, - { { TX_16X16, TX_8X8 }, { 
TX_16X16, TX_8X8 } }, -#endif - { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } }, - { { TX_32X8, TX_32X8 }, { TX_32X8, TX_32X8 } }, - }, -#if CONFIG_EXT_PARTITION - { -// BLOCK_32X128 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } }, -#endif - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } }, - { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } }, - { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X32, TX_8X32 }, { TX_8X32, TX_8X32 } }, - { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } }, - }, - { -// BLOCK_128X32 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, - { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, -#endif - { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } }, - { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } }, - { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } }, - { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } }, - { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } }, - { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } }, - { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } }, - { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } }, - { { TX_8X32, TX_8X16 }, { TX_8X32, TX_8X16 } }, - { { TX_32X8, TX_32X8 }, { 
TX_32X8, TX_32X8 } }, - }, -#endif -}; - -// Generates 4 bit field in which each bit set to 1 represents -// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 -// and 8x8. 1000 means we just split the 64x64 to 32x32 +// Generates 5 bit field in which each bit set to 1 represents +// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 +// and 8x8. 10000 means we just split the 128x128 to 64x64 /* clang-format off */ static const struct { PARTITION_CONTEXT above; PARTITION_CONTEXT left; } partition_context_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_EXT_PARTITION -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 31, 31 }, // 2X2 - {0b11111, 0b11111} - { 31, 31 }, // 2X4 - {0b11111, 0b11111} - { 31, 31 }, // 4X2 - {0b11111, 0b11111} -#endif { 31, 31 }, // 4X4 - {0b11111, 0b11111} { 31, 30 }, // 4X8 - {0b11111, 0b11110} { 30, 31 }, // 8X4 - {0b11110, 0b11111} @@ -2070,131 +403,29 @@ static const struct { { 16, 0 }, // 64X128- {0b10000, 0b00000} { 0, 16 }, // 128X64- {0b00000, 0b10000} { 0, 0 }, // 128X128-{0b00000, 0b00000} - { 31, 28 }, // 4X16 - {0b11111, 0b11100} { 28, 31 }, // 16X4 - {0b11100, 0b11111} { 30, 24 }, // 8X32 - {0b11110, 0b11000} { 24, 30 }, // 32X8 - {0b11000, 0b11110} { 28, 16 }, // 16X64 - {0b11100, 0b10000} { 16, 28 }, // 64X16 - {0b10000, 0b11100} - { 24, 0 }, // 32X128- {0b11000, 0b00000} - { 0, 24 }, // 128X32- {0b00000, 0b11000} -#else -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 15, 15 }, // 2X2 - {0b1111, 0b1111} - { 15, 15 }, // 2X4 - {0b1111, 0b1111} - { 15, 15 }, // 4X2 - {0b1111, 0b1111} -#endif - { 15, 15 }, // 4X4 - {0b1111, 0b1111} - { 15, 14 }, // 4X8 - {0b1111, 0b1110} - { 14, 15 }, // 8X4 - {0b1110, 0b1111} - { 14, 14 }, // 8X8 - {0b1110, 0b1110} - { 14, 12 }, // 8X16 - {0b1110, 0b1100} - { 12, 14 }, // 16X8 - {0b1100, 0b1110} - { 12, 12 }, // 16X16 - {0b1100, 0b1100} - { 12, 8 }, // 16X32 - {0b1100, 0b1000} - { 8, 12 }, // 32X16 - {0b1000, 0b1100} - { 8, 8 }, // 32X32 - {0b1000, 0b1000} - { 8, 0 
}, // 32X64 - {0b1000, 0b0000} - { 0, 8 }, // 64X32 - {0b0000, 0b1000} - { 0, 0 }, // 64X64 - {0b0000, 0b0000} - - { 15, 12 }, // 4X16 - {0b1111, 0b1100} - { 12, 15 }, // 16X4 - {0b1100, 0b1111} - { 8, 14 }, // 8X32 - {0b1110, 0b1000} - { 14, 8 }, // 32X8 - {0b1000, 0b1110} - { 12, 0 }, // 16X64- {0b1100, 0b0000} - { 0, 12 }, // 64X16- {0b0000, 0b1100} -#endif // CONFIG_EXT_PARTITION }; /* clang-format on */ -#if CONFIG_KF_CTX static const int intra_mode_context[INTRA_MODES] = { - 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, -#if CONFIG_SMOOTH_HV - 1, 2, -#endif - 0, -}; -#endif - -#if CONFIG_SUPERTX -static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = { -// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 -// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 -#if CONFIG_CHROMA_2X2 - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, -#endif - { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, - { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, - { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, -#if CONFIG_TX64X64 - { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } }, -#endif // CONFIG_TX64X64 -}; - -#if CONFIG_EXT_PARTITION_TYPES -static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = { - -1, 0, 0, 1, 0, 0, 0, 0, 0, 0 + 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0, }; -#else -static const int partition_supertx_context_lookup[PARTITION_TYPES] = { -1, 0, 0, - 1 }; -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_SUPERTX - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -// NCOBMC_ADAPT_INTRPL only supports block size >= BLOCK_8X8 and <= BLOCK_64X64 -static const ADAPT_OVERLAP_BLOCK adapt_overlap_block_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_2X2 - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_2X4 - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_4X2 -#endif - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_4X4 - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_4X8 - ADAPT_OVERLAP_BLOCK_INVALID, // BLOCK_8X4 - - // the rest 
of the block sizes round to the largest squared block less than - // the given block size - ADAPT_OVERLAP_BLOCK_8X8, ADAPT_OVERLAP_BLOCK_8X8, ADAPT_OVERLAP_BLOCK_8X8, - ADAPT_OVERLAP_BLOCK_16X16, ADAPT_OVERLAP_BLOCK_16X16, - ADAPT_OVERLAP_BLOCK_16X16, ADAPT_OVERLAP_BLOCK_32X32, - ADAPT_OVERLAP_BLOCK_32X32, ADAPT_OVERLAP_BLOCK_32X32, - ADAPT_OVERLAP_BLOCK_64X64, -#if CONFIG_EXT_PARTITION - ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID, - ADAPT_OVERLAP_BLOCK_INVALID, -#endif // CONFIG_EXT_PARTITION - ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID, - ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID, - ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID, -#if CONFIG_EXT_PARTITION - ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID -#endif // CONFIG_EXT_PARTITION +// Note: this is also used in unit tests. So whenever one changes the table, +// the unit tests need to be changed accordingly. +static const int quant_dist_weight[4][2] = { + { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } }; - -static const BLOCK_SIZE bsize_2_sqr_bsize[BLOCK_SIZES] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, -#endif - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, - BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, -#if CONFIG_EXT_PARTITION - BLOCK_64X64, BLOCK_64X64, -#endif +static const int quant_dist_lookup_table[2][4][2] = { + { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } }, + { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } }, }; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - -#if CONFIG_ADAPT_SCAN -#define EOB_THRESHOLD_NUM 2 -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c index 5476f59a6..d57f44f8b 100644 --- a/third_party/aom/av1/common/convolve.c +++ b/third_party/aom/av1/common/convolve.c @@ -12,76 +12,60 @@ #include #include -#include 
"./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/common/blockd.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/onyxc_int.h" +#include "av1/common/resize.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" -#define MAX_BLOCK_WIDTH (MAX_SB_SIZE) -#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE) -#define MAX_STEP (32) - -void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = subpel_x_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params, x_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1); - else - dst[x] = sum; - - x_q4 += x_step_q4; +void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = 
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_qn += x_step_qn; } src += src_stride; dst += dst_stride; } } -void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_qn, int x_step_qn, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1); - else - dst[x] = sum; - +void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn, int bd) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_qn += x_step_qn; } src += src_stride; @@ -89,417 +73,358 @@ void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst, } } -void 
av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= src_stride * (filter_size / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = subpel_y_q4; - for (y = 0; y < h; ++y) { - const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params, y_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1); - else - dst[y * dst_stride] = sum; - - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -void av1_convolve_vert_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= src_stride * (filter_size / 2 - 1); - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y) { - const uint8_t *const src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1); - 
else - dst[y * dst_stride] = sum; - - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst, +void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (conv_params->do_average == 0) { - int r; - for (r = 0; r < h; ++r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } - } else { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1)); + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } - src += src_stride; - dst += dst_stride; + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } -} -void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round 
== CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (conv_params->do_average == 0) - aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - NULL, -1, w, h); - else - aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - } else { - av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } } } -void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (conv_params->do_average == 0) - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - else - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - } else { - 
av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); - } -} +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; -void av1_convolve_horiz_facade_scale(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_qn, int x_step_qn, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_x_qn >> SCALE_EXTRA_BITS); - if (conv_params->do_average == 0) - aom_convolve8_horiz_scale(src, src_stride, dst, dst_stride, filter_x, - subpel_x_qn, x_step_qn, NULL, 0, -1, w, h); - else - aom_convolve8_avg_horiz_scale(src, src_stride, dst, dst_stride, filter_x, - subpel_x_qn, x_step_qn, NULL, 0, -1, w, h); - } else { - av1_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_qn, x_step_qn, - conv_params); - } -} + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); -void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); 
- if (conv_params->do_average == 0) { - aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y, - y_step_q4, w, h); - } else { - aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); } - } else { - av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_y_q4, y_step_q4, conv_params); } } -void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); - if (conv_params->do_average == 0) { - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, filter_y, - y_step_q4, w, h); - } else { - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h); - } - } else { - av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_y_q4, y_step_q4, conv_params); - } -} +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; 
+ (void)subpel_y_q4; + (void)conv_params; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); -void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_y_qn >> SCALE_EXTRA_BITS); - if (conv_params->do_average == 0) { - aom_convolve8_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1, - filter_y, subpel_y_qn, y_step_qn, w, h); - } else { - aom_convolve8_avg_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, - -1, filter_y, subpel_y_qn, y_step_qn, w, h); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); } - } else { - av1_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_qn, y_step_qn, conv_params); } } -#if CONFIG_CONVOLVE_ROUND -void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, int bits) { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[r * dst_stride + c] = - clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], bits)); - } +void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + 
InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + + for (int y = 0; y < h; ++y) { + memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, - int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int x, y, k; - uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; +void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) { + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } + assert(0 <= sum && sum < 
(1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter - uint8_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { dst[y * dst_stride + x] = res; + } } } } -void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params) { - int x, y, k; - uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> 
SCALE_SUBPEL_BITS) + - filter_params_y->taps; - int im_stride = w; +void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - src_horiz += src_stride; - } + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_x; + (void)subpel_x_q4; // vertical filter - const uint8_t *src_vert = im_block + fo_vert * im_stride; - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint8_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx 
< SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { dst[y * dst_stride + x] = res; + } } - src_vert++; } } -#else - -/* When convolve-round is enabled and compound-round is disabled, we use a - high-precision convolve filter. - Note: For notes on hardware implementations, including the required - bit widths for various intermediate values, see the comments above - av1_warp_affine_c. 
-*/ -void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, - int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int x, y, k; - int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; +void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_q4; // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if 
(conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst[y * dst_stride + x] = res; } - assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); - im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } +} - // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); +void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; - } - assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + 
(void)subpel_y_q4; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { dst[y * dst_stride + x] = res; + } } } } -void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { - int x, y, k; - int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -507,245 +432,255 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { + for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; 
const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if 
(conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } } src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND + +static void convolve_2d_scale_wrapper( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); + } + av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, + y_step_qn, conv_params); +} void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params) { + int scaled, ConvolveParams *conv_params, + const struct scale_factors *sf) { (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x, - 
&filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * - (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; - int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; - CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; - int tr_dst_stride = MAX_SB_SIZE; - int fo_vert = filter_params_y.taps / 2 - 1; - int fo_horiz = filter_params_x.taps / 2 - 1; - - transpose_uint8(tr_src, tr_src_stride, - src - fo_vert * src_stride - fo_horiz, src_stride, - w + filter_params_x.taps - 1, h + filter_params_y.taps - 1); - transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, - conv_params->dst_stride, w, h); - - // horizontal and vertical parameters are swapped because of the transpose - if (scaled) - av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params); - else - av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - subpel_x_q4, conv_params); - transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, - tr_dst_stride, h, w); - } else { - if (scaled) - av1_convolve_2d_scale(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params); - else - av1_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); - } + av1_get_convolve_filter_params(interp_filters, &filter_params_x, + &filter_params_y, w, h); + + if (scaled) + convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, + &filter_params_x, &filter_params_y, subpel_x_q4, + x_step_q4, subpel_y_q4, y_step_q4, conv_params); + else + sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 
0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, &filter_params_x, + &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); } -#if CONFIG_HIGHBITDEPTH -void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, - uint8_t *dst8, int dst_stride, int w, int h, - int bits, int bd) { - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[r * dst_stride + c] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(src[r * src_stride + c], bits), bd); - } +void av1_highbd_convolve_2d_copy_sr_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + (void)bd; + + for (int y = 0; y < h; ++y) { + memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int x, y, k; - uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; +void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + 
(void)filter_params_y; + (void)subpel_y_q4; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); // horizontal filter - const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } - im_block[y * im_stride + x] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd); + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } } +} +void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * 
im_stride + x]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); } } } -void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params, int bd) { - int x, y, k; - uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; +void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - (void)bd; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx 
< SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); + ROUND_POWER_OF_TWO(sum, conv_params->round_0); } - src_horiz += src_stride; } // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint16_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += 
res; - else - dst[y * dst_stride + x] = res; + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } - src_vert++; } } -#else - -void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { int x, y, k; - int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; @@ -760,439 +695,367 @@ void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); (void)bd; im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = 
bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 1 << offset_bits; + int32_t sum = 1 << offset_bits; for (k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * 
FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_y; + (void)subpel_y_q4; + assert(bits >= 0); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_x; + (void)subpel_x_q4; + assert(bits >= 0); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, 
subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, + int w, int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + assert(bits >= 0); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + 
x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst[y * dst_stride + x] = res; + } } } } void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { - int x, y, k; - int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { + for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + 
(int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), 
bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } } src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, + uint8_t *dst8, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - int bd) { + const struct scale_factors *sf, int bd) { (void)x_step_q4; (void)y_step_q4; - (void)dst; (void)dst_stride; - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x, - &filter_params_y); - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - if (filter_params_y.taps < filter_params_x.taps) { - uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * - (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; - int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; - CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; - int tr_dst_stride = MAX_SB_SIZE; - int fo_vert = filter_params_y.taps / 2 - 1; - int fo_horiz = filter_params_x.taps / 2 - 1; - - transpose_uint16( - tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz, - src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1); - transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, - conv_params->dst_stride, w, h); - - // horizontal and vertical parameters are swapped because of the transpose - if (scaled) - av1_highbd_convolve_2d_scale( - tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, - tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); - else - 
av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - subpel_x_q4, conv_params, bd); - transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, - tr_dst_stride, h, w); - } else { - if (scaled) - av1_highbd_convolve_2d_scale( - src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, - &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params, bd); - else - av1_highbd_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, - conv_params, bd); - } -} -#endif // CONFIG_HIGHBITDEPTH - -#endif // CONFIG_CONVOLVE_ROUND - -typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_q4, int step_q4, - ConvolveParams *conv_params); - -static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params, - ConvolveFunc convolve_horiz, - ConvolveFunc convolve_vert) { - int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0; - int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0; - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - assert(conv_params->round == CONVOLVE_OPT_ROUND); - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_q4 <= MAX_STEP); - assert(x_step_q4 <= MAX_STEP); - - if (ignore_horiz && ignore_vert) { - convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params); - } else if (ignore_vert) { - assert(filter_params_x.taps <= MAX_FILTER_TAP); - 
convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x, - subpel_x_q4, x_step_q4, conv_params); - } else if (ignore_horiz) { - assert(filter_params_y.taps <= MAX_FILTER_TAP); - convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y, - subpel_y_q4, y_step_q4, conv_params); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - // we do filter with fewer taps first to reduce hardware implementation - // complexity - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_q4, - y_step_q4, &temp_conv_params); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride, - w, h, filter_params_x, subpel_x_q4, x_step_q4, - conv_params); - } else -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = 
CONVOLVE_OPT_ROUND; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp, - temp_stride, w, intermediate_height, filter_params_x, - subpel_x_q4, x_step_q4, &temp_conv_params); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, - dst, dst_stride, w, h, filter_params_y, subpel_y_q4, - y_step_q4, conv_params); - } - } -} + av1_get_convolve_filter_params(interp_filters, &filter_params_x, + &filter_params_y, w, h); -static void convolve_scale_helper(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilters interp_filters, - const int subpel_x_qn, int x_step_qn, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params, - ConvolveFunc convolve_horiz, - ConvolveFunc convolve_vert) { - int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0; - int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0; - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - assert(conv_params->round == CONVOLVE_OPT_ROUND); - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - - if (ignore_horiz && ignore_vert) { - convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params); - } else if (ignore_vert) { - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x, - subpel_x_qn, x_step_qn, conv_params); - } else if (ignore_horiz) { - 
assert(filter_params_y.taps <= MAX_FILTER_TAP); - convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y, - subpel_y_qn, y_step_qn, conv_params); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - // we do filter with fewer taps first to reduce hardware implementation - // complexity - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_qn, - y_step_qn, &temp_conv_params); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride, - w, h, filter_params_x, subpel_x_qn, x_step_qn, - conv_params); - } else { -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - 
filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp, - temp_stride, w, intermediate_height, filter_params_x, - subpel_x_qn, x_step_qn, &temp_conv_params); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, - dst, dst_stride, w, h, filter_params_y, subpel_y_qn, - y_step_qn, conv_params); -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER + if (scaled) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); } -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } -} + av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, + &filter_params_x, &filter_params_y, + subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, + conv_params, bd); + } else { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); -void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, - int y_step_q4, ConvolveParams *conv_params) { - convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params, - av1_convolve_horiz_facade, av1_convolve_vert_facade); + sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != + 0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, &filter_params_x, + &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); + } } -void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, - int y_step_q4, ConvolveParams *conv_params) { - convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_q4, 
x_step_q4, subpel_y_q4, y_step_q4, conv_params, - av1_convolve_horiz_facade_c, av1_convolve_vert_facade_c); +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). +// --Largest block size is 128x128 pixels. +// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the +// original frame (in 1/16th pixel units). +// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((128 - 1) * 32 + 15) >> 4 + 8 = 263. +#define WIENER_MAX_EXT_SIZE 263 + +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; } -void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_qn, - int x_step_qn, const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - convolve_scale_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, - conv_params, av1_convolve_horiz_facade_scale, - av1_convolve_vert_facade_scale); +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; } -void av1_lowbd_convolve_init_c(void) { - // A placeholder for SIMD initialization - return; +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + 
return sum; } -void av1_highbd_convolve_init_c(void) { - // A placeholder for SIMD initialization - return; +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); } -void av1_convolve_init(AV1_COMMON *cm) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - av1_highbd_convolve_init(); - else - av1_lowbd_convolve_init(); -#else - (void)cm; - av1_lowbd_convolve_init(); -#endif - return; +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); } -#if CONFIG_HIGHBITDEPTH -void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, int avg, - int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = subpel_x_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params, x_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - if (avg) - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - else - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 
>> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); x_q4 += x_step_q4; } src += src_stride; @@ -1200,66 +1063,25 @@ void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, } } -void av1_highbd_convolve_horiz_scale(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_qn, int x_step_qn, - int avg, int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - if (avg) - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - else - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - x_qn += x_step_qn; - } - src += src_stride; - dst += dst_stride; - } -} - -void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, int avg, - int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= src_stride * (filter_size / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = subpel_y_q4; - for (y = 0; y < h; ++y) { - const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * 
src_stride]; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params, y_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - if (avg) { - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - } else { - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - } +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); y_q4 += y_step_q4; } ++src; @@ -1267,325 +1089,111 @@ void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, } } -void av1_highbd_convolve_vert_scale(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - int avg, int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= src_stride * (filter_size / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y) { - const uint16_t *const src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t 
*y_filter = - av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - if (avg) { - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - } else { - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - } - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void highbd_convolve_copy(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - int avg, int bd) { - if (avg == 0) { - int r; - for (r = 0; r < h; ++r) { - memcpy(dst, src, w * sizeof(*src)); - src += src_stride; - dst += dst_stride; - } - } else { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[c] + src[c], 1), bd); - } - src += src_stride; - dst += dst_stride; - } - } +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 
1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); } -void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - int avg, int bd) { +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (avg == 0) - aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h, bd); - else - aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, NULL, -1, w, h, bd); - } else { - av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - } -} - -void av1_highbd_convolve_horiz_facade_scale( - const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, - int h, const InterpFilterParams filter_params, const int subpel_x_qn, - int x_step_qn, int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS - // as in the function above. 
- av1_highbd_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_qn, x_step_qn, avg, - bd); -} - -void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); - if (avg == 0) { - aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h, bd); - } else { - aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL, - -1, filter_y, y_step_q4, w, h, bd); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 1); + x_q4 += x_step_q4; } - } else { - av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); + src += src_stride; + dst += dst_stride; } } -void av1_highbd_convolve_vert_facade_scale( - const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, - int h, const InterpFilterParams filter_params, const int subpel_y_qn, - int y_step_qn, int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS - // as in the function above. 
- av1_highbd_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_qn, y_step_qn, avg, - bd); -} - -void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int ref_idx, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0; - int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0; - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_q4 <= MAX_STEP); - assert(x_step_q4 <= MAX_STEP); - - if (ignore_horiz && ignore_vert) { - highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd); - return; - } - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (ignore_vert) { - av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_q4, x_step_q4, - ref_idx, bd); - } else if (ignore_horiz) { - av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h, - filter_params_y, subpel_y_q4, y_step_q4, - ref_idx, bd); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. 
The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade(src8 - (filter_size / 2 - 1), src_stride, - temp8, temp_stride, intermediate_width, h, - filter_params_y, subpel_y_q4, y_step_q4, - 0, bd); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_horiz_facade( - temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_q4, x_step_q4, ref_idx, bd); - } else -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - filter_size = filter_params_y.taps; - - intermediate_height = - (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - av1_highbd_convolve_horiz_facade( - src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8, - temp_stride, w, intermediate_height, filter_params_x, subpel_x_q4, - x_step_q4, 0, bd); - - filter_size = filter_params_y.taps; - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade( - temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8, - dst_stride, w, h, filter_params_y, subpel_y_q4, y_step_q4, ref_idx, - bd); + src -= 
src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; } + ++src; + ++dst; } } -void av1_highbd_convolve_scale(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_qn, int x_step_qn, - const int subpel_y_qn, int y_step_qn, - int ref_idx, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0; - int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0; - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - - if (ignore_horiz && ignore_vert) { - highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd); - return; - } - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (ignore_vert) { - av1_highbd_convolve_horiz_facade_scale(src8, src_stride, dst8, dst_stride, - w, h, filter_params_x, subpel_x_qn, - x_step_qn, ref_idx, bd); - } else if (ignore_horiz) { - av1_highbd_convolve_vert_facade_scale(src8, src_stride, dst8, dst_stride, w, - h, filter_params_y, subpel_y_qn, - y_step_qn, ref_idx, bd); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. 
The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade_scale( - src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_qn, y_step_qn, 0, - bd); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_horiz_facade_scale( - temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_qn, x_step_qn, ref_idx, bd); - } else { -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - av1_highbd_convolve_horiz_facade_scale( - src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8, - temp_stride, w, intermediate_height, filter_params_x, subpel_x_qn, - x_step_qn, 0, bd); - - filter_size = filter_params_y.taps; - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade_scale( - temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8, - dst_stride, w, h, filter_params_y, subpel_y_qn, 
y_step_qn, ref_idx, - bd); -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h index c43f649e0..1b2c2d0d5 100644 --- a/third_party/aom/av1/common/convolve.h +++ b/third_party/aom/av1/common/convolve.h @@ -17,140 +17,119 @@ extern "C" { #endif -typedef enum CONVOLVE_OPT { - // indicate the results in dst buf is rounded by FILTER_BITS or not - CONVOLVE_OPT_ROUND, - CONVOLVE_OPT_NO_ROUND, -} CONVOLVE_OPT; - -typedef int32_t CONV_BUF_TYPE; - +typedef uint16_t CONV_BUF_TYPE; typedef struct ConvolveParams { int ref; int do_average; - CONVOLVE_OPT round; CONV_BUF_TYPE *dst; int dst_stride; int round_0; 
int round_1; int plane; - int do_post_rounding; + int is_compound; + int use_jnt_comp_avg; + int fwd_offset; + int bck_offset; } ConvolveParams; -static INLINE ConvolveParams get_conv_params(int ref, int do_average, - int plane) { - ConvolveParams conv_params; - conv_params.ref = ref; - conv_params.do_average = do_average; - conv_params.round = CONVOLVE_OPT_ROUND; - conv_params.plane = plane; - conv_params.do_post_rounding = 0; - return conv_params; -} - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER -static INLINE void av1_convolve_filter_params_fixup_1212( - const InterpFilterParams *params_x, InterpFilterParams *params_y) { - if (params_x->interp_filter == MULTITAP_SHARP && - params_y->interp_filter == MULTITAP_SHARP) { - // Avoid two directions both using 12-tap filter. - // This will reduce hardware implementation cost. - *params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP); - } -} -#endif - -static INLINE void av1_get_convolve_filter_params( - InterpFilters interp_filters, int avoid_1212, InterpFilterParams *params_x, - InterpFilterParams *params_y) { -#if CONFIG_DUAL_FILTER +#define ROUND0_BITS 3 +#define COMPOUND_ROUND1_BITS 7 +#define WIENER_ROUND0_BITS 3 + +#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) + +typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params); + +typedef void (*aom_highbd_convolve_fn_t)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd); + +static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters, + InterpFilterParams *params_x, + InterpFilterParams *params_y, + int w, int h) { 
InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); -#else - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 0); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); -#endif - - *params_x = av1_get_interp_filter_params(filter_x); - *params_y = av1_get_interp_filter_params(filter_y); - - if (avoid_1212) { -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - convolve_filter_params_fixup_1212(params_x, params_y); -#endif - } + *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w); + *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h); } struct AV1Common; -void av1_convolve_init(struct AV1Common *cm); +struct scale_factors; -#if CONFIG_CONVOLVE_ROUND void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params); + int scaled, ConvolveParams *conv_params, + const struct scale_factors *sf); static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, - int plane, int32_t *dst, - int dst_stride) { + int plane, + CONV_BUF_TYPE *dst, + int dst_stride, + int is_compound, int bd) { ConvolveParams conv_params; conv_params.ref = ref; conv_params.do_average = do_average; - conv_params.round = CONVOLVE_OPT_NO_ROUND; -#if CONFIG_COMPOUND_ROUND - conv_params.round_0 = FILTER_BITS; -#else - conv_params.round_0 = 5; -#endif - conv_params.round_1 = 0; + assert(IMPLIES(do_average, is_compound)); + conv_params.is_compound = is_compound; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = is_compound ? 
COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + if (!is_compound) conv_params.round_1 -= intbufrange - 16; + } + // TODO(yunqing): The following dst should only be valid while + // is_compound = 1; conv_params.dst = dst; conv_params.dst_stride = dst_stride; conv_params.plane = plane; - conv_params.do_post_rounding = 0; return conv_params; } -#if CONFIG_HIGHBITDEPTH +static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane, + int bd) { + return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd); +} + +static INLINE ConvolveParams get_conv_params_wiener(int bd) { + ConvolveParams conv_params; + (void)bd; + conv_params.ref = 0; + conv_params.do_average = 0; + conv_params.is_compound = 0; + conv_params.round_0 = WIENER_ROUND0_BITS; + conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + conv_params.round_1 -= intbufrange - 16; + } + conv_params.dst = NULL; + conv_params.dst_stride = 0; + conv_params.plane = 0; + return conv_params; +} + void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - int bd); -#endif -#endif // CONFIG_CONVOLVE_ROUND - -void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x, int xstep, const int subpel_y, int ystep, - ConvolveParams *conv_params); - -void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst, 
- int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x, int xstep, const int subpel_y, - int ystep, ConvolveParams *conv_params); - -void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x, - int xstep, const int subpel_y, int ystep, - ConvolveParams *conv_params); - -#if CONFIG_HIGHBITDEPTH -void av1_highbd_convolve(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x, - int xstep, const int subpel_y, int ystep, int avg, - int bd); - -void av1_highbd_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x, - int xstep, const int subpel_y, int ystep, - int avg, int bd); -#endif // CONFIG_HIGHBITDEPTH + const struct scale_factors *sf, int bd); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/daala_tx.c b/third_party/aom/av1/common/daala_tx.c deleted file mode 100644 index e5b2372e3..000000000 --- a/third_party/aom/av1/common/daala_tx.c +++ /dev/null @@ -1,4331 +0,0 @@ -#include "av1/common/daala_tx.h" -#include "av1/common/odintrin.h" - -/* clang-format off */ - -# define OD_DCT_RSHIFT(_a, _b) OD_UNBIASED_RSHIFT32(_a, _b) - -/* TODO: Daala DCT overflow checks need to be ported as a later test */ -# if defined(OD_DCT_CHECK_OVERFLOW) -# else -# define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx) -# endif - -#define OD_FDCT_2(p0, p1) \ - /* Embedded 2-point orthonormal Type-II fDCT. 
*/ \ - do { \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 13573, 16384, 100); \ - p0 -= (p1*13573 + 16384) >> 15; \ - /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(p0, 5793, 4096, 101); \ - p1 += (p0*5793 + 4096) >> 13; \ - /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 3393, 4096, 102); \ - p0 -= (p1*3393 + 4096) >> 13; \ - } \ - while (0) - -#define OD_IDCT_2(p0, p1) \ - /* Embedded 2-point orthonormal Type-II iDCT. */ \ - do { \ - /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - p0 += (p1*3393 + 4096) >> 13; \ - /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - p1 -= (p0*5793 + 4096) >> 13; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - p0 += (p1*13573 + 16384) >> 15; \ - } \ - while (0) - -#define OD_FDCT_2_ASYM(p0, p1, p1h) \ - /* Embedded 2-point asymmetric Type-II fDCT. */ \ - do { \ - p0 += p1h; \ - p1 = p0 - p1; \ - } \ - while (0) - -#define OD_IDCT_2_ASYM(p0, p1, p1h) \ - /* Embedded 2-point asymmetric Type-II iDCT. */ \ - do { \ - p1 = p0 - p1; \ - p1h = OD_DCT_RSHIFT(p1, 1); \ - p0 -= p1h; \ - } \ - while (0) - -#define OD_FDST_2(p0, p1) \ - /* Embedded 2-point orthonormal Type-IV fDST. */ \ - do { \ - /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 103); \ - p0 -= (p1*10947 + 8192) >> 14; \ - /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(p0, 473, 256, 104); \ - p1 += (p0*473 + 256) >> 9; \ - /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 105); \ - p0 -= (p1*10947 + 8192) >> 14; \ - } \ - while (0) - -#define OD_IDST_2(p0, p1) \ - /* Embedded 2-point orthonormal Type-IV iDST. 
*/ \ - do { \ - /* 10947/16384 ~= Tan[3*Pi/16]) ~= 0.668178637919299 */ \ - p0 += (p1*10947 + 8192) >> 14; \ - /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - p1 -= (p0*473 + 256) >> 9; \ - /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - p0 += (p1*10947 + 8192) >> 14; \ - } \ - while (0) - -#define OD_FDST_2_ASYM(p0, p1) \ - /* Embedded 2-point asymmetric Type-IV fDST. */ \ - do { \ - /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \ - p0 -= (p1*11507 + 8192) >> 14; \ - /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \ - OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \ - p1 += (p0*669 + 512) >> 10; \ - /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \ - OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \ - p0 -= (p1*4573 + 2048) >> 12; \ - } \ - while (0) - -#define OD_IDST_2_ASYM(p0, p1) \ - /* Embedded 2-point asymmetric Type-IV iDST. */ \ - do { \ - /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \ - p0 += (p1*4573 + 2048) >> 12; \ - /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \ - p1 -= (p0*669 + 512) >> 10; \ - /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \ - p0 += (p1*11507 + 8192) >> 14; \ - } \ - while (0) - -#define OD_FDCT_4(q0, q2, q1, q3) \ - /* Embedded 4-point orthonormal Type-II fDCT. */ \ - do { \ - int q2h; \ - int q3h; \ - q3 = q0 - q3; \ - q3h = OD_DCT_RSHIFT(q3, 1); \ - q0 -= q3h; \ - q2 += q1; \ - q2h = OD_DCT_RSHIFT(q2, 1); \ - q1 = q2h - q1; \ - OD_FDCT_2_ASYM(q0, q2, q2h); \ - OD_FDST_2_ASYM(q3, q1); \ - } \ - while (0) - -#define OD_IDCT_4(q0, q2, q1, q3) \ - /* Embedded 4-point orthonormal Type-II iDCT. 
*/ \ - do { \ - int q1h; \ - int q3h; \ - OD_IDST_2_ASYM(q3, q2); \ - OD_IDCT_2_ASYM(q0, q1, q1h); \ - q3h = OD_DCT_RSHIFT(q3, 1); \ - q0 += q3h; \ - q3 = q0 - q3; \ - q2 = q1h - q2; \ - q1 -= q2; \ - } \ - while (0) - -#define OD_FDCT_4_ASYM(q0, q2, q2h, q1, q3, q3h) \ - /* Embedded 4-point asymmetric Type-II fDCT. */ \ - do { \ - q0 += q3h; \ - q3 = q0 - q3; \ - q1 = q2h - q1; \ - q2 = q1 - q2; \ - OD_FDCT_2(q0, q2); \ - OD_FDST_2(q3, q1); \ - } \ - while (0) - -#define OD_IDCT_4_ASYM(q0, q2, q1, q1h, q3, q3h) \ - /* Embedded 4-point asymmetric Type-II iDCT. */ \ - do { \ - OD_IDST_2(q3, q2); \ - OD_IDCT_2(q0, q1); \ - q1 = q2 - q1; \ - q1h = OD_DCT_RSHIFT(q1, 1); \ - q2 = q1h - q2; \ - q3 = q0 - q3; \ - q3h = OD_DCT_RSHIFT(q3, 1); \ - q0 -= q3h; \ - } \ - while (0) - -#define OD_FDST_4(q0, q2, q1, q3) \ - /* Embedded 4-point orthonormal Type-IV fDST. */ \ - do { \ - int q0h; \ - int q1h; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(q1, 13573, 16384, 190); \ - q2 += (q1*13573 + 16384) >> 15; \ - /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(q2, 5793, 4096, 191); \ - q1 -= (q2*5793 + 4096) >> 13; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(q1, 3393, 4096, 192); \ - q2 += (q1*3393 + 4096) >> 13; \ - q0 += q2; \ - q0h = OD_DCT_RSHIFT(q0, 1); \ - q2 = q0h - q2; \ - q1 += q3; \ - q1h = OD_DCT_RSHIFT(q1, 1); \ - q3 -= q1h; \ - /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~= - 0.524455699240090 */ \ - OD_DCT_OVERFLOW_CHECK(q1, 537, 512, 193); \ - q2 -= (q1*537 + 512) >> 10; \ - /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \ - OD_DCT_OVERFLOW_CHECK(q2, 1609, 1024, 194); \ - q1 += (q2*1609 + 1024) >> 11; \ - /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~= - 0.223847182092655 */ \ - OD_DCT_OVERFLOW_CHECK(q1, 7335, 16384, 195); \ - q2 += (q1*7335 + 16384) >> 15; \ - /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~= - 
0.6215036383171189 */ \ - OD_DCT_OVERFLOW_CHECK(q0, 5091, 4096, 196); \ - q3 += (q0*5091 + 4096) >> 13; \ - /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \ - OD_DCT_OVERFLOW_CHECK(q3, 5681, 2048, 197); \ - q0 -= (q3*5681 + 2048) >> 12; \ - /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~= - 0.52204745462729 */ \ - OD_DCT_OVERFLOW_CHECK(q0, 4277, 4096, 198); \ - q3 += (q0*4277 + 4096) >> 13; \ - } \ - while (0) - -#define OD_IDST_4(q0, q2, q1, q3) \ - /* Embedded 4-point orthonormal Type-IV iDST. */ \ - do { \ - int q0h; \ - int q2h; \ - /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~= - 0.52204745462729 */ \ - q3 -= (q0*4277 + 4096) >> 13; \ - /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \ - q0 += (q3*5681 + 2048) >> 12; \ - /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~= - 0.6215036383171189 */ \ - q3 -= (q0*5091 + 4096) >> 13; \ - /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~= - 0.223847182092655 */ \ - q1 -= (q2*7335 + 16384) >> 15; \ - /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \ - q2 -= (q1*1609 + 1024) >> 11; \ - /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~= - 0.524455699240090 */ \ - q1 += (q2*537 + 512) >> 10; \ - q2h = OD_DCT_RSHIFT(q2, 1); \ - q3 += q2h; \ - q2 -= q3; \ - q0h = OD_DCT_RSHIFT(q0, 1); \ - q1 = q0h - q1; \ - q0 -= q1; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - q1 -= (q2*3393 + 4096) >> 13; \ - /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - q2 += (q1*5793 + 4096) >> 13; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - q1 -= (q2*13573 + 16384) >> 15; \ - } \ - while (0) - -#define OD_FDST_4_ASYM(t0, t0h, t2, t1, t3) \ - /* Embedded 4-point asymmetric Type-IV fDST. 
*/ \ - do { \ - /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 7489, 4096, 106); \ - t2 -= (t1*7489 + 4096) >> 13; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 11585, 8192, 107); \ - t1 += (t2*11585 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 108); \ - t2 += (t1*19195 + 16384) >> 15; \ - t3 += OD_DCT_RSHIFT(t2, 1); \ - t2 -= t3; \ - t1 = t0h - t1; \ - t0 -= t1; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 109); \ - t3 += (t0*6723 + 4096) >> 13; \ - /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 8035, 4096, 110); \ - t0 -= (t3*8035 + 4096) >> 13; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 111); \ - t3 += (t0*6723 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 112); \ - t2 += (t1*8757 + 8192) >> 14; \ - /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 6811, 4096, 113); \ - t1 -= (t2*6811 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 114); \ - t2 += (t1*8757 + 8192) >> 14; \ - } \ - while (0) - -#define OD_IDST_4_ASYM(t0, t0h, t2, t1, t3) \ - /* Embedded 4-point asymmetric Type-IV iDST. 
*/ \ - do { \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - t1 -= (t2*8757 + 8192) >> 14; \ - /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \ - t2 += (t1*6811 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - t1 -= (t2*8757 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - t3 -= (t0*6723 + 4096) >> 13; \ - /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - t0 += (t3*8035 + 4096) >> 13; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - t3 -= (t0*6723 + 4096) >> 13; \ - t0 += t2; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t2 = t0h - t2; \ - t1 += t3; \ - t3 -= OD_DCT_RSHIFT(t1, 1); \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - t1 -= (t2*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - t2 -= (t1*11585 + 8192) >> 14; \ - /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - t1 += (t2*7489 + 4096) >> 13; \ - } \ - while (0) - -#define OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \ - /* Embedded 8-point orthonormal Type-II fDCT. */ \ - do { \ - int r4h; \ - int r5h; \ - int r6h; \ - int r7h; \ - r7 = r0 - r7; \ - r7h = OD_DCT_RSHIFT(r7, 1); \ - r0 -= r7h; \ - r6 += r1; \ - r6h = OD_DCT_RSHIFT(r6, 1); \ - r1 = r6h - r1; \ - r5 = r2 - r5; \ - r5h = OD_DCT_RSHIFT(r5, 1); \ - r2 -= r5h; \ - r4 += r3; \ - r4h = OD_DCT_RSHIFT(r4, 1); \ - r3 = r4h - r3; \ - OD_FDCT_4_ASYM(r0, r4, r4h, r2, r6, r6h); \ - OD_FDST_4_ASYM(r7, r7h, r3, r5, r1); \ - } \ - while (0) - -#define OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \ - /* Embedded 8-point orthonormal Type-II iDCT. 
*/ \ - do { \ - int r1h; \ - int r3h; \ - int r5h; \ - int r7h; \ - OD_IDST_4_ASYM(r7, r7h, r5, r6, r4); \ - OD_IDCT_4_ASYM(r0, r2, r1, r1h, r3, r3h); \ - r0 += r7h; \ - r7 = r0 - r7; \ - r6 = r1h - r6; \ - r1 -= r6; \ - r5h = OD_DCT_RSHIFT(r5, 1); \ - r2 += r5h; \ - r5 = r2 - r5; \ - r4 = r3h - r4; \ - r3 -= r4; \ - } \ - while (0) - -#define OD_FDCT_8_ASYM(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \ - /* Embedded 8-point asymmetric Type-II fDCT. */ \ - do { \ - r0 += r7h; \ - r7 = r0 - r7; \ - r1 = r6h - r1; \ - r6 -= r1; \ - r2 += r5h; \ - r5 = r2 - r5; \ - r3 = r4h - r3; \ - r4 -= r3; \ - OD_FDCT_4(r0, r4, r2, r6); \ - OD_FDST_4(r7, r3, r5, r1); \ - } \ - while (0) - -#define OD_IDCT_8_ASYM(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \ - /* Embedded 8-point asymmetric Type-II iDCT. */ \ - do { \ - OD_IDST_4(r7, r5, r6, r4); \ - OD_IDCT_4(r0, r2, r1, r3); \ - r7 = r0 - r7; \ - r7h = OD_DCT_RSHIFT(r7, 1); \ - r0 -= r7h; \ - r1 += r6; \ - r1h = OD_DCT_RSHIFT(r1, 1); \ - r6 = r1h - r6; \ - r5 = r2 - r5; \ - r5h = OD_DCT_RSHIFT(r5, 1); \ - r2 -= r5h; \ - r3 += r4; \ - r3h = OD_DCT_RSHIFT(r3, 1); \ - r4 = r3h - r4; \ - } \ - while (0) - -#define OD_FDST_8(t0, t4, t2, t6, t1, t5, t3, t7) \ - /* Embedded 8-point orthonormal Type-IV fDST. 
*/ \ - do { \ - int t0h; \ - int t2h; \ - int t5h; \ - int t7h; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 115); \ - t6 -= (t1*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 11585, 8192, 116); \ - t1 += (t6*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 117); \ - t6 -= (t1*13573 + 16384) >> 15; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 21895, 16384, 118); \ - t5 -= (t2*21895 + 16384) >> 15; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 15137, 8192, 119); \ - t2 += (t5*15137 + 8192) >> 14; \ - /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 10947, 8192, 120); \ - t5 -= (t2*10947 + 8192) >> 14; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 121); \ - t4 -= (t3*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 3135, 4096, 122); \ - t3 += (t4*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 123); \ - t4 -= (t3*3259 + 8192) >> 14; \ - t7 += t1; \ - t7h = OD_DCT_RSHIFT(t7, 1); \ - t1 -= t7h; \ - t2 = t3 - t2; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - t3 -= t2h; \ - t0 -= t6; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t6 += t0h; \ - t5 = t4 - t5; \ - t5h = OD_DCT_RSHIFT(t5, 1); \ - t4 -= t5h; \ - t1 += t5h; \ - t5 = t1 - t5; \ - t4 += t0h; \ - t0 -= t4; \ - t6 -= t2h; \ - t2 += t6; \ - t3 -= t7h; \ - t7 += t3; \ - /* TODO: Can we move this into another operation */ \ - t7 = -t7; \ - /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 124); \ - t0 -= (t7*7425 + 4096) >> 13; \ - /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \ - OD_DCT_OVERFLOW_CHECK(t0, 
8153, 4096, 125); \ - t7 += (t0*8153 + 4096) >> 13; \ - /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 126); \ - t0 -= (t7*7425 + 4096) >> 13; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 127); \ - t6 -= (t1*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 1189, 2048, 128); \ - t1 += (t6*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 129); \ - t6 -= (t1*4861 + 16384) >> 15; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 130); \ - t2 -= (t5*2455 + 2048) >> 12; \ - /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 7225, 4096, 131); \ - t5 += (t2*7225 + 4096) >> 13; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 132); \ - t2 -= (t5*2455 + 2048) >> 12; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 133); \ - t4 -= (t3*11725 + 16384) >> 15; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 5197, 4096, 134); \ - t3 += (t4*5197 + 4096) >> 13; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 135); \ - t4 -= (t3*11725 + 16384) >> 15; \ - } \ - while (0) - -#define OD_IDST_8(t0, t4, t2, t6, t1, t5, t3, t7) \ - /* Embedded 8-point orthonormal Type-IV iDST. 
*/ \ - do { \ - int t0h; \ - int t2h; \ - int t5h_; \ - int t7h_; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \ - t1 += (t6*11725 + 16384) >> 15; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \ - t6 -= (t1*5197 + 4096) >> 13; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \ - t1 += (t6*11725 + 16384) >> 15; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \ - t2 += (t5*2455 + 2048) >> 12; \ - /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ - t5 -= (t2*7225 + 4096) >> 13; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \ - t2 += (t5*2455 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \ - t3 += (t4*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \ - t4 -= (t3*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \ - t3 += (t4*4861 + 16384) >> 15; \ - /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \ - t0 += (t7*7425 + 4096) >> 13; \ - /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \ - t7 -= (t0*8153 + 4096) >> 13; \ - /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \ - t0 += (t7*7425 + 4096) >> 13; \ - /* TODO: Can we move this into another operation */ \ - t7 = -t7; \ - t7 -= t6; \ - t7h_ = OD_DCT_RSHIFT(t7, 1); \ - t6 += t7h_; \ - t2 -= t3; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - t3 += t2h; \ - t0 += t1; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t1 -= t0h; \ - t5 = t4 - t5; \ - t5h_ = OD_DCT_RSHIFT(t5, 1); \ - t4 -= t5h_; \ - t1 += t5h_; \ - t5 = t1 - t5; \ - t3 -= t0h; \ - t0 += t3; \ - t6 += t2h; \ - t2 = t6 - t2; \ - t4 += t7h_; \ - t7 -= t4; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - t1 += (t6*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - t6 -= (t1*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - t1 += (t6*3259 + 8192) >> 14; \ - /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - t5 += (t2*10947 + 8192) >> 14; \ - /* 
15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t2 -= (t5*15137 + 8192) >> 14; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - t5 += (t2*21895 + 16384) >> 15; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - t3 += (t4*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - t4 -= (t3*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - t3 += (t4*13573 + 16384) >> 15; \ - } \ - while (0) - -/* Rewrite this so that t0h can be passed in. */ -#define OD_FDST_8_ASYM(t0, t4, t2, t6, t1, t5, t3, t7) \ - /* Embedded 8-point asymmetric Type-IV fDST. */ \ - do { \ - int t0h; \ - int t2h; \ - int t5h; \ - int t7h; \ - /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \ - OD_DCT_OVERFLOW_CHECK(t1, 1035, 1024, 199); \ - t6 += (t1*1035 + 1024) >> 11; \ - /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t6, 3675, 2048, 200); \ - t1 -= (t6*3675 + 2048) >> 12; \ - /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t1, 851, 4096, 201); \ - t6 -= (t1*851 + 4096) >> 13; \ - /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \ - OD_DCT_OVERFLOW_CHECK(t2, 4379, 4096, 202); \ - t5 += (t2*4379 + 4096) >> 13; \ - /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t5, 10217, 4096, 203); \ - t2 -= (t5*10217 + 4096) >> 13; \ - /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t2, 4379, 8192, 204); \ - t5 += (t2*4379 + 8192) >> 14; \ - /* 12905/16384 ~= (Sqrt[2] - Cos[3*Pi/32])/(2*Sin[3*Pi/32]) */ \ - OD_DCT_OVERFLOW_CHECK(t3, 12905, 8192, 205); \ - t4 += (t3*12905 + 8192) >> 14; \ - /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t4, 3363, 4096, 206); \ - t3 -= (t4*3363 + 4096) >> 13; \ - /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t3, 3525, 2048, 207); \ - t4 -= (t3*3525 + 2048) >> 12; \ - /* 5417/8192 ~= (Sqrt[2] - 
Sin[Pi/32])/(2*Cos[Pi/32]) */ \ - OD_DCT_OVERFLOW_CHECK(t0, 5417, 4096, 208); \ - t7 += (t0*5417 + 4096) >> 13; \ - /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t7, 5765, 2048, 209); \ - t0 -= (t7*5765 + 2048) >> 12; \ - /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \ - OD_DCT_OVERFLOW_CHECK(t0, 2507, 2048, 210); \ - t7 += (t0*2507 + 2048) >> 12; \ - t0 += t1; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t1 -= t0h; \ - t2 -= t3; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - t3 += t2h; \ - t5 -= t4; \ - t5h = OD_DCT_RSHIFT(t5, 1); \ - t4 += t5h; \ - t7 += t6; \ - t7h = OD_DCT_RSHIFT(t7, 1); \ - t6 = t7h - t6; \ - t4 = t7h - t4; \ - t7 -= t4; \ - t1 += t5h; \ - t5 = t1 - t5; \ - t6 += t2h; \ - t2 = t6 - t2; \ - t3 -= t0h; \ - t0 += t3; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 211); \ - t1 += (t6*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 3135, 4096, 212); \ - t6 -= (t1*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 213); \ - t1 += (t6*3259 + 8192) >> 14; \ - /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 214); \ - t5 += (t2*2737 + 2048) >> 12; \ - /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 473, 256, 215); \ - t2 -= (t5*473 + 256) >> 9; \ - /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 216); \ - t5 += (t2*2737 + 2048) >> 12; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 217); \ - t3 += (t4*3393 + 4096) >> 13; \ - /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 5793, 4096, 218); \ - t4 -= (t3*5793 + 4096) >> 13; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 219); \ - t3 += (t4*3393 + 4096) >> 13; \ - } \ - while 
(0) - -#define OD_IDST_8_ASYM(t0, t4, t2, t6, t1, t5, t3, t7) \ - /* Embedded 8-point asymmetric Type-IV iDST. */ \ - do { \ - int t0h; \ - int t2h; \ - int t5h__; \ - int t7h__; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - t6 -= (t1*3393 + 4096) >> 13; \ - /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - t1 += (t6*5793 + 4096) >> 13; \ - /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - t6 -= (t1*3393 + 4096) >> 13; \ - /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - t5 -= (t2*2737 + 2048) >> 12; \ - /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t2 += (t5*473 + 256) >> 9; \ - /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - t5 -= (t2*2737 + 2048) >> 12; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - t4 -= (t3*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - t3 += (t4*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - t4 -= (t3*3259 + 8192) >> 14; \ - t0 -= t6; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t6 += t0h; \ - t2 = t3 - t2; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - t3 -= t2h; \ - t5 = t4 - t5; \ - t5h__ = OD_DCT_RSHIFT(t5, 1); \ - t4 -= t5h__; \ - t7 += t1; \ - t7h__ = OD_DCT_RSHIFT(t7, 1); \ - t1 = t7h__ - t1; \ - t3 = t7h__ - t3; \ - t7 -= t3; \ - t1 -= t5h__; \ - t5 += t1; \ - t6 -= t2h; \ - t2 += t6; \ - t4 += t0h; \ - t0 -= t4; \ - /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \ - t7 -= (t0*2507 + 2048) >> 12; \ - /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \ - t0 += (t7*5765 + 2048) >> 12; \ - /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \ - t7 -= (t0*5417 + 4096) >> 13; \ - /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \ - t1 += (t6*3525 + 2048) >> 12; \ - /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \ - t6 += (t1*3363 + 4096) >> 13; \ - /* 12905/16384 ~= (1/Sqrt[2] - Cos[3*Pi/32]/1)/Sin[3*Pi/32] */ \ - t1 -= (t6*12905 + 8192) >> 14; \ - /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \ - t5 -= (t2*4379 + 8192) 
>> 14; \ - /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \ - t2 += (t5*10217 + 4096) >> 13; \ - /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \ - t5 -= (t2*4379 + 4096) >> 13; \ - /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \ - t3 += (t4*851 + 4096) >> 13; \ - /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \ - t4 += (t3*3675 + 2048) >> 12; \ - /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \ - t3 -= (t4*1035 + 1024) >> 11; \ - } \ - while (0) - -#define OD_FDCT_16(s0, s8, s4, sc, s2, sa, s6, se, \ - s1, s9, s5, sd, s3, sb, s7, sf) \ - /* Embedded 16-point orthonormal Type-II fDCT. */ \ - do { \ - int s8h; \ - int sah; \ - int sch; \ - int seh; \ - int sfh; \ - sf = s0 - sf; \ - sfh = OD_DCT_RSHIFT(sf, 1); \ - s0 -= sfh; \ - se += s1; \ - seh = OD_DCT_RSHIFT(se, 1); \ - s1 = seh - s1; \ - sd = s2 - sd; \ - s2 -= OD_DCT_RSHIFT(sd, 1); \ - sc += s3; \ - sch = OD_DCT_RSHIFT(sc, 1); \ - s3 = sch - s3; \ - sb = s4 - sb; \ - s4 -= OD_DCT_RSHIFT(sb, 1); \ - sa += s5; \ - sah = OD_DCT_RSHIFT(sa, 1); \ - s5 = sah - s5; \ - s9 = s6 - s9; \ - s6 -= OD_DCT_RSHIFT(s9, 1); \ - s8 += s7; \ - s8h = OD_DCT_RSHIFT(s8, 1); \ - s7 = s8h - s7; \ - OD_FDCT_8_ASYM(s0, s8, s8h, s4, sc, sch, s2, sa, sah, s6, se, seh); \ - OD_FDST_8_ASYM(sf, s7, sb, s3, sd, s5, s9, s1); \ - } \ - while (0) - -#define OD_IDCT_16(s0, s8, s4, sc, s2, sa, s6, se, \ - s1, s9, s5, sd, s3, sb, s7, sf) \ - /* Embedded 16-point orthonormal Type-II iDCT. 
*/ \ - do { \ - int s1h; \ - int s3h; \ - int s5h; \ - int s7h; \ - int sfh; \ - OD_IDST_8_ASYM(sf, sb, sd, s9, se, sa, sc, s8); \ - OD_IDCT_8_ASYM(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \ - sfh = OD_DCT_RSHIFT(sf, 1); \ - s0 += sfh; \ - sf = s0 - sf; \ - se = s1h - se; \ - s1 -= se; \ - s2 += OD_DCT_RSHIFT(sd, 1); \ - sd = s2 - sd; \ - sc = s3h - sc; \ - s3 -= sc; \ - s4 += OD_DCT_RSHIFT(sb, 1); \ - sb = s4 - sb; \ - sa = s5h - sa; \ - s5 -= sa; \ - s6 += OD_DCT_RSHIFT(s9, 1); \ - s9 = s6 - s9; \ - s8 = s7h - s8; \ - s7 -= s8; \ - } \ - while (0) - -#define OD_FDCT_16_ASYM(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \ - t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \ - /* Embedded 16-point asymmetric Type-II fDCT. */ \ - do { \ - t0 += tfh; \ - tf = t0 - tf; \ - t1 -= teh; \ - te += t1; \ - t2 += tdh; \ - td = t2 - td; \ - t3 -= tch; \ - tc += t3; \ - t4 += tbh; \ - tb = t4 - tb; \ - t5 -= tah; \ - ta += t5; \ - t6 += t9h; \ - t9 = t6 - t9; \ - t7 -= t8h; \ - t8 += t7; \ - OD_FDCT_8(t0, t8, t4, tc, t2, ta, t6, te); \ - OD_FDST_8(tf, t7, tb, t3, td, t5, t9, t1); \ - } \ - while (0) - -#define OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \ - t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \ - /* Embedded 16-point asymmetric Type-II iDCT. 
*/ \ - do { \ - OD_IDST_8(tf, tb, td, t9, te, ta, tc, t8); \ - OD_IDCT_8(t0, t4, t2, t6, t1, t5, t3, t7); \ - t1 -= te; \ - t1h = OD_DCT_RSHIFT(t1, 1); \ - te += t1h; \ - t9 = t6 - t9; \ - t9h = OD_DCT_RSHIFT(t9, 1); \ - t6 -= t9h; \ - t5 -= ta; \ - t5h = OD_DCT_RSHIFT(t5, 1); \ - ta += t5h; \ - td = t2 - td; \ - tdh = OD_DCT_RSHIFT(td, 1); \ - t2 -= tdh; \ - t3 -= tc; \ - t3h = OD_DCT_RSHIFT(t3, 1); \ - tc += t3h; \ - tb = t4 - tb; \ - tbh = OD_DCT_RSHIFT(tb, 1); \ - t4 -= tbh; \ - t7 -= t8; \ - t7h = OD_DCT_RSHIFT(t7, 1); \ - t8 += t7h; \ - tf = t0 - tf; \ - tfh = OD_DCT_RSHIFT(tf, 1); \ - t0 -= tfh; \ - } \ - while (0) - -#define OD_FDST_16(s0, s8, s4, sc, s2, sa, s6, se, \ - s1, s9, s5, sd, s3, sb, s7, sf) \ - /* Embedded 16-point orthonormal Type-IV fDST. */ \ - do { \ - int s0h; \ - int s2h; \ - int sdh; \ - int sfh; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 220); \ - s1 += (se*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(s1, 11585, 8192, 221); \ - se -= (s1*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 222); \ - s1 += (se*13573 + 16384) >> 15; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 223); \ - sd += (s2*21895 + 16384) >> 15; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(sd, 15137, 16384, 224); \ - s2 -= (sd*15137 + 8192) >> 14; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 225); \ - sd += (s2*21895 + 16384) >> 15; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 226); \ - sc += (s3*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - OD_DCT_OVERFLOW_CHECK(sc, 3135, 4096, 227); \ - s3 -= (sc*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] 
~= 0.198912367379658 */ \ - OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 228); \ - sc += (s3*3259 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 229); \ - sa += (s5*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(sa, 11585, 8192, 230); \ - s5 -= (sa*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 231); \ - sa += (s5*13573 + 16384) >> 15; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 232); \ - s6 += (s9*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - OD_DCT_OVERFLOW_CHECK(s6, 11585, 8192, 233); \ - s9 -= (s6*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 234); \ - s6 += (s9*13573 + 16384) >> 15; \ - sf += se; \ - sfh = OD_DCT_RSHIFT(sf, 1); \ - se = sfh - se; \ - s0 += s1; \ - s0h = OD_DCT_RSHIFT(s0, 1); \ - s1 = s0h - s1; \ - s2 = s3 - s2; \ - s2h = OD_DCT_RSHIFT(s2, 1); \ - s3 -= s2h; \ - sd -= sc; \ - sdh = OD_DCT_RSHIFT(sd, 1); \ - sc += sdh; \ - sa = s4 - sa; \ - s4 -= OD_DCT_RSHIFT(sa, 1); \ - s5 += sb; \ - sb = OD_DCT_RSHIFT(s5, 1) - sb; \ - s8 += s6; \ - s6 -= OD_DCT_RSHIFT(s8, 1); \ - s7 = s9 - s7; \ - s9 -= OD_DCT_RSHIFT(s7, 1); \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 235); \ - s4 += (sb*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - OD_DCT_OVERFLOW_CHECK(s4, 16069, 8192, 236); \ - sb -= (s4*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 237); \ - s4 += (sb*6723 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 238); \ - sa += (s5*8757 + 8192) >> 14; \ - /* 6811/8192 ~= 
Sin[5*Pi/16] ~= 0.831469612302545 */ \ - OD_DCT_OVERFLOW_CHECK(sa, 6811, 4096, 239); \ - s5 -= (sa*6811 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 240); \ - sa += (s5*8757 + 8192) >> 14; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 241); \ - s6 += (s9*2485 + 4096) >> 13; \ - /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - OD_DCT_OVERFLOW_CHECK(s6, 4551, 4096, 242); \ - s9 -= (s6*4551 + 4096) >> 13; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 243); \ - s6 += (s9*2485 + 4096) >> 13; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 244); \ - s7 += (s8*3227 + 16384) >> 15; \ - /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ - OD_DCT_OVERFLOW_CHECK(s7, 6393, 16384, 245); \ - s8 -= (s7*6393 + 16384) >> 15; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 246); \ - s7 += (s8*3227 + 16384) >> 15; \ - s1 -= s2h; \ - s2 += s1; \ - se += sdh; \ - sd = se - sd; \ - s3 += sfh; \ - sf -= s3; \ - sc = s0h - sc; \ - s0 -= sc; \ - sb += OD_DCT_RSHIFT(s8, 1); \ - s8 = sb - s8; \ - s4 += OD_DCT_RSHIFT(s7, 1); \ - s7 -= s4; \ - s6 += OD_DCT_RSHIFT(s5, 1); \ - s5 = s6 - s5; \ - s9 -= OD_DCT_RSHIFT(sa, 1); \ - sa += s9; \ - s8 += s0; \ - s0 -= OD_DCT_RSHIFT(s8, 1); \ - sf += s7; \ - s7 = OD_DCT_RSHIFT(sf, 1) - s7; \ - s1 -= s6; \ - s6 += OD_DCT_RSHIFT(s1, 1); \ - s9 += se; \ - se = OD_DCT_RSHIFT(s9, 1) - se; \ - s2 += sa; \ - sa = OD_DCT_RSHIFT(s2, 1) - sa; \ - s5 += sd; \ - sd -= OD_DCT_RSHIFT(s5, 1); \ - s4 = sc - s4; \ - sc -= OD_DCT_RSHIFT(s4, 1); \ - s3 -= sb; \ - sb += OD_DCT_RSHIFT(s3, 1); \ - /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(sf, 2799, 2048, 247); \ - s0 -= (sf*2799 + 2048) >> 12; \ - /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \ - 
OD_DCT_OVERFLOW_CHECK(s0, 2893, 1024, 248); \ - sf += (s0*2893 + 1024) >> 11; \ - /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(sf, 5397, 4096, 249); \ - s0 -= (sf*5397 + 4096) >> 13; \ - /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s1, 41, 32, 250); \ - se += (s1*41 + 32) >> 6; \ - /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(se, 2865, 1024, 251); \ - s1 -= (se*2865 + 1024) >> 11; \ - /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s1, 4641, 4096, 252); \ - se += (s1*4641 + 4096) >> 13; \ - /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s2, 2473, 2048, 253); \ - sd += (s2*2473 + 2048) >> 12; \ - /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(sd, 5619, 2048, 254); \ - s2 -= (sd*5619 + 2048) >> 12; \ - /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s2, 7839, 8192, 255); \ - sd += (s2*7839 + 8192) >> 14; \ - /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s3, 5747, 4096, 256); \ - sc -= (s3*5747 + 4096) >> 13; \ - /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] ~= */ \ - OD_DCT_OVERFLOW_CHECK(sc, 3903, 4096, 257); \ - s3 += (sc*3903 + 4096) >> 13; \ - /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s3, 5701, 4096, 258); \ - sc += (s3*5701 + 4096) >> 13; \ - /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s4, 4471, 4096, 259); \ - sb += (s4*4471 + 4096) >> 13; \ - /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(sb, 1309, 512, 260); \ - s4 -= (sb*1309 + 512) >> 10; \ - /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s4, 5067, 8192, 261); \ - sb += (s4*5067 + 8192) >> 14; \ - /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s5, 
2217, 2048, 262); \ - sa -= (s5*2217 + 2048) >> 12; \ - /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] ~= 0.72705107329128 */ \ - OD_DCT_OVERFLOW_CHECK(sa, 1489, 1024, 263); \ - s5 += (sa*1489 + 1024) >> 11; \ - /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s5, 75, 128, 264); \ - sa += (s5*75 + 128) >> 8; \ - /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s9, 2087, 2048, 265); \ - s6 -= (s9*2087 + 2048) >> 12; \ - /* 4653/4096 ~= Sqrt[2]*Sin[19*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s6, 4653, 2048, 266); \ - s9 += (s6*4653 + 2048) >> 12; \ - /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s9, 4545, 16384, 267); \ - s6 -= (s9*4545 + 16384) >> 15; \ - /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s8, 2053, 2048, 268); \ - s7 += (s8*2053 + 2048) >> 12; \ - /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s7, 1945, 1024, 269); \ - s8 -= (s7*1945 + 1024) >> 11; \ - /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \ - OD_DCT_OVERFLOW_CHECK(s8, 1651, 16384, 270); \ - s7 -= (s8*1651 + 16384) >> 15; \ - } \ - while (0) - -#define OD_IDST_16(s0, s8, s4, sc, s2, sa, s6, se, \ - s1, s9, s5, sd, s3, sb, s7, sf) \ - /* Embedded 16-point orthonormal Type-IV iDST. 
*/ \ - do { \ - int s0h; \ - int s4h; \ - int sbh; \ - int sfh; \ - /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \ - se += (s1*1651 + 16384) >> 15; \ - /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \ - s1 += (se*1945 + 1024) >> 11; \ - /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \ - se -= (s1*2053 + 2048) >> 12; \ - /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \ - s6 += (s9*4545 + 16384) >> 15; \ - /* 4653/32768 ~= Sqrt[2]*Sin[19*Pi/64] */ \ - s9 -= (s6*4653 + 2048) >> 12; \ - /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \ - s6 += (s9*2087 + 2048) >> 12; \ - /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \ - s5 -= (sa*75 + 128) >> 8; \ - /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] */ \ - sa -= (s5*1489 + 1024) >> 11; \ - /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \ - s5 += (sa*2217 + 2048) >> 12; \ - /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \ - sd -= (s2*5067 + 8192) >> 14; \ - /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \ - s2 += (sd*1309 + 512) >> 10; \ - /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \ - sd -= (s2*4471 + 4096) >> 13; \ - /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \ - s3 -= (sc*5701 + 4096) >> 13; \ - /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] */ \ - sc -= (s3*3903 + 4096) >> 13; \ - /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \ - s3 += (sc*5747 + 4096) >> 13; \ - /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \ - sb -= (s4*7839 + 8192) >> 14; \ - /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \ - s4 += (sb*5619 + 2048) >> 12; \ - /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \ - sb -= (s4*2473 + 2048) >> 12; \ - /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \ - s7 -= (s8*4641 + 4096) >> 13; \ - /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \ - s8 += (s7*2865 + 1024) >> 11; \ - /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \ - s7 -= (s8*41 + 32) >> 
6; \ - /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \ - s0 += (sf*5397 + 4096) >> 13; \ - /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \ - sf -= (s0*2893 + 1024) >> 11; \ - /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \ - s0 += (sf*2799 + 2048) >> 12; \ - sd -= OD_DCT_RSHIFT(sc, 1); \ - sc += sd; \ - s3 += OD_DCT_RSHIFT(s2, 1); \ - s2 = s3 - s2; \ - sb += OD_DCT_RSHIFT(sa, 1); \ - sa -= sb; \ - s5 = OD_DCT_RSHIFT(s4, 1) - s5; \ - s4 -= s5; \ - s7 = OD_DCT_RSHIFT(s9, 1) - s7; \ - s9 -= s7; \ - s6 -= OD_DCT_RSHIFT(s8, 1); \ - s8 += s6; \ - se = OD_DCT_RSHIFT(sf, 1) - se; \ - sf -= se; \ - s0 += OD_DCT_RSHIFT(s1, 1); \ - s1 -= s0; \ - s5 -= s9; \ - s9 += OD_DCT_RSHIFT(s5, 1); \ - sa = s6 - sa; \ - s6 -= OD_DCT_RSHIFT(sa, 1); \ - se += s2; \ - s2 -= OD_DCT_RSHIFT(se, 1); \ - s1 = sd - s1; \ - sd -= OD_DCT_RSHIFT(s1, 1); \ - s0 += s3; \ - s0h = OD_DCT_RSHIFT(s0, 1); \ - s3 = s0h - s3; \ - sf += sc; \ - sfh = OD_DCT_RSHIFT(sf, 1); \ - sc -= sfh; \ - sb = s7 - sb; \ - sbh = OD_DCT_RSHIFT(sb, 1); \ - s7 -= sbh; \ - s4 -= s8; \ - s4h = OD_DCT_RSHIFT(s4, 1); \ - s8 += s4h; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - se -= (s1*3227 + 16384) >> 15; \ - /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ - s1 += (se*6393 + 16384) >> 15; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - se -= (s1*3227 + 16384) >> 15; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - s6 -= (s9*2485 + 4096) >> 13; \ - /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - s9 += (s6*4551 + 4096) >> 13; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - s6 -= (s9*2485 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - s5 -= (sa*8757 + 8192) >> 14; \ - /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \ - sa += (s5*6811 + 4096) >> 13; \ - /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \ - s5 -= (sa*8757 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \ - 
s2 -= (sd*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - sd += (s2*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - s2 -= (sd*6723 + 4096) >> 13; \ - s9 += OD_DCT_RSHIFT(se, 1); \ - se = s9 - se; \ - s6 += OD_DCT_RSHIFT(s1, 1); \ - s1 -= s6; \ - sd = OD_DCT_RSHIFT(sa, 1) - sd; \ - sa -= sd; \ - s2 += OD_DCT_RSHIFT(s5, 1); \ - s5 = s2 - s5; \ - s3 -= sbh; \ - sb += s3; \ - sc += s4h; \ - s4 = sc - s4; \ - s8 = s0h - s8; \ - s0 -= s8; \ - s7 = sfh - s7; \ - sf -= s7; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s6 -= (s9*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - s9 += (s6*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s6 -= (s9*13573 + 16384) >> 15; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s5 -= (sa*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - sa += (s5*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s5 -= (sa*13573 + 16384) >> 15; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - s3 -= (sc*3259 + 8192) >> 14; \ - /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \ - sc += (s3*3135 + 4096) >> 13; \ - /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \ - s3 -= (sc*3259 + 8192) >> 14; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - sb -= (s4*21895 + 16384) >> 15; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - s4 += (sb*15137 + 8192) >> 14; \ - /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \ - sb -= (s4*21895 + 16384) >> 15; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s8 -= (s7*13573 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \ - s7 += (s8*11585 + 8192) >> 14; \ - /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \ - s8 -= (s7*13573 + 16384) >> 15; \ - } \ - while (0) - -/* TODO: rewrite this to match OD_FDST_16. 
*/ -#define OD_FDST_16_ASYM(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \ - t1, t9, t5, td, t3, tb, t7, t7h, tf) \ - /* Embedded 16-point asymmetric Type-IV fDST. */ \ - do { \ - int t2h; \ - int t3h; \ - int t6h; \ - int t8h; \ - int t9h; \ - int tch; \ - int tdh; \ - /* TODO: Can we move these into another operation */ \ - t8 = -t8; \ - t9 = -t9; \ - ta = -ta; \ - tb = -tb; \ - td = -td; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 136); \ - t1 -= (te*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 11585, 16384, 137); \ - te += (t1*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 138); \ - t1 -= (te*13573 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(td, 4161, 8192, 139); \ - t2 += (td*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 15137, 8192, 140); \ - td -= (t2*15137 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(td, 14341, 8192, 141); \ - t2 += (td*14341 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 14341, 8192, 142); \ - tc -= (t3*14341 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 15137, 8192, 143); \ - t3 += (tc*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 4161, 8192, 144); \ - tc -= (t3*4161 + 8192) >> 14; \ - te = t0h - te; \ - t0 -= te; \ - tf = OD_DCT_RSHIFT(t1, 1) - tf; \ - t1 -= tf; \ - /* TODO: Can we move this into another operation */ \ - tc = -tc; \ - t2 = OD_DCT_RSHIFT(tc, 1) - t2; \ - tc -= t2; \ - t3 = OD_DCT_RSHIFT(td, 1) - t3; \ - td = t3 - td; \ - /* 7489/8192 ~= 
Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 7489, 4096, 145); \ - t9 -= (t6*7489 + 4096) >> 13; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 146); \ - t6 += (t9*11585 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 147); \ - t9 += (t6*19195 + 16384) >> 15; \ - t8 += OD_DCT_RSHIFT(t9, 1); \ - t9 -= t8; \ - t6 = t7h - t6; \ - t7 -= t6; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 148); \ - t8 += (t7*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - OD_DCT_OVERFLOW_CHECK(t8, 16069, 8192, 149); \ - t7 -= (t8*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 150); \ - t8 += (t7*6723 + 4096) >> 13; \ - /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 151); \ - t9 += (t6*17515 + 16384) >> 15; \ - /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 13623, 8192, 152); \ - t6 -= (t9*13623 + 8192) >> 14; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 153); \ - t9 += (t6*17515 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 154); \ - t5 += (ta*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 155); \ - ta -= (t5*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 156); \ - t5 += (ta*13573 + 8192) >> 14; \ - tb += OD_DCT_RSHIFT(t5, 1); \ - t5 = tb - t5; \ - ta += t4h; \ - t4 -= ta; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 157); \ - ta += (t5*2485 + 4096) 
>> 13; \ - /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 18205, 16384, 158); \ - t5 -= (ta*18205 + 16384) >> 15; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 159); \ - ta += (t5*2485 + 4096) >> 13; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 160); \ - tb -= (t4*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 16069, 8192, 161); \ - t4 += (tb*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 162); \ - tb -= (t4*6723 + 4096) >> 13; \ - /* TODO: Can we move this into another operation */ \ - t5 = -t5; \ - tc -= tf; \ - tch = OD_DCT_RSHIFT(tc, 1); \ - tf += tch; \ - t3 += t0; \ - t3h = OD_DCT_RSHIFT(t3, 1); \ - t0 -= t3h; \ - td -= t1; \ - tdh = OD_DCT_RSHIFT(td, 1); \ - t1 += tdh; \ - t2 += te; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - te -= t2h; \ - t8 += t4; \ - t8h = OD_DCT_RSHIFT(t8, 1); \ - t4 = t8h - t4; \ - t7 = tb - t7; \ - t7h = OD_DCT_RSHIFT(t7, 1); \ - tb = t7h - tb; \ - t6 -= ta; \ - t6h = OD_DCT_RSHIFT(t6, 1); \ - ta += t6h; \ - t9 = t5 - t9; \ - t9h = OD_DCT_RSHIFT(t9, 1); \ - t5 -= t9h; \ - t0 -= t7h; \ - t7 += t0; \ - tf += t8h; \ - t8 -= tf; \ - te -= t6h; \ - t6 += te; \ - t1 += t9h; \ - t9 -= t1; \ - tb -= tch; \ - tc += tb; \ - t4 += t3h; \ - t3 -= t4; \ - ta -= tdh; \ - td += ta; \ - t5 = t2h - t5; \ - t2 -= t5; \ - /* TODO: Can we move these into another operation */ \ - t8 = -t8; \ - t9 = -t9; \ - ta = -ta; \ - tb = -tb; \ - tc = -tc; \ - td = -td; \ - tf = -tf; \ - /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \ - OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 163); \ - t0 -= (tf*7799 + 4096) >> 13; \ - /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \ - OD_DCT_OVERFLOW_CHECK(t0, 4091, 2048, 164); \ - tf += (t0*4091 + 2048) >> 12; \ - /* 7799/8192 ~= Tan[31*Pi/128] ~= 
0.952079146700925 */ \ - OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 165); \ - t0 -= (tf*7799 + 4096) >> 13; \ - /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \ - OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 166); \ - t1 += (te*2417 + 16384) >> 15; \ - /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 601, 2048, 167); \ - te -= (t1*601 + 2048) >> 12; \ - /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \ - OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 168); \ - t1 += (te*2417 + 16384) >> 15; \ - /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \ - OD_DCT_OVERFLOW_CHECK(t8, 14525, 16384, 169); \ - t7 -= (t8*14525 + 16384) >> 15; \ - /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 3035, 2048, 170); \ - t8 += (t7*3035 + 2048) >> 12; \ - /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \ - OD_DCT_OVERFLOW_CHECK(t8, 7263, 8192, 171); \ - t7 -= (t8*7263 + 8192) >> 14; \ - /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \ - OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 172); \ - t2 -= (td*6393 + 4096) >> 13; \ - /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 3973, 2048, 173); \ - td += (t2*3973 + 2048) >> 12; \ - /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \ - OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 174); \ - t2 -= (td*6393 + 4096) >> 13; \ - /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 175); \ - t5 -= (ta*9281 + 8192) >> 14; \ - /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 7027, 4096, 176); \ - ta += (t5*7027 + 4096) >> 13; \ - /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 177); \ - t5 -= (ta*9281 + 8192) >> 14; \ - /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 178); \ - t3 -= (tc*11539 + 8192) >> 14; \ - /* 7713/8192 ~= Sin[25*Pi/64] ~= 
0.941544065183021 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 7713, 4096, 179); \ - tc += (t3*7713 + 4096) >> 13; \ - /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 180); \ - t3 -= (tc*11539 + 8192) >> 14; \ - /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 181); \ - t4 -= (tb*10375 + 8192) >> 14; \ - /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 7405, 4096, 182); \ - tb += (t4*7405 + 4096) >> 13; \ - /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 183); \ - t4 -= (tb*10375 + 8192) >> 14; \ - /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 184); \ - t6 -= (t9*8247 + 8192) >> 14; \ - /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 1645, 1024, 185); \ - t9 += (t6*1645 + 1024) >> 11; \ - /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 186); \ - t6 -= (t9*8247 + 8192) >> 14; \ - } \ - while (0) - -#define OD_IDST_16_ASYM(t0, t0h, t8, t4, tc, t2, t2h, ta, t6, te, teh, \ - t1, t9, t5, td, t3, tb, t7, tf) \ - /* Embedded 16-point asymmetric Type-IV iDST. 
*/ \ - do { \ - int t1h_; \ - int t3h_; \ - int t4h; \ - int t6h; \ - int t9h_; \ - int tbh_; \ - int tch; \ - /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \ - t6 += (t9*8247 + 8192) >> 14; \ - /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \ - t9 -= (t6*1645 + 1024) >> 11; \ - /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \ - t6 += (t9*8247 + 8192) >> 14; \ - /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \ - t2 += (td*10375 + 8192) >> 14; \ - /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \ - td -= (t2*7405 + 4096) >> 13; \ - /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \ - t2 += (td*10375 + 8192) >> 14; \ - /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \ - tc += (t3*11539 + 8192) >> 14; \ - /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \ - t3 -= (tc*7713 + 4096) >> 13; \ - /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \ - tc += (t3*11539 + 8192) >> 14; \ - /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \ - ta += (t5*9281 + 8192) >> 14; \ - /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \ - t5 -= (ta*7027 + 4096) >> 13; \ - /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \ - ta += (t5*9281 + 8192) >> 14; \ - /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \ - t4 += (tb*6393 + 4096) >> 13; \ - /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \ - tb -= (t4*3973 + 2048) >> 12; \ - /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \ - t4 += (tb*6393 + 4096) >> 13; \ - /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \ - te += (t1*7263 + 8192) >> 14; \ - /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \ - t1 -= (te*3035 + 2048) >> 12; \ - /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \ - te += (t1*14525 + 16384) >> 15; \ - /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \ - t8 -= (t7*2417 + 16384) >> 15; \ - /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \ - t7 += (t8*601 + 2048) >> 12; \ - /* 2417/32768 
~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \ - t8 -= (t7*2417 + 16384) >> 15; \ - /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \ - t0 += (tf*7799 + 4096) >> 13; \ - /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \ - tf -= (t0*4091 + 2048) >> 12; \ - /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \ - t0 += (tf*7799 + 4096) >> 13; \ - /* TODO: Can we move these into another operation */ \ - t1 = -t1; \ - t3 = -t3; \ - t5 = -t5; \ - t9 = -t9; \ - tb = -tb; \ - td = -td; \ - tf = -tf; \ - t4 += ta; \ - t4h = OD_DCT_RSHIFT(t4, 1); \ - ta = t4h - ta; \ - tb -= t5; \ - tbh_ = OD_DCT_RSHIFT(tb, 1); \ - t5 += tbh_; \ - tc += t2; \ - tch = OD_DCT_RSHIFT(tc, 1); \ - t2 -= tch; \ - t3 -= td; \ - t3h_ = OD_DCT_RSHIFT(t3, 1); \ - td += t3h_; \ - t9 += t8; \ - t9h_ = OD_DCT_RSHIFT(t9, 1); \ - t8 -= t9h_; \ - t6 -= t7; \ - t6h = OD_DCT_RSHIFT(t6, 1); \ - t7 += t6h; \ - t1 += tf; \ - t1h_ = OD_DCT_RSHIFT(t1, 1); \ - tf -= t1h_; \ - te -= t0; \ - teh = OD_DCT_RSHIFT(te, 1); \ - t0 += teh; \ - ta += t9h_; \ - t9 = ta - t9; \ - t5 -= t6h; \ - t6 += t5; \ - td = teh - td; \ - te = td - te; \ - t2 = t1h_ - t2; \ - t1 -= t2; \ - t7 += t4h; \ - t4 -= t7; \ - t8 -= tbh_; \ - tb += t8; \ - t0 += tch; \ - tc -= t0; \ - tf -= t3h_; \ - t3 += tf; \ - /* TODO: Can we move this into another operation */ \ - ta = -ta; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - td += (t2*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - t2 -= (td*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \ - td += (t2*6723 + 4096) >> 13; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - t5 -= (ta*2485 + 4096) >> 13; \ - /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - ta += (t5*18205 + 16384) >> 15; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - t5 -= (ta*2485 + 4096) >> 13; \ - t2 += t5; \ - t2h = OD_DCT_RSHIFT(t2, 1); \ - t5 -= t2h; \ - ta = td - ta; \ - td -= OD_DCT_RSHIFT(ta, 1); \ - 
/* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - ta -= (t5*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - t5 += (ta*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - ta -= (t5*13573 + 8192) >> 14; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \ - t9 -= (t6*17515 + 16384) >> 15; \ - /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \ - t6 += (t9*13623 + 8192) >> 14; \ - /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \ - t9 -= (t6*17515 + 16384) >> 15; \ - /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \ - t1 -= (te*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \ - te += (t1*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \ - t1 -= (te*6723 + 4096) >> 13; \ - te += t6; \ - teh = OD_DCT_RSHIFT(te, 1); \ - t6 = teh - t6; \ - t9 += t1; \ - t1 -= OD_DCT_RSHIFT(t9, 1); \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - t9 -= (t6*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - t6 -= (t9*11585 + 8192) >> 14; \ - /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - t9 += (t6*7489 + 4096) >> 13; \ - tb = tc - tb; \ - tc = OD_DCT_RSHIFT(tb, 1) - tc; \ - t3 += t4; \ - t4 = OD_DCT_RSHIFT(t3, 1) - t4; \ - /* TODO: Can we move this into another operation */ \ - t3 = -t3; \ - t8 += tf; \ - tf = OD_DCT_RSHIFT(t8, 1) - tf; \ - t0 += t7; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t7 = t0h - t7; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - t3 += (tc*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - tc -= (t3*15137 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - t3 += (tc*14341 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - t4 -= (tb*14341 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 
0.923879532511287 */ \ - tb += (t4*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - t4 -= (tb*4161 + 8192) >> 14; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - t8 += (t7*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - t7 -= (t8*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - t8 += (t7*13573 + 8192) >> 14; \ - /* TODO: Can we move these into another operation */ \ - t1 = -t1; \ - t5 = -t5; \ - t9 = -t9; \ - tb = -tb; \ - td = -td; \ - } \ - while (0) - -#define OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \ - te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ - /* Embedded 32-point orthonormal Type-II fDCT. */ \ - do { \ - int tgh; \ - int thh; \ - int tih; \ - int tkh; \ - int tmh; \ - int tnh; \ - int toh; \ - int tqh; \ - int tsh; \ - int tuh; \ - int tvh; \ - tv = t0 - tv; \ - tvh = OD_DCT_RSHIFT(tv, 1); \ - t0 -= tvh; \ - tu += t1; \ - tuh = OD_DCT_RSHIFT(tu, 1); \ - t1 = tuh - t1; \ - tt = t2 - tt; \ - t2 -= OD_DCT_RSHIFT(tt, 1); \ - ts += t3; \ - tsh = OD_DCT_RSHIFT(ts, 1); \ - t3 = tsh - t3; \ - tr = t4 - tr; \ - t4 -= OD_DCT_RSHIFT(tr, 1); \ - tq += t5; \ - tqh = OD_DCT_RSHIFT(tq, 1); \ - t5 = tqh - t5; \ - tp = t6 - tp; \ - t6 -= OD_DCT_RSHIFT(tp, 1); \ - to += t7; \ - toh = OD_DCT_RSHIFT(to, 1); \ - t7 = toh - t7; \ - tn = t8 - tn; \ - tnh = OD_DCT_RSHIFT(tn, 1); \ - t8 -= tnh; \ - tm += t9; \ - tmh = OD_DCT_RSHIFT(tm, 1); \ - t9 = tmh - t9; \ - tl = ta - tl; \ - ta -= OD_DCT_RSHIFT(tl, 1); \ - tk += tb; \ - tkh = OD_DCT_RSHIFT(tk, 1); \ - tb = tkh - tb; \ - tj = tc - tj; \ - tc -= OD_DCT_RSHIFT(tj, 1); \ - ti += td; \ - tih = OD_DCT_RSHIFT(ti, 1); \ - td = tih - td; \ - th = te - th; \ - thh = OD_DCT_RSHIFT(th, 1); \ - te -= thh; \ - tg += tf; \ - tgh = OD_DCT_RSHIFT(tg, 1); \ - tf = tgh - tf; \ - OD_FDCT_16_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \ - t2, ti, 
tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \ - OD_FDST_16_ASYM(tv, tvh, tf, tn, tnh, t7, tr, tb, tj, t3, \ - tt, td, tl, t5, tp, t9, th, thh, t1); \ - } \ - while (0) - -#define OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \ - te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ - /* Embedded 32-point orthonormal Type-II iDCT. */ \ - do { \ - int t1h; \ - int t3h; \ - int t5h; \ - int t7h; \ - int t9h; \ - int tbh; \ - int tdh; \ - int tfh; \ - int thh; \ - int tth; \ - int tvh; \ - OD_IDST_16_ASYM(tv, tvh, tn, tr, tj, tt, tth, tl, tp, th, thh, \ - tu, tm, tq, ti, ts, tk, to, tg); \ - OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \ - t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \ - tu = t1h - tu; \ - t1 -= tu; \ - te += thh; \ - th = te - th; \ - tm = t9h - tm; \ - t9 -= tm; \ - t6 += OD_DCT_RSHIFT(tp, 1); \ - tp = t6 - tp; \ - tq = t5h - tq; \ - t5 -= tq; \ - ta += OD_DCT_RSHIFT(tl, 1); \ - tl = ta - tl; \ - ti = tdh - ti; \ - td -= ti; \ - t2 += tth; \ - tt = t2 - tt; \ - ts = t3h - ts; \ - t3 -= ts; \ - tc += OD_DCT_RSHIFT(tj, 1); \ - tj = tc - tj; \ - tk = tbh - tk; \ - tb -= tk; \ - t4 += OD_DCT_RSHIFT(tr, 1); \ - tr = t4 - tr; \ - to = t7h - to; \ - t7 -= to; \ - t8 += OD_DCT_RSHIFT(tn, 1); \ - tn = t8 - tn; \ - tg = tfh - tg; \ - tf -= tg; \ - t0 += tvh; \ - tv = t0 - tv; \ - } \ - while (0) - -#if CONFIG_TX64X64 -#define OD_FDCT_32_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \ - t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \ - t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \ - t7, tn, tnh, tf, tv, tvh) \ - /* Embedded 32-point asymmetric Type-II fDCT. 
*/ \ - do { \ - t0 += tvh; \ - tv = t0 - tv; \ - t1 = tuh - t1; \ - tu -= t1; \ - t2 += tth; \ - tt = t2 - tt; \ - t3 = tsh - t3; \ - ts -= t3; \ - t4 += trh; \ - tr = t4 - tr; \ - t5 = tqh - t5; \ - tq -= t5; \ - t6 += tph; \ - tp = t6 - tp; \ - t7 = toh - t7; \ - to -= t7; \ - t8 += tnh; \ - tn = t8 - tn; \ - t9 = tmh - t9; \ - tm -= t9; \ - ta += tlh; \ - tl = ta - tl; \ - tb = tkh - tb; \ - tk -= tb; \ - tc += tjh; \ - tj = tc - tj; \ - td = tih - td; \ - ti -= td; \ - te += thh; \ - th = te - th; \ - tf = tgh - tf; \ - tg -= tf; \ - OD_FDCT_16(t0, tg, t8, to, t4, tk, tc, ts, \ - t2, ti, ta, tq, t6, tm, te, tu); \ - OD_FDST_16(tv, tf, tn, t7, tr, tb, tj, t3, \ - tt, td, tl, t5, tp, t9, th, t1); \ - } \ - while (0) - -#define OD_IDCT_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \ - t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \ - td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \ - tf, tfh, tv, tvh) \ - /* Embedded 32-point asymmetric Type-II iDCT. 
*/ \ - do { \ - OD_IDST_16(tv, tn, tr, tj, tt, tl, tp, th, \ - tu, tm, tq, ti, ts, tk, to, tg); \ - OD_IDCT_16(t0, t8, t4, tc, t2, ta, t6, te, \ - t1, t9, t5, td, t3, tb, t7, tf); \ - tv = t0 - tv; \ - tvh = OD_DCT_RSHIFT(tv, 1); \ - t0 -= tvh; \ - t1 += tu; \ - t1h = OD_DCT_RSHIFT(t1, 1); \ - tu = t1h - tu; \ - tt = t2 - tt; \ - tth = OD_DCT_RSHIFT(tt, 1); \ - t2 -= tth; \ - t3 += ts; \ - t3h = OD_DCT_RSHIFT(t3, 1); \ - ts = t3h - ts; \ - tr = t4 - tr; \ - trh = OD_DCT_RSHIFT(tr, 1); \ - t4 -= trh; \ - t5 += tq; \ - t5h = OD_DCT_RSHIFT(t5, 1); \ - tq = t5h - tq; \ - tp = t6 - tp; \ - tph = OD_DCT_RSHIFT(tp, 1); \ - t6 -= tph; \ - t7 += to; \ - t7h = OD_DCT_RSHIFT(t7, 1); \ - to = t7h - to; \ - tn = t8 - tn; \ - tnh = OD_DCT_RSHIFT(tn, 1); \ - t8 -= tnh; \ - t9 += tm; \ - t9h = OD_DCT_RSHIFT(t9, 1); \ - tm = t9h - tm; \ - tl = ta - tl; \ - tlh = OD_DCT_RSHIFT(tl, 1); \ - ta -= tlh; \ - tb += tk; \ - tbh = OD_DCT_RSHIFT(tb, 1); \ - tk = tbh - tk; \ - tj = tc - tj; \ - tjh = OD_DCT_RSHIFT(tj, 1); \ - tc -= tjh; \ - td += ti; \ - tdh = OD_DCT_RSHIFT(td, 1); \ - ti = tdh - ti; \ - th = te - th; \ - thh = OD_DCT_RSHIFT(th, 1); \ - te -= thh; \ - tf += tg; \ - tfh = OD_DCT_RSHIFT(tf, 1); \ - tg = tfh - tg; \ - } \ - while (0) - -#define OD_FDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \ - tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ - /* Embedded 32-point asymmetric Type-IV fDST. 
*/ \ - do { \ - int t0h; \ - int t1h; \ - int t4h; \ - int t5h; \ - int tqh; \ - int trh; \ - int tuh; \ - int tvh; \ - \ - tu = -tu; \ - \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \ - t5 -= (tq*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \ - tq += (t5*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \ - t5 -= (tq*13573 + 8192) >> 14; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \ - tp += (t6*29957 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \ - t6 -= (tp*11585 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \ - tp -= (t6*19195 + 16384) >> 15; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \ - tu += (t1*29957 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \ - t1 -= (tu*11585 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \ - tu -= (t1*19195 + 16384) >> 15; \ - /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \ - tt += (t2*28681 + 16384) >> 15; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \ - t2 -= (tt*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \ - tt += (t2*4161 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - 
OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \ - t3 += (ts*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \ - ts -= (t3*15137 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \ - t3 += (ts*14341 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \ - t9 -= (tm*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \ - tm -= (t9*11585 + 8192) >> 14; \ - /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \ - t9 += (tm*7489 + 4096) >> 13; \ - /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ - OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \ - ta += (tl*3259 + 4096) >> 13; \ - /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \ - tl -= (ta*3135 + 8192) >> 14; \ - /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ - OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \ - ta += (tl*3259 + 4096) >> 13; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \ - tb += (tk*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \ - tk -= (tb*15137 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \ - tb += (tk*14341 + 8192) >> 14; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \ - th += (te*29957 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \ - te -= (th*11585 + 8192) >> 14; \ - /* 
-19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \ - th -= (te*19195 + 16384) >> 15; \ - /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \ - tj += (tc*28681 + 16384) >> 15; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \ - tc -= (tj*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \ - tj += (tc*4161 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \ - td += (ti*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \ - ti -= (td*15137 + 8192) >> 14; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \ - td += (ti*14341 + 8192) >> 14; \ - \ - t1 = -t1; \ - t2 = -t2; \ - t3 = -t3; \ - td = -td; \ - tg = -tg; \ - to = -to; \ - ts = -ts; \ - \ - tr -= OD_DCT_RSHIFT(t5, 1); \ - t5 += tr; \ - tq -= OD_DCT_RSHIFT(t4, 1); /* pass */ \ - t4 += tq; \ - t6 -= OD_DCT_RSHIFT(t7, 1); \ - t7 += t6; \ - to -= OD_DCT_RSHIFT(tp, 1); /* pass */ \ - tp += to; \ - t1 += OD_DCT_RSHIFT(t0, 1); /* pass */ \ - t0 -= t1; \ - tv -= OD_DCT_RSHIFT(tu, 1); \ - tu += tv; \ - t3 -= OD_DCT_RSHIFT(tt, 1); \ - tt += t3; \ - t2 += OD_DCT_RSHIFT(ts, 1); \ - ts -= t2; \ - t9 -= OD_DCT_RSHIFT(t8, 1); /* pass */ \ - t8 += t9; \ - tn += OD_DCT_RSHIFT(tm, 1); \ - tm -= tn; \ - tb += OD_DCT_RSHIFT(ta, 1); \ - ta -= tb; \ - tl -= OD_DCT_RSHIFT(tk, 1); \ - tk += tl; \ - te -= OD_DCT_RSHIFT(tf, 1); /* pass */ \ - tf += te; \ - tg -= OD_DCT_RSHIFT(th, 1); \ - th += tg; \ - tc -= OD_DCT_RSHIFT(ti, 1); \ - ti += tc; \ - td += OD_DCT_RSHIFT(tj, 1); \ - tj -= td; \ - \ - t4 = -t4; \ - \ - /* 6723/8192 ~= 
Tan[7*Pi/32] ~= 0.8206787908286602 */ \ - OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \ - t4 += (tr*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \ - OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \ - tr -= (t4*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ - OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \ - t4 += (tr*6723 + 4096) >> 13; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ - OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \ - t5 += (tq*17515 + 16384) >> 15; \ - /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \ - OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \ - tq -= (t5*13623 + 8192) >> 14; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ - OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \ - t5 += (tq*17515 + 16384) >> 15; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \ - t7 += (to*3227 + 16384) >> 15; \ - /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ - OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \ - to -= (t7*6393 + 16384) >> 15; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \ - t7 += (to*3227 + 16384) >> 15; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \ - t6 += (tp*2485 + 4096) >> 13; \ - /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \ - tp -= (t6*18205 + 16384) >> 15; \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \ - t6 += (tp*2485 + 4096) >> 13; \ - \ - t5 = -t5; \ - \ - tr += to; \ - trh = OD_DCT_RSHIFT(tr, 1); \ - to -= trh; \ - t4 += t7; \ - t4h = OD_DCT_RSHIFT(t4, 1); \ - t7 -= t4h; \ - t5 += tp; \ - t5h = OD_DCT_RSHIFT(t5, 1); \ - tp -= t5h; \ - tq += t6; \ - tqh = OD_DCT_RSHIFT(tq, 1); \ - t6 -= tqh; \ - t0 -= t3; \ - t0h = OD_DCT_RSHIFT(t0, 
1); \ - t3 += t0h; \ - tv -= ts; \ - tvh = OD_DCT_RSHIFT(tv, 1); \ - ts += tvh; \ - tu += tt; \ - tuh = OD_DCT_RSHIFT(tu, 1); \ - tt -= tuh; \ - t1 -= t2; \ - t1h = OD_DCT_RSHIFT(t1, 1); \ - t2 += t1h; \ - t8 += tb; \ - tb -= OD_DCT_RSHIFT(t8, 1); \ - tn += tk; \ - tk -= OD_DCT_RSHIFT(tn, 1); \ - t9 += tl; \ - tl -= OD_DCT_RSHIFT(t9, 1); \ - tm -= ta; \ - ta += OD_DCT_RSHIFT(tm, 1); \ - tc -= tf; \ - tf += OD_DCT_RSHIFT(tc, 1); \ - tj += tg; \ - tg -= OD_DCT_RSHIFT(tj, 1); \ - td -= te; \ - te += OD_DCT_RSHIFT(td, 1); \ - ti += th; \ - th -= OD_DCT_RSHIFT(ti, 1); \ - \ - t9 = -t9; \ - tl = -tl; \ - \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \ - t8 += (tn*805 + 8192) >> 14; \ - /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ - OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \ - tn -= (t8*803 + 4096) >> 13; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \ - t8 += (tn*805 + 8192) >> 14; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \ - tk += (tb*11725 + 16384) >> 15; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ - OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \ - tb -= (tk*5197 + 4096) >> 13; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \ - tk += (tb*11725 + 16384) >> 15; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ - OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \ - ta += (tl*2455 + 2048) >> 12; \ - /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ - OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \ - tl -= (ta*14449 + 8192) >> 14; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ - OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \ - ta += (tl*2455 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \ - t9 += 
(tm*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ - OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \ - tm -= (t9*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \ - t9 += (tm*4861 + 16384) >> 15; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \ - tf += (tg*805 + 8192) >> 14; \ - /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ - OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \ - tg -= (tf*803 + 4096) >> 13; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \ - tf += (tg*805 + 8192) >> 14; \ - /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \ - tc += (tj*2931 + 4096) >> 13; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ - OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \ - tj -= (tc*5197 + 4096) >> 13; \ - /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \ - tc += (tj*2931 + 4096) >> 13; \ - /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ - OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \ - td += (ti*513 + 1024) >> 11; \ - /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \ - OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \ - ti -= (td*7723 + 8192) >> 14; \ - /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ - OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \ - td += (ti*513 + 1024) >> 11; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \ - te += (th*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ - OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \ - th -= (te*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \ - te += (th*4861 + 16384) >> 15; \ - \ - ta = 
-ta; \ - tb = -tb; \ - \ - tt += t5h; \ - t5 -= tt; \ - t2 -= tqh; \ - tq += t2; \ - tp += t1h; \ - t1 -= tp; \ - t6 -= tuh; \ - tu += t6; \ - t7 += tvh; \ - tv -= t7; \ - to += t0h; \ - t0 -= to; \ - t3 -= t4h; \ - t4 += t3; \ - ts += trh; \ - tr -= ts; \ - tf -= OD_DCT_RSHIFT(tn, 1); \ - tn += tf; \ - tg -= OD_DCT_RSHIFT(t8, 1); \ - t8 += tg; \ - tk += OD_DCT_RSHIFT(tc, 1); \ - tc -= tk; \ - tb += OD_DCT_RSHIFT(tj, 1); \ - tj -= tb; \ - ta += OD_DCT_RSHIFT(ti, 1); \ - ti -= ta; \ - tl += OD_DCT_RSHIFT(td, 1); \ - td -= tl; \ - te -= OD_DCT_RSHIFT(tm, 1); \ - tm += te; \ - th -= OD_DCT_RSHIFT(t9, 1); \ - t9 += th; \ - ta -= t5; \ - t5 += OD_DCT_RSHIFT(ta, 1); \ - tq -= tl; \ - tl += OD_DCT_RSHIFT(tq, 1); \ - t2 -= ti; \ - ti += OD_DCT_RSHIFT(t2, 1); \ - td -= tt; \ - tt += OD_DCT_RSHIFT(td, 1); \ - tm += tp; \ - tp -= OD_DCT_RSHIFT(tm, 1); \ - t6 += t9; \ - t9 -= OD_DCT_RSHIFT(t6, 1); \ - te -= tu; \ - tu += OD_DCT_RSHIFT(te, 1); \ - t1 -= th; \ - th += OD_DCT_RSHIFT(t1, 1); \ - t0 -= tg; \ - tg += OD_DCT_RSHIFT(t0, 1); \ - tf += tv; \ - tv -= OD_DCT_RSHIFT(tf, 1); \ - t8 -= t7; \ - t7 += OD_DCT_RSHIFT(t8, 1); \ - to -= tn; \ - tn += OD_DCT_RSHIFT(to, 1); \ - t4 -= tk; \ - tk += OD_DCT_RSHIFT(t4, 1); \ - tb -= tr; \ - tr += OD_DCT_RSHIFT(tb, 1); \ - t3 -= tj; \ - tj += OD_DCT_RSHIFT(t3, 1); \ - tc -= ts; \ - ts += OD_DCT_RSHIFT(tc, 1); \ - \ - tr = -tr; \ - ts = -ts; \ - tt = -tt; \ - tu = -tu; \ - \ - /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \ - tv += (t0*2847 + 2048) >> 12; \ - /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \ - t0 -= (tv*5791 + 2048) >> 12; \ - /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \ - tv += (t0*5593 + 4096) >> 13; \ - /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \ - tg -= (tf*4099 + 4096) >> 
13; \ - /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \ - tf += (tg*1997 + 1024) >> 11; \ - /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \ - tg += (tf*815 + 16384) >> 15; \ - /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \ - tn -= (t8*2527 + 2048) >> 12; \ - /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \ - t8 += (tn*4695 + 4096) >> 13; \ - /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \ - tn += (t8*4187 + 4096) >> 13; \ - /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \ - t7 += (to*5477 + 4096) >> 13; \ - /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \ - to -= (t7*4169 + 4096) >> 13; \ - /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \ - t7 -= (to*2571 + 2048) >> 12; \ - /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \ - tt += (t2*5331 + 4096) >> 13; \ - /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \ - t2 -= (tt*5749 + 2048) >> 12; \ - /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \ - tt += (t2*2413 + 2048) >> 12; \ - /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \ - ti -= (td*4167 + 4096) >> 13; \ - /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \ - td += (ti*891 + 512) >> 10; \ - /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \ - ti += (td*4327 + 16384) >> 15; \ - 
/* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \ - tl -= (ta*2261 + 2048) >> 12; \ - /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \ - ta += (tl*2855 + 2048) >> 12; \ - /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \ - tl += (ta*5417 + 8192) >> 14; \ - /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \ - t5 += (tq*3459 + 2048) >> 12; \ - /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \ - tq -= (t5*1545 + 2048) >> 12; \ - /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \ - t5 -= (tq*1971 + 1024) >> 11; \ - /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \ - ts += (t3*323 + 256) >> 9; \ - /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \ - t3 -= (ts*5707 + 2048) >> 12; \ - /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \ - ts += (t3*2229 + 2048) >> 12; \ - /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \ - tj -= (tc*1061 + 1024) >> 11; \ - /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \ - tc += (tj*6671 + 4096) >> 13; \ - /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \ - tj += (tc*6287 + 16384) >> 15; \ - /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \ - tk -= (tb*4359 + 4096) >> 13; \ - /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \ - tb += (tk*3099 + 2048) >> 12; \ - /* 
-2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \ - tk += (tb*2109 + 4096) >> 13; \ - /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \ - tr += (t4*5017 + 4096) >> 13; \ - /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \ - t4 -= (tr*1413 + 512) >> 10; \ - /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \ - tr += (t4*8195 + 8192) >> 14; \ - /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \ - t9 += (tm*2373 + 2048) >> 12; \ - /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \ - tm -= (t9*5209 + 4096) >> 13; \ - /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \ - t9 -= (tm*3391 + 4096) >> 13; \ - /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \ - tp -= (t6*1517 + 1024) >> 11; \ - /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \ - t6 += (tp*1817 + 2048) >> 12; \ - /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \ - tp += (t6*6331 + 4096) >> 13; \ - /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \ - th -= (te*515 + 512) >> 10; \ - /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \ - te += (th*7567 + 4096) >> 13; \ - /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \ - th += (te*2513 + 16384) >> 15; \ - /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \ - tu += (t1*2753 + 2048) 
>> 12; \ - /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \ - t1 -= (tu*5777 + 2048) >> 12; \ - /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \ - OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \ - tu += (t1*1301 + 1024) >> 11; \ - } \ - while (0) - -#define OD_IDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \ - tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ - /* Embedded 32-point asymmetric Type-IV iDST. */ \ - do { \ - int t0h; \ - int t4h; \ - int tbh; \ - int tfh; \ - int tgh; \ - int tkh; \ - int trh; \ - int tvh; \ - /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \ - tf -= (tg*1301 + 1024) >> 11; \ - /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \ - tg += (tf*5777 + 2048) >> 12; \ - /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \ - tf -= (tg*2753 + 2048) >> 12; \ - /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \ - th -= (te*2513 + 16384) >> 15; \ - /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \ - te -= (th*7567 + 4096) >> 13; \ - /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \ - th += (te*515 + 512) >> 10; \ - /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \ - tj -= (tc*6331 + 4096) >> 13; \ - /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \ - tc -= (tj*1817 + 2048) >> 12; \ - /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \ - tj += (tc*1517 + 1024) >> 11; \ - /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \ - ti += (td*3391 + 4096) >> 13; \ - /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \ - td += (ti*5209 + 4096) >> 13; \ - /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \ - ti -= (td*2373 + 2048) >> 12; \ - /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \ - tr -= (t4*8195 + 8192) >> 14; \ - /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \ - t4 += (tr*1413 + 512) >> 10; \ - /* 5017/8192 ~= (1/Sqrt[2] - 
Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \ - tr -= (t4*5017 + 4096) >> 13; \ - /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \ - t5 -= (tq*2109 + 4096) >> 13; \ - /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \ - tq -= (t5*3099 + 2048) >> 12; \ - /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \ - t5 += (tq*4359 + 4096) >> 13; \ - /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \ - tp -= (t6*6287 + 16384) >> 15; \ - /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \ - t6 -= (tp*6671 + 4096) >> 13; \ - /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \ - tp += (t6*1061 + 1024) >> 11; \ - /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \ - t7 -= (to*2229 + 2048) >> 12; \ - /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \ - to += (t7*5707 + 2048) >> 12; \ - /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \ - t7 -= (to*323 + 256) >> 9; \ - /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \ - tk += (tb*1971 + 1024) >> 11; \ - /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \ - tb += (tk*1545 + 2048) >> 12; \ - /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \ - tk -= (tb*3459 + 2048) >> 12; \ - /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \ - tl -= (ta*5417 + 8192) >> 14; \ - /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \ - ta -= (tl*2855 + 2048) >> 12; \ - /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \ - tl += (ta*2261 + 2048) >> 12; \ - /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \ - t9 -= (tm*4327 + 16384) >> 15; \ - /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \ - tm -= (t9*891 + 512) >> 10; \ - /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \ - t9 += (tm*4167 + 4096) >> 13; \ - /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \ - tn -= (t8*2413 + 2048) >> 12; \ - /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \ - t8 += (tn*5749 + 2048) >> 12; \ - /* 5331/8192 ~= (1/Sqrt[2] - 
Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \ - tn -= (t8*5331 + 4096) >> 13; \ - /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \ - ts += (t3*2571 + 2048) >> 12; \ - /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \ - t3 += (ts*4169 + 4096) >> 13; \ - /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \ - ts -= (t3*5477 + 4096) >> 13; \ - /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \ - tt -= (t2*4187 + 4096) >> 13; \ - /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \ - t2 -= (tt*4695 + 4096) >> 13; \ - /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \ - tt += (t2*2527 + 2048) >> 12; \ - /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \ - t1 -= (tu*815 + 16384) >> 15; \ - /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \ - tu -= (t1*1997 + 1024) >> 11; \ - /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \ - t1 += (tu*4099 + 4096) >> 13; \ - /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \ - tv -= (t0*5593 + 4096) >> 13; \ - /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \ - t0 += (tv*5791 + 2048) >> 12; \ - /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \ - tv -= (t0*2847 + 2048) >> 12; \ - \ - t7 = -t7; \ - tf = -tf; \ - tn = -tn; \ - tr = -tr; \ - \ - t7 -= OD_DCT_RSHIFT(t6, 1); \ - t6 += t7; \ - tp -= OD_DCT_RSHIFT(to, 1); \ - to += tp; \ - tr -= OD_DCT_RSHIFT(tq, 1); \ - tq += tr; \ - t5 -= OD_DCT_RSHIFT(t4, 1); \ - t4 += t5; \ - tt -= OD_DCT_RSHIFT(t3, 1); \ - t3 += tt; \ - ts -= OD_DCT_RSHIFT(t2, 1); \ - t2 += ts; \ - tv += OD_DCT_RSHIFT(tu, 1); \ - tu -= tv; \ - t1 -= OD_DCT_RSHIFT(t0, 1); \ - t0 += t1; \ - th -= OD_DCT_RSHIFT(tg, 1); \ - tg += th; \ - tf -= OD_DCT_RSHIFT(te, 1); \ - te += tf; \ - ti += OD_DCT_RSHIFT(tc, 1); \ - tc -= ti; \ - tj += OD_DCT_RSHIFT(td, 1); \ - td -= tj; \ - tn -= OD_DCT_RSHIFT(tm, 1); \ - tm += tn; \ - t9 -= OD_DCT_RSHIFT(t8, 1); \ - t8 += t9; \ - tl -= OD_DCT_RSHIFT(tb, 1); \ - tb += tl; \ - tk -= OD_DCT_RSHIFT(ta, 1); \ - 
ta += tk; \ - \ - ti -= th; \ - th += OD_DCT_RSHIFT(ti, 1); \ - td -= te; \ - te += OD_DCT_RSHIFT(td, 1); \ - tm += tl; \ - tl -= OD_DCT_RSHIFT(tm, 1); \ - t9 += ta; \ - ta -= OD_DCT_RSHIFT(t9, 1); \ - tp += tq; \ - tq -= OD_DCT_RSHIFT(tp, 1); \ - t6 += t5; \ - t5 -= OD_DCT_RSHIFT(t6, 1); \ - t2 -= t1; \ - t1 += OD_DCT_RSHIFT(t2, 1); \ - tt -= tu; \ - tu += OD_DCT_RSHIFT(tt, 1); \ - tr += t7; \ - trh = OD_DCT_RSHIFT(tr, 1); \ - t7 -= trh; \ - t4 -= to; \ - t4h = OD_DCT_RSHIFT(t4, 1); \ - to += t4h; \ - t0 += t3; \ - t0h = OD_DCT_RSHIFT(t0, 1); \ - t3 -= t0h; \ - tv += ts; \ - tvh = OD_DCT_RSHIFT(tv, 1); \ - ts -= tvh; \ - tf -= tc; \ - tfh = OD_DCT_RSHIFT(tf, 1); \ - tc += tfh; \ - tg += tj; \ - tgh = OD_DCT_RSHIFT(tg, 1); \ - tj -= tgh; \ - tb -= t8; \ - tbh = OD_DCT_RSHIFT(tb, 1); \ - t8 += tbh; \ - tk += tn; \ - tkh = OD_DCT_RSHIFT(tk, 1); \ - tn -= tkh; \ - \ - ta = -ta; \ - tq = -tq; \ - \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - te -= (th*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ - th += (te*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - te -= (th*4861 + 16384) >> 15; \ - /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ - tm -= (t9*513 + 1024) >> 11; \ - /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \ - t9 += (tm*7723 + 8192) >> 14; \ - /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ - tm -= (t9*513 + 1024) >> 11; \ - /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - t6 -= (tp*2931 + 4096) >> 13; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ - tp += (t6*5197 + 4096) >> 13; \ - /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - t6 -= (tp*2931 + 4096) >> 13; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - tu -= (t1*805 + 8192) >> 14; \ - /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ - t1 += (tu*803 + 4096) >> 13; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - tu -= (t1*805 + 
8192) >> 14; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - ti -= (td*4861 + 16384) >> 15; \ - /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ - td += (ti*1189 + 2048) >> 12; \ - /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ - ti -= (td*4861 + 16384) >> 15; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ - ta -= (tl*2455 + 2048) >> 12; \ - /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ - tl += (ta*14449 + 8192) >> 14; \ - /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ - ta -= (tl*2455 + 2048) >> 12; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - t5 -= (tq*11725 + 16384) >> 15; \ - /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ - tq += (t5*5197 + 4096) >> 13; \ - /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ - t5 -= (tq*11725 + 16384) >> 15; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - t2 -= (tt*805 + 8192) >> 14; \ - /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ - tt += (t2*803 + 4096) >> 13; \ - /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ - t2 -= (tt*805 + 8192) >> 14; \ - \ - tl = -tl; \ - ti = -ti; \ - \ - th += OD_DCT_RSHIFT(t9, 1); \ - t9 -= th; \ - te -= OD_DCT_RSHIFT(tm, 1); \ - tm += te; \ - t1 += OD_DCT_RSHIFT(tp, 1); \ - tp -= t1; \ - tu -= OD_DCT_RSHIFT(t6, 1); \ - t6 += tu; \ - ta -= OD_DCT_RSHIFT(td, 1); \ - td += ta; \ - tl += OD_DCT_RSHIFT(ti, 1); \ - ti -= tl; \ - t5 += OD_DCT_RSHIFT(tt, 1); \ - tt -= t5; \ - tq += OD_DCT_RSHIFT(t2, 1); \ - t2 -= tq; \ - \ - t8 -= tgh; \ - tg += t8; \ - tn += tfh; \ - tf -= tn; \ - t7 -= tvh; \ - tv += t7; \ - to -= t0h; \ - t0 += to; \ - tc += tbh; \ - tb -= tc; \ - tj += tkh; \ - tk -= tj; \ - ts += t4h; \ - t4 -= ts; \ - t3 += trh; \ - tr -= t3; \ - \ - tk = -tk; \ - \ - /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ - tc -= (tj*2485 + 4096) >> 13; \ - /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ - tj += (tc*18205 + 16384) >> 15; \ - /* 2485/8192 ~= 
Tan[3*Pi/32] ~= 0.303346683607342 */ \ - tc -= (tj*2485 + 4096) >> 13; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - ts -= (t3*3227 + 16384) >> 15; \ - /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ - t3 += (ts*6393 + 16384) >> 15; \ - /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ - ts -= (t3*3227 + 16384) >> 15; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ - tk -= (tb*17515 + 16384) >> 15; \ - /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \ - tb += (tk*13623 + 8192) >> 14; \ - /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ - tk -= (tb*17515 + 16384) >> 15; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ - t4 -= (tr*6723 + 4096) >> 13; \ - /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \ - tr += (t4*16069 + 8192) >> 14; \ - /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ - t4 -= (tr*6723 + 4096) >> 13; \ - \ - t4 = -t4; \ - \ - tp += tm; \ - tm -= OD_DCT_RSHIFT(tp, 1); \ - t9 -= t6; \ - t6 += OD_DCT_RSHIFT(t9, 1); \ - th -= t1; \ - t1 += OD_DCT_RSHIFT(th, 1); \ - tu -= te; \ - te += OD_DCT_RSHIFT(tu, 1); /* pass */ \ - t5 -= tl; \ - tl += OD_DCT_RSHIFT(t5, 1); \ - ta += tq; \ - tq -= OD_DCT_RSHIFT(ta, 1); \ - td += tt; \ - tt -= OD_DCT_RSHIFT(td, 1); \ - t2 -= ti; \ - ti += OD_DCT_RSHIFT(t2, 1); /* pass */ \ - t7 += t8; \ - t8 -= OD_DCT_RSHIFT(t7, 1); \ - tn -= to; \ - to += OD_DCT_RSHIFT(tn, 1); \ - tf -= tv; \ - tv += OD_DCT_RSHIFT(tf, 1); \ - t0 += tg; \ - tg -= OD_DCT_RSHIFT(t0, 1); /* pass */ \ - tj -= t3; \ - t3 += OD_DCT_RSHIFT(tj, 1); /* pass */ \ - ts -= tc; \ - tc += OD_DCT_RSHIFT(ts, 1); \ - t4 -= tb; \ - tb += OD_DCT_RSHIFT(t4, 1); /* pass */ \ - tk -= tr; \ - tr += OD_DCT_RSHIFT(tk, 1); \ - \ - t1 = -t1; \ - t3 = -t3; \ - t7 = -t7; \ - t8 = -t8; \ - tg = -tg; \ - tm = -tm; \ - to = -to; \ - \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - tm -= (t9*14341 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 
*/ \ - t9 += (tm*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - tm -= (t9*4161 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - tp -= (t6*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t6 += (tp*15137 + 8192) >> 14; \ - /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - tp -= (t6*28681 + 16384) >> 15; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - th += (te*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - te += (th*11585 + 8192) >> 14; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - th -= (te*29957 + 16384) >> 15; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - tq -= (t5*14341 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t5 += (tq*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - tq -= (t5*4161 + 8192) >> 14; \ - /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ - ta -= (tl*3259 + 4096) >> 13; \ - /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \ - tl += (ta*3135 + 8192) >> 14; \ - /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ - ta -= (tl*3259 + 4096) >> 13; \ - /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - ti -= (td*7489 + 4096) >> 13; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - td += (ti*11585 + 8192) >> 14; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - ti += (td*19195 + 16384) >> 15; \ - /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - to -= (t7*14341 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t7 += (to*15137 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ - to -= (t7*4161 + 8192) >> 14; \ - /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 
0.253965075546204 */ \ - tn -= (t8*4161 + 8192) >> 14; \ - /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ - t8 += (tn*15137 + 8192) >> 14; \ - /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ - tn -= (t8*28681 + 16384) >> 15; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - tf += (tg*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - tg += (tf*11585 + 8192) >> 14; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - tf -= (tg*29957 + 16384) >> 15; \ - /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ - tj += (tc*19195 + 16384) >> 15; \ - /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ - tc += (tj*11585 + 8192) >> 14; \ - /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ - tj -= (tc*29957 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - tk += (tb*13573 + 8192) >> 14; \ - /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ - tb -= (tk*11585 + 16384) >> 15; \ - /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ - tk += (tb*13573 + 8192) >> 14; \ - \ - tf = -tf; \ - \ - } \ - while (0) - -#define OD_FDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \ - us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \ - ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \ - ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \ - /* Embedded 64-point orthonormal Type-II fDCT. 
*/ \ - do { \ - int uwh; \ - int uxh; \ - int uyh; \ - int uzh; \ - int uAh; \ - int uBh; \ - int uCh; \ - int uDh; \ - int uEh; \ - int uFh; \ - int uGh; \ - int uHh; \ - int uIh; \ - int uJh; \ - int uKh; \ - int uLh; \ - int uMh; \ - int uNh; \ - int uOh; \ - int uPh; \ - int uQh; \ - int uRh; \ - int uSh; \ - int uTh; \ - int uUh; \ - int uVh; \ - int uWh; \ - int uXh; \ - int uYh; \ - int uZh; \ - int u_h; \ - int uh_; \ - u = u0 - u; \ - uh_ = OD_DCT_RSHIFT(u, 1); \ - u0 -= uh_; \ - u_ += u1; \ - u_h = OD_DCT_RSHIFT(u_, 1); \ - u1 = u_h - u1; \ - uZ = u2 - uZ; \ - uZh = OD_DCT_RSHIFT(uZ, 1); \ - u2 -= uZh; \ - uY += u3; \ - uYh = OD_DCT_RSHIFT(uY, 1); \ - u3 = uYh - u3; \ - uX = u4 - uX; \ - uXh = OD_DCT_RSHIFT(uX, 1); \ - u4 -= uXh; \ - uW += u5; \ - uWh = OD_DCT_RSHIFT(uW, 1); \ - u5 = uWh - u5; \ - uV = u6 - uV; \ - uVh = OD_DCT_RSHIFT(uV, 1); \ - u6 -= uVh; \ - uU += u7; \ - uUh = OD_DCT_RSHIFT(uU, 1); \ - u7 = uUh - u7; \ - uT = u8 - uT; \ - uTh = OD_DCT_RSHIFT(uT, 1); \ - u8 -= uTh; \ - uS += u9; \ - uSh = OD_DCT_RSHIFT(uS, 1); \ - u9 = uSh - u9; \ - uR = ua - uR; \ - uRh = OD_DCT_RSHIFT(uR, 1); \ - ua -= uRh; \ - uQ += ub; \ - uQh = OD_DCT_RSHIFT(uQ, 1); \ - ub = uQh - ub; \ - uP = uc - uP; \ - uPh = OD_DCT_RSHIFT(uP, 1); \ - uc -= uPh; \ - uO += ud; \ - uOh = OD_DCT_RSHIFT(uO, 1); \ - ud = uOh - ud; \ - uN = ue - uN; \ - uNh = OD_DCT_RSHIFT(uN, 1); \ - ue -= uNh; \ - uM += uf; \ - uMh = OD_DCT_RSHIFT(uM, 1); \ - uf = uMh - uf; \ - uL = ug - uL; \ - uLh = OD_DCT_RSHIFT(uL, 1); \ - ug -= uLh; \ - uK += uh; \ - uKh = OD_DCT_RSHIFT(uK, 1); \ - uh = uKh - uh; \ - uJ = ui - uJ; \ - uJh = OD_DCT_RSHIFT(uJ, 1); \ - ui -= uJh; \ - uI += uj; \ - uIh = OD_DCT_RSHIFT(uI, 1); \ - uj = uIh - uj; \ - uH = uk - uH; \ - uHh = OD_DCT_RSHIFT(uH, 1); \ - uk -= uHh; \ - uG += ul; \ - uGh = OD_DCT_RSHIFT(uG, 1); \ - ul = uGh - ul; \ - uF = um - uF; \ - uFh = OD_DCT_RSHIFT(uF, 1); \ - um -= uFh; \ - uE += un; \ - uEh = OD_DCT_RSHIFT(uE, 1); \ - un = uEh - un; \ - uD = uo - 
uD; \ - uDh = OD_DCT_RSHIFT(uD, 1); \ - uo -= uDh; \ - uC += up; \ - uCh = OD_DCT_RSHIFT(uC, 1); \ - up = uCh - up; \ - uB = uq - uB; \ - uBh = OD_DCT_RSHIFT(uB, 1); \ - uq -= uBh; \ - uA += ur; \ - uAh = OD_DCT_RSHIFT(uA, 1); \ - ur = uAh - ur; \ - uz = us - uz; \ - uzh = OD_DCT_RSHIFT(uz, 1); \ - us -= uzh; \ - uy += ut; \ - uyh = OD_DCT_RSHIFT(uy, 1); \ - ut = uyh - ut; \ - ux = uu - ux; \ - uxh = OD_DCT_RSHIFT(ux, 1); \ - uu -= uxh; \ - uw += uv; \ - uwh = OD_DCT_RSHIFT(uw, 1); \ - uv = uwh - uv; \ - OD_FDCT_32_ASYM(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \ - u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \ - ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \ - ue, uK, uKh, uu, u_, u_h); \ - OD_FDST_32_ASYM(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \ - uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \ - } \ - while (0) - -#define OD_IDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \ - us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \ - ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \ - ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \ - /* Embedded 64-point orthonormal Type-II fDCT. 
*/ \ - do { \ - int u1h; \ - int u3h; \ - int u5h; \ - int u7h; \ - int u9h; \ - int ubh; \ - int udh; \ - int ufh; \ - int uhh; \ - int ujh; \ - int ulh; \ - int unh; \ - int uph; \ - int urh; \ - int uth; \ - int uvh; \ - int uxh; \ - int uzh; \ - int uBh; \ - int uDh; \ - int uFh; \ - int uHh; \ - int uJh; \ - int uLh; \ - int uNh; \ - int uPh; \ - int uRh; \ - int uTh; \ - int uVh; \ - int uXh; \ - int uZh; \ - int uh_; \ - OD_IDST_32_ASYM(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \ - uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \ - OD_IDCT_32_ASYM(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \ - ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \ - ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \ - uv, uvh); \ - uh_ = OD_DCT_RSHIFT(u, 1); \ - u0 += uh_; \ - u = u0 - u; \ - u_ = u1h - u_; \ - u1 -= u_; \ - uZh = OD_DCT_RSHIFT(uZ, 1); \ - u2 += uZh; \ - uZ = u2 - uZ; \ - uY = u3h - uY; \ - u3 -= uY; \ - uXh = OD_DCT_RSHIFT(uX, 1); \ - u4 += uXh; \ - uX = u4 - uX; \ - uW = u5h - uW; \ - u5 -= uW; \ - uVh = OD_DCT_RSHIFT(uV, 1); \ - u6 += uVh; \ - uV = u6 - uV; \ - uU = u7h - uU; \ - u7 -= uU; \ - uTh = OD_DCT_RSHIFT(uT, 1); \ - u8 += uTh; \ - uT = u8 - uT; \ - uS = u9h - uS; \ - u9 -= uS; \ - uRh = OD_DCT_RSHIFT(uR, 1); \ - ua += uRh; \ - uR = ua - uR; \ - uQ = ubh - uQ; \ - ub -= uQ; \ - uPh = OD_DCT_RSHIFT(uP, 1); \ - uc += uPh; \ - uP = uc - uP; \ - uO = udh - uO; \ - ud -= uO; \ - uNh = OD_DCT_RSHIFT(uN, 1); \ - ue += uNh; \ - uN = ue - uN; \ - uM = ufh - uM; \ - uf -= uM; \ - uLh = OD_DCT_RSHIFT(uL, 1); \ - ug += uLh; \ - uL = ug - uL; \ - uK = uhh - uK; \ - uh -= uK; \ - uJh = OD_DCT_RSHIFT(uJ, 1); \ - ui += uJh; \ - uJ = ui - uJ; \ - uI = ujh - uI; \ - uj -= uI; \ - uHh = OD_DCT_RSHIFT(uH, 1); \ - uk += uHh; \ - uH = uk - uH; \ - uG = ulh - uG; \ - ul -= uG; \ - uFh = OD_DCT_RSHIFT(uF, 1); \ - um += uFh; \ - uF = um - uF; \ - uE = unh - uE; \ - un -= uE; \ - 
uDh = OD_DCT_RSHIFT(uD, 1); \ - uo += uDh; \ - uD = uo - uD; \ - uC = uph - uC; \ - up -= uC; \ - uBh = OD_DCT_RSHIFT(uB, 1); \ - uq += uBh; \ - uB = uq - uB; \ - uA = urh - uA; \ - ur -= uA; \ - uzh = OD_DCT_RSHIFT(uz, 1); \ - us += uzh; \ - uz = us - uz; \ - uy = uth - uy; \ - ut -= uy; \ - uxh = OD_DCT_RSHIFT(ux, 1); \ - uu += uxh; \ - ux = uu - ux; \ - uw = uvh - uw; \ - uv -= uw; \ - } while (0) -#endif - -void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) { - int q0; - int q1; - int q2; - int q3; - q0 = x[0*xstride]; - q2 = x[1*xstride]; - q1 = x[2*xstride]; - q3 = x[3*xstride]; - OD_FDCT_4(q0, q2, q1, q3); - y[0] = (od_coeff)q0; - y[1] = (od_coeff)q1; - y[2] = (od_coeff)q2; - y[3] = (od_coeff)q3; -} - -void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) { - int q0; - int q1; - int q2; - int q3; - q0 = y[0]; - q2 = y[1]; - q1 = y[2]; - q3 = y[3]; - OD_IDCT_4(q0, q2, q1, q3); - x[0*xstride] = q0; - x[1*xstride] = q1; - x[2*xstride] = q2; - x[3*xstride] = q3; -} - -void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) { - int q0; - int q1; - int q2; - int q3; - q0 = x[3*xstride]; - q2 = x[2*xstride]; - q1 = x[1*xstride]; - q3 = x[0*xstride]; - OD_FDST_4(q0, q2, q1, q3); - y[0] = (od_coeff)q3; - y[1] = (od_coeff)q2; - y[2] = (od_coeff)q1; - y[3] = (od_coeff)q0; -} - -void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) { - int q0; - int q1; - int q2; - int q3; - q0 = y[3]; - q2 = y[2]; - q1 = y[1]; - q3 = y[0]; - OD_IDST_4(q0, q2, q1, q3); - x[0*xstride] = q3; - x[1*xstride] = q2; - x[2*xstride] = q1; - x[3*xstride] = q0; -} - -void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = x[0*xstride]; - r4 = x[1*xstride]; - r2 = x[2*xstride]; - r6 = x[3*xstride]; - r1 = x[4*xstride]; - r5 = x[5*xstride]; - r3 = x[6*xstride]; - r7 = x[7*xstride]; - OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7); - y[0] = (od_coeff)r0; - y[1] = 
(od_coeff)r1; - y[2] = (od_coeff)r2; - y[3] = (od_coeff)r3; - y[4] = (od_coeff)r4; - y[5] = (od_coeff)r5; - y[6] = (od_coeff)r6; - y[7] = (od_coeff)r7; -} - -void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = y[0]; - r4 = y[1]; - r2 = y[2]; - r6 = y[3]; - r1 = y[4]; - r5 = y[5]; - r3 = y[6]; - r7 = y[7]; - OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7); - x[0*xstride] = (od_coeff)r0; - x[1*xstride] = (od_coeff)r1; - x[2*xstride] = (od_coeff)r2; - x[3*xstride] = (od_coeff)r3; - x[4*xstride] = (od_coeff)r4; - x[5*xstride] = (od_coeff)r5; - x[6*xstride] = (od_coeff)r6; - x[7*xstride] = (od_coeff)r7; -} - -void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = x[0*xstride]; - r4 = x[1*xstride]; - r2 = x[2*xstride]; - r6 = x[3*xstride]; - r1 = x[4*xstride]; - r5 = x[5*xstride]; - r3 = x[6*xstride]; - r7 = x[7*xstride]; - OD_FDST_8(r0, r4, r2, r6, r1, r5, r3, r7); - y[0] = (od_coeff)r0; - y[1] = (od_coeff)r1; - y[2] = (od_coeff)r2; - y[3] = (od_coeff)r3; - y[4] = (od_coeff)r4; - y[5] = (od_coeff)r5; - y[6] = (od_coeff)r6; - y[7] = (od_coeff)r7; -} - -void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = y[0]; - r4 = y[1]; - r2 = y[2]; - r6 = y[3]; - r1 = y[4]; - r5 = y[5]; - r3 = y[6]; - r7 = y[7]; - OD_IDST_8(r0, r4, r2, r6, r1, r5, r3, r7); - x[0*xstride] = (od_coeff)r0; - x[1*xstride] = (od_coeff)r1; - x[2*xstride] = (od_coeff)r2; - x[3*xstride] = (od_coeff)r3; - x[4*xstride] = (od_coeff)r4; - x[5*xstride] = (od_coeff)r5; - x[6*xstride] = (od_coeff)r6; - x[7*xstride] = (od_coeff)r7; -} - -void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride) { - int s0; - int s1; - int s2; - int s3; - int s4; - int s5; - int s6; - int s7; - int s8; - int s9; - int sa; - int sb; - 
int sc; - int sd; - int se; - int sf; - s0 = x[0*xstride]; - s8 = x[1*xstride]; - s4 = x[2*xstride]; - sc = x[3*xstride]; - s2 = x[4*xstride]; - sa = x[5*xstride]; - s6 = x[6*xstride]; - se = x[7*xstride]; - s1 = x[8*xstride]; - s9 = x[9*xstride]; - s5 = x[10*xstride]; - sd = x[11*xstride]; - s3 = x[12*xstride]; - sb = x[13*xstride]; - s7 = x[14*xstride]; - sf = x[15*xstride]; - OD_FDCT_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf); - y[0] = (od_coeff)s0; - y[1] = (od_coeff)s1; - y[2] = (od_coeff)s2; - y[3] = (od_coeff)s3; - y[4] = (od_coeff)s4; - y[5] = (od_coeff)s5; - y[6] = (od_coeff)s6; - y[7] = (od_coeff)s7; - y[8] = (od_coeff)s8; - y[9] = (od_coeff)s9; - y[10] = (od_coeff)sa; - y[11] = (od_coeff)sb; - y[12] = (od_coeff)sc; - y[13] = (od_coeff)sd; - y[14] = (od_coeff)se; - y[15] = (od_coeff)sf; -} - -void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]) { - int s0; - int s1; - int s2; - int s3; - int s4; - int s5; - int s6; - int s7; - int s8; - int s9; - int sa; - int sb; - int sc; - int sd; - int se; - int sf; - s0 = y[0]; - s8 = y[1]; - s4 = y[2]; - sc = y[3]; - s2 = y[4]; - sa = y[5]; - s6 = y[6]; - se = y[7]; - s1 = y[8]; - s9 = y[9]; - s5 = y[10]; - sd = y[11]; - s3 = y[12]; - sb = y[13]; - s7 = y[14]; - sf = y[15]; - OD_IDCT_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf); - x[0*xstride] = (od_coeff)s0; - x[1*xstride] = (od_coeff)s1; - x[2*xstride] = (od_coeff)s2; - x[3*xstride] = (od_coeff)s3; - x[4*xstride] = (od_coeff)s4; - x[5*xstride] = (od_coeff)s5; - x[6*xstride] = (od_coeff)s6; - x[7*xstride] = (od_coeff)s7; - x[8*xstride] = (od_coeff)s8; - x[9*xstride] = (od_coeff)s9; - x[10*xstride] = (od_coeff)sa; - x[11*xstride] = (od_coeff)sb; - x[12*xstride] = (od_coeff)sc; - x[13*xstride] = (od_coeff)sd; - x[14*xstride] = (od_coeff)se; - x[15*xstride] = (od_coeff)sf; -} - -void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride) { - int s0; - int s1; - int s2; - int s3; - int s4; - int s5; - 
int s6; - int s7; - int s8; - int s9; - int sa; - int sb; - int sc; - int sd; - int se; - int sf; - s0 = x[15*xstride]; - s8 = x[14*xstride]; - s4 = x[13*xstride]; - sc = x[12*xstride]; - s2 = x[11*xstride]; - sa = x[10*xstride]; - s6 = x[9*xstride]; - se = x[8*xstride]; - s1 = x[7*xstride]; - s9 = x[6*xstride]; - s5 = x[5*xstride]; - sd = x[4*xstride]; - s3 = x[3*xstride]; - sb = x[2*xstride]; - s7 = x[1*xstride]; - sf = x[0*xstride]; - OD_FDST_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf); - y[0] = (od_coeff)sf; - y[1] = (od_coeff)se; - y[2] = (od_coeff)sd; - y[3] = (od_coeff)sc; - y[4] = (od_coeff)sb; - y[5] = (od_coeff)sa; - y[6] = (od_coeff)s9; - y[7] = (od_coeff)s8; - y[8] = (od_coeff)s7; - y[9] = (od_coeff)s6; - y[10] = (od_coeff)s5; - y[11] = (od_coeff)s4; - y[12] = (od_coeff)s3; - y[13] = (od_coeff)s2; - y[14] = (od_coeff)s1; - y[15] = (od_coeff)s0; -} - -void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]) { - int s0; - int s1; - int s2; - int s3; - int s4; - int s5; - int s6; - int s7; - int s8; - int s9; - int sa; - int sb; - int sc; - int sd; - int se; - int sf; - s0 = y[15]; - s8 = y[14]; - s4 = y[13]; - sc = y[12]; - s2 = y[11]; - sa = y[10]; - s6 = y[9]; - se = y[8]; - s1 = y[7]; - s9 = y[6]; - s5 = y[5]; - sd = y[4]; - s3 = y[3]; - sb = y[2]; - s7 = y[1]; - sf = y[0]; - OD_IDST_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf); - x[0*xstride] = (od_coeff)sf; - x[1*xstride] = (od_coeff)se; - x[2*xstride] = (od_coeff)sd; - x[3*xstride] = (od_coeff)sc; - x[4*xstride] = (od_coeff)sb; - x[5*xstride] = (od_coeff)sa; - x[6*xstride] = (od_coeff)s9; - x[7*xstride] = (od_coeff)s8; - x[8*xstride] = (od_coeff)s7; - x[9*xstride] = (od_coeff)s6; - x[10*xstride] = (od_coeff)s5; - x[11*xstride] = (od_coeff)s4; - x[12*xstride] = (od_coeff)s3; - x[13*xstride] = (od_coeff)s2; - x[14*xstride] = (od_coeff)s1; - x[15*xstride] = (od_coeff)s0; -} - -void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) { - 
/*215 adds, 38 shifts, 87 "muls".*/ - int t0; - int t1; - int t2; - int t3; - int t4; - int t5; - int t6; - int t7; - int t8; - int t9; - int ta; - int tb; - int tc; - int td; - int te; - int tf; - int tg; - int th; - int ti; - int tj; - int tk; - int tl; - int tm; - int tn; - int to; - int tp; - int tq; - int tr; - int ts; - int tt; - int tu; - int tv; - t0 = x[0*xstride]; - tg = x[1*xstride]; - t8 = x[2*xstride]; - to = x[3*xstride]; - t4 = x[4*xstride]; - tk = x[5*xstride]; - tc = x[6*xstride]; - ts = x[7*xstride]; - t2 = x[8*xstride]; - ti = x[9*xstride]; - ta = x[10*xstride]; - tq = x[11*xstride]; - t6 = x[12*xstride]; - tm = x[13*xstride]; - te = x[14*xstride]; - tu = x[15*xstride]; - t1 = x[16*xstride]; - th = x[17*xstride]; - t9 = x[18*xstride]; - tp = x[19*xstride]; - t5 = x[20*xstride]; - tl = x[21*xstride]; - td = x[22*xstride]; - tt = x[23*xstride]; - t3 = x[24*xstride]; - tj = x[25*xstride]; - tb = x[26*xstride]; - tr = x[27*xstride]; - t7 = x[28*xstride]; - tn = x[29*xstride]; - tf = x[30*xstride]; - tv = x[31*xstride]; - OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu, - t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv); - y[0] = (od_coeff)t0; - y[1] = (od_coeff)t1; - y[2] = (od_coeff)t2; - y[3] = (od_coeff)t3; - y[4] = (od_coeff)t4; - y[5] = (od_coeff)t5; - y[6] = (od_coeff)t6; - y[7] = (od_coeff)t7; - y[8] = (od_coeff)t8; - y[9] = (od_coeff)t9; - y[10] = (od_coeff)ta; - y[11] = (od_coeff)tb; - y[12] = (od_coeff)tc; - y[13] = (od_coeff)td; - y[14] = (od_coeff)te; - y[15] = (od_coeff)tf; - y[16] = (od_coeff)tg; - y[17] = (od_coeff)th; - y[18] = (od_coeff)ti; - y[19] = (od_coeff)tj; - y[20] = (od_coeff)tk; - y[21] = (od_coeff)tl; - y[22] = (od_coeff)tm; - y[23] = (od_coeff)tn; - y[24] = (od_coeff)to; - y[25] = (od_coeff)tp; - y[26] = (od_coeff)tq; - y[27] = (od_coeff)tr; - y[28] = (od_coeff)ts; - y[29] = (od_coeff)tt; - y[30] = (od_coeff)tu; - y[31] = (od_coeff)tv; -} - -void od_bin_idct32(od_coeff *x, int 
xstride, const od_coeff y[32]) { - int t0; - int t1; - int t2; - int t3; - int t4; - int t5; - int t6; - int t7; - int t8; - int t9; - int ta; - int tb; - int tc; - int td; - int te; - int tf; - int tg; - int th; - int ti; - int tj; - int tk; - int tl; - int tm; - int tn; - int to; - int tp; - int tq; - int tr; - int ts; - int tt; - int tu; - int tv; - t0 = y[0]; - tg = y[1]; - t8 = y[2]; - to = y[3]; - t4 = y[4]; - tk = y[5]; - tc = y[6]; - ts = y[7]; - t2 = y[8]; - ti = y[9]; - ta = y[10]; - tq = y[11]; - t6 = y[12]; - tm = y[13]; - te = y[14]; - tu = y[15]; - t1 = y[16]; - th = y[17]; - t9 = y[18]; - tp = y[19]; - t5 = y[20]; - tl = y[21]; - td = y[22]; - tt = y[23]; - t3 = y[24]; - tj = y[25]; - tb = y[26]; - tr = y[27]; - t7 = y[28]; - tn = y[29]; - tf = y[30]; - tv = y[31]; - OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu, - t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv); - x[0*xstride] = (od_coeff)t0; - x[1*xstride] = (od_coeff)t1; - x[2*xstride] = (od_coeff)t2; - x[3*xstride] = (od_coeff)t3; - x[4*xstride] = (od_coeff)t4; - x[5*xstride] = (od_coeff)t5; - x[6*xstride] = (od_coeff)t6; - x[7*xstride] = (od_coeff)t7; - x[8*xstride] = (od_coeff)t8; - x[9*xstride] = (od_coeff)t9; - x[10*xstride] = (od_coeff)ta; - x[11*xstride] = (od_coeff)tb; - x[12*xstride] = (od_coeff)tc; - x[13*xstride] = (od_coeff)td; - x[14*xstride] = (od_coeff)te; - x[15*xstride] = (od_coeff)tf; - x[16*xstride] = (od_coeff)tg; - x[17*xstride] = (od_coeff)th; - x[18*xstride] = (od_coeff)ti; - x[19*xstride] = (od_coeff)tj; - x[20*xstride] = (od_coeff)tk; - x[21*xstride] = (od_coeff)tl; - x[22*xstride] = (od_coeff)tm; - x[23*xstride] = (od_coeff)tn; - x[24*xstride] = (od_coeff)to; - x[25*xstride] = (od_coeff)tp; - x[26*xstride] = (od_coeff)tq; - x[27*xstride] = (od_coeff)tr; - x[28*xstride] = (od_coeff)ts; - x[29*xstride] = (od_coeff)tt; - x[30*xstride] = (od_coeff)tu; - x[31*xstride] = (od_coeff)tv; -} - -#if CONFIG_TX64X64 -void 
od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) { - int t0; - int t1; - int t2; - int t3; - int t4; - int t5; - int t6; - int t7; - int t8; - int t9; - int ta; - int tb; - int tc; - int td; - int te; - int tf; - int tg; - int th; - int ti; - int tj; - int tk; - int tl; - int tm; - int tn; - int to; - int tp; - int tq; - int tr; - int ts; - int tt; - int tu; - int tv; - int tw; - int tx; - int ty; - int tz; - int tA; - int tB; - int tC; - int tD; - int tE; - int tF; - int tG; - int tH; - int tI; - int tJ; - int tK; - int tL; - int tM; - int tN; - int tO; - int tP; - int tQ; - int tR; - int tS; - int tT; - int tU; - int tV; - int tW; - int tX; - int tY; - int tZ; - int t_; - int t; - t0 = x[0*xstride]; - tw = x[1*xstride]; - tg = x[2*xstride]; - tM = x[3*xstride]; - t8 = x[4*xstride]; - tE = x[5*xstride]; - to = x[6*xstride]; - tU = x[7*xstride]; - t4 = x[8*xstride]; - tA = x[9*xstride]; - tk = x[10*xstride]; - tQ = x[11*xstride]; - tc = x[12*xstride]; - tI = x[13*xstride]; - ts = x[14*xstride]; - tY = x[15*xstride]; - t2 = x[16*xstride]; - ty = x[17*xstride]; - ti = x[18*xstride]; - tO = x[19*xstride]; - ta = x[20*xstride]; - tG = x[21*xstride]; - tq = x[22*xstride]; - tW = x[23*xstride]; - t6 = x[24*xstride]; - tC = x[25*xstride]; - tm = x[26*xstride]; - tS = x[27*xstride]; - te = x[28*xstride]; - tK = x[29*xstride]; - tu = x[30*xstride]; - t_ = x[31*xstride]; - t1 = x[32*xstride]; - tx = x[33*xstride]; - th = x[34*xstride]; - tN = x[35*xstride]; - t9 = x[36*xstride]; - tF = x[37*xstride]; - tp = x[38*xstride]; - tV = x[39*xstride]; - t5 = x[40*xstride]; - tB = x[41*xstride]; - tl = x[42*xstride]; - tR = x[43*xstride]; - td = x[44*xstride]; - tJ = x[45*xstride]; - tt = x[46*xstride]; - tZ = x[47*xstride]; - t3 = x[48*xstride]; - tz = x[49*xstride]; - tj = x[50*xstride]; - tP = x[51*xstride]; - tb = x[52*xstride]; - tH = x[53*xstride]; - tr = x[54*xstride]; - tX = x[55*xstride]; - t7 = x[56*xstride]; - tD = x[57*xstride]; - tn = x[58*xstride]; - tT = 
x[59*xstride]; - tf = x[60*xstride]; - tL = x[61*xstride]; - tv = x[62*xstride]; - t = x[63*xstride]; - OD_FDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY, - t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, - th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, - tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t); - y[0] = (od_coeff)t0; - y[1] = (od_coeff)t1; - y[2] = (od_coeff)t2; - y[3] = (od_coeff)t3; - y[4] = (od_coeff)t4; - y[5] = (od_coeff)t5; - y[6] = (od_coeff)t6; - y[7] = (od_coeff)t7; - y[8] = (od_coeff)t8; - y[9] = (od_coeff)t9; - y[10] = (od_coeff)ta; - y[11] = (od_coeff)tb; - y[12] = (od_coeff)tc; - y[13] = (od_coeff)td; - y[14] = (od_coeff)te; - y[15] = (od_coeff)tf; - y[16] = (od_coeff)tg; - y[17] = (od_coeff)th; - y[18] = (od_coeff)ti; - y[19] = (od_coeff)tj; - y[20] = (od_coeff)tk; - y[21] = (od_coeff)tl; - y[22] = (od_coeff)tm; - y[23] = (od_coeff)tn; - y[24] = (od_coeff)to; - y[25] = (od_coeff)tp; - y[26] = (od_coeff)tq; - y[27] = (od_coeff)tr; - y[28] = (od_coeff)ts; - y[29] = (od_coeff)tt; - y[30] = (od_coeff)tu; - y[31] = (od_coeff)tv; - y[32] = (od_coeff)tw; - y[33] = (od_coeff)tx; - y[34] = (od_coeff)ty; - y[35] = (od_coeff)tz; - y[36] = (od_coeff)tA; - y[37] = (od_coeff)tB; - y[38] = (od_coeff)tC; - y[39] = (od_coeff)tD; - y[40] = (od_coeff)tE; - y[41] = (od_coeff)tF; - y[41] = (od_coeff)tF; - y[42] = (od_coeff)tG; - y[43] = (od_coeff)tH; - y[44] = (od_coeff)tI; - y[45] = (od_coeff)tJ; - y[46] = (od_coeff)tK; - y[47] = (od_coeff)tL; - y[48] = (od_coeff)tM; - y[49] = (od_coeff)tN; - y[50] = (od_coeff)tO; - y[51] = (od_coeff)tP; - y[52] = (od_coeff)tQ; - y[53] = (od_coeff)tR; - y[54] = (od_coeff)tS; - y[55] = (od_coeff)tT; - y[56] = (od_coeff)tU; - y[57] = (od_coeff)tV; - y[58] = (od_coeff)tW; - y[59] = (od_coeff)tX; - y[60] = (od_coeff)tY; - y[61] = (od_coeff)tZ; - y[62] = (od_coeff)t_; - y[63] = (od_coeff)t; -} - -void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) 
{ - int t0; - int t1; - int t2; - int t3; - int t4; - int t5; - int t6; - int t7; - int t8; - int t9; - int ta; - int tb; - int tc; - int td; - int te; - int tf; - int tg; - int th; - int ti; - int tj; - int tk; - int tl; - int tm; - int tn; - int to; - int tp; - int tq; - int tr; - int ts; - int tt; - int tu; - int tv; - int tw; - int tx; - int ty; - int tz; - int tA; - int tB; - int tC; - int tD; - int tE; - int tF; - int tG; - int tH; - int tI; - int tJ; - int tK; - int tL; - int tM; - int tN; - int tO; - int tP; - int tQ; - int tR; - int tS; - int tT; - int tU; - int tV; - int tW; - int tX; - int tY; - int tZ; - int t_; - int t; - t0 = y[0]; - tw = y[1]; - tg = y[2]; - tM = y[3]; - t8 = y[4]; - tE = y[5]; - to = y[6]; - tU = y[7]; - t4 = y[8]; - tA = y[9]; - tk = y[10]; - tQ = y[11]; - tc = y[12]; - tI = y[13]; - ts = y[14]; - tY = y[15]; - t2 = y[16]; - ty = y[17]; - ti = y[18]; - tO = y[19]; - ta = y[20]; - tG = y[21]; - tq = y[22]; - tW = y[23]; - t6 = y[24]; - tC = y[25]; - tm = y[26]; - tS = y[27]; - te = y[28]; - tK = y[29]; - tu = y[30]; - t_ = y[31]; - t1 = y[32]; - tx = y[33]; - th = y[34]; - tN = y[35]; - t9 = y[36]; - tF = y[37]; - tp = y[38]; - tV = y[39]; - t5 = y[40]; - tB = y[41]; - tl = y[42]; - tR = y[43]; - td = y[44]; - tJ = y[45]; - tt = y[46]; - tZ = y[47]; - t3 = y[48]; - tz = y[49]; - tj = y[50]; - tP = y[51]; - tb = y[52]; - tH = y[53]; - tr = y[54]; - tX = y[55]; - t7 = y[56]; - tD = y[57]; - tn = y[58]; - tT = y[59]; - tf = y[60]; - tL = y[61]; - tv = y[62]; - t = y[63]; - OD_IDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY, - t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, - th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, - tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t); - x[0*xstride] = (od_coeff)t0; - x[1*xstride] = (od_coeff)t1; - x[2*xstride] = (od_coeff)t2; - x[3*xstride] = (od_coeff)t3; - x[4*xstride] = (od_coeff)t4; - x[5*xstride] = (od_coeff)t5; - 
x[6*xstride] = (od_coeff)t6; - x[7*xstride] = (od_coeff)t7; - x[8*xstride] = (od_coeff)t8; - x[9*xstride] = (od_coeff)t9; - x[10*xstride] = (od_coeff)ta; - x[11*xstride] = (od_coeff)tb; - x[12*xstride] = (od_coeff)tc; - x[13*xstride] = (od_coeff)td; - x[14*xstride] = (od_coeff)te; - x[15*xstride] = (od_coeff)tf; - x[16*xstride] = (od_coeff)tg; - x[17*xstride] = (od_coeff)th; - x[18*xstride] = (od_coeff)ti; - x[19*xstride] = (od_coeff)tj; - x[20*xstride] = (od_coeff)tk; - x[21*xstride] = (od_coeff)tl; - x[22*xstride] = (od_coeff)tm; - x[23*xstride] = (od_coeff)tn; - x[24*xstride] = (od_coeff)to; - x[25*xstride] = (od_coeff)tp; - x[26*xstride] = (od_coeff)tq; - x[27*xstride] = (od_coeff)tr; - x[28*xstride] = (od_coeff)ts; - x[29*xstride] = (od_coeff)tt; - x[30*xstride] = (od_coeff)tu; - x[31*xstride] = (od_coeff)tv; - x[32*xstride] = (od_coeff)tw; - x[33*xstride] = (od_coeff)tx; - x[34*xstride] = (od_coeff)ty; - x[35*xstride] = (od_coeff)tz; - x[36*xstride] = (od_coeff)tA; - x[37*xstride] = (od_coeff)tB; - x[38*xstride] = (od_coeff)tC; - x[39*xstride] = (od_coeff)tD; - x[40*xstride] = (od_coeff)tE; - x[41*xstride] = (od_coeff)tF; - x[41*xstride] = (od_coeff)tF; - x[42*xstride] = (od_coeff)tG; - x[43*xstride] = (od_coeff)tH; - x[44*xstride] = (od_coeff)tI; - x[45*xstride] = (od_coeff)tJ; - x[46*xstride] = (od_coeff)tK; - x[47*xstride] = (od_coeff)tL; - x[48*xstride] = (od_coeff)tM; - x[49*xstride] = (od_coeff)tN; - x[50*xstride] = (od_coeff)tO; - x[51*xstride] = (od_coeff)tP; - x[52*xstride] = (od_coeff)tQ; - x[53*xstride] = (od_coeff)tR; - x[54*xstride] = (od_coeff)tS; - x[55*xstride] = (od_coeff)tT; - x[56*xstride] = (od_coeff)tU; - x[57*xstride] = (od_coeff)tV; - x[58*xstride] = (od_coeff)tW; - x[59*xstride] = (od_coeff)tX; - x[60*xstride] = (od_coeff)tY; - x[61*xstride] = (od_coeff)tZ; - x[62*xstride] = (od_coeff)t_; - x[63*xstride] = (od_coeff)t; -} -#endif - -void daala_fdct4(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff 
y[4]; - for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i]; - od_bin_fdct4(y, x, 1); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idct4(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff y[4]; - for (i = 0; i < 4; i++) y[i] = input[i]; - od_bin_idct4(x, 1, y); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i]; -} - -void daala_fdst4(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff y[4]; - for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i]; - od_bin_fdst4(y, x, 1); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idst4(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff y[4]; - for (i = 0; i < 4; i++) y[i] = input[i]; - od_bin_idst4(x, 1, y); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i]; -} - -void daala_idtx4(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 4; i++) output[i] = input[i]; -} - -void daala_fdct8(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i]; - od_bin_fdct8(y, x, 1); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idct8(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idct8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -void daala_fdst8(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i]; - od_bin_fdst8(y, x, 1); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idst8(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idst8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -void 
daala_idtx8(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 8; i++) output[i] = input[i]; -} - -void daala_fdct16(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[16]; - od_coeff y[16]; - for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i]; - od_bin_fdct16(y, x, 1); - for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idct16(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[16]; - od_coeff y[16]; - for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i]; - od_bin_idct16(x, 1, y); - for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i]; -} - -void daala_fdst16(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[16]; - od_coeff y[16]; - for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i]; - od_bin_fdst16(y, x, 1); - for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idst16(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[16]; - od_coeff y[16]; - for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i]; - od_bin_idst16(x, 1, y); - for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i]; -} - -void daala_idtx16(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 16; i++) output[i] = input[i]; -} - -void daala_fdct32(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[32]; - od_coeff y[32]; - for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i]; - od_bin_fdct32(y, x, 1); - for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idct32(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[32]; - od_coeff y[32]; - for (i = 0; i < 32; i++) y[i] = (od_coeff)input[i]; - od_bin_idct32(x, 1, y); - for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i]; -} - -/* Preserve the "half-right" transform behavior. 
*/ -void daala_fdst32(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[16]; - for (i = 0; i < 16; ++i) { - output[16 + i] = input[i]; - } - for (i = 0; i < 16; ++i) { - inputhalf[i] = input[i + 16]; - } - daala_fdct16(inputhalf, output); -} - -/* Preserve the "half-right" transform behavior. */ -void daala_idst32(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[16]; - for (i = 0; i < 16; ++i) { - inputhalf[i] = input[i]; - } - for (i = 0; i < 16; ++i) { - output[i] = input[16 + i]; - } - daala_idct16(inputhalf, output + 16); -} - -void daala_idtx32(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 32; i++) output[i] = input[i]; -} - -#if CONFIG_TX64X64 -void daala_fdct64(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[64]; - od_coeff y[64]; - for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i]; - od_bin_fdct64(y, x, 1); - for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i]; -} - -void daala_idct64(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[64]; - od_coeff y[64]; - for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i]; - od_bin_idct64(x, 1, y); - for (i = 0; i < 64; i++) output[i] = (tran_low_t)x[i]; -} - -/* Preserve the "half-right" transform behavior. */ -void daala_fdst64(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[32]; - for (i = 0; i < 32; ++i) { - output[32 + i] = input[i]; - } - for (i = 0; i < 32; ++i) { - inputhalf[i] = input[i + 32]; - } - daala_fdct32(inputhalf, output); -} - -/* Preserve the "half-right" transform behavior. 
*/ -void daala_idst64(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[32]; - for (i = 0; i < 32; ++i) { - inputhalf[i] = input[i]; - } - for (i = 0; i < 32; ++i) { - output[i] = input[32 + i]; - } - daala_idct32(inputhalf, output + 32); -} - -void daala_idtx64(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 64; i++) output[i] = input[i]; -} -#endif diff --git a/third_party/aom/av1/common/daala_tx.h b/third_party/aom/av1/common/daala_tx.h deleted file mode 100644 index 7145b66a2..000000000 --- a/third_party/aom/av1/common/daala_tx.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef AOM_DSP_DAALA_TX_H_ -#define AOM_DSP_DAALA_TX_H_ - -#include "aom_dsp/aom_dsp_common.h" -#include "av1/common/odintrin.h" - -void daala_fdct4(const tran_low_t *input, tran_low_t *output); -void daala_idct4(const tran_low_t *input, tran_low_t *output); -void daala_fdst4(const tran_low_t *input, tran_low_t *output); -void daala_idst4(const tran_low_t *input, tran_low_t *output); -void daala_idtx4(const tran_low_t *input, tran_low_t *output); -void daala_fdct8(const tran_low_t *input, tran_low_t *output); -void daala_idct8(const tran_low_t *input, tran_low_t *output); -void daala_fdst8(const tran_low_t *input, tran_low_t *output); -void daala_idst8(const tran_low_t *input, tran_low_t *output); -void daala_idtx8(const tran_low_t *input, tran_low_t *output); -void daala_fdct16(const tran_low_t *input, tran_low_t *output); -void daala_idct16(const tran_low_t *input, tran_low_t *output); -void daala_fdst16(const tran_low_t *input, tran_low_t *output); -void daala_idst16(const tran_low_t *input, tran_low_t *output); -void daala_idtx16(const tran_low_t *input, tran_low_t *output); -void daala_fdct32(const tran_low_t *input, tran_low_t *output); -void daala_idct32(const tran_low_t *input, tran_low_t *output); -void daala_fdst32(const tran_low_t *input, tran_low_t *output); -void daala_idst32(const tran_low_t *input, tran_low_t *output); -void 
daala_idtx32(const tran_low_t *input, tran_low_t *output); -#if CONFIG_TX64X64 -void daala_fdct64(const tran_low_t *input, tran_low_t *output); -void daala_idct64(const tran_low_t *input, tran_low_t *output); -void daala_fdst64(const tran_low_t *input, tran_low_t *output); -void daala_idst64(const tran_low_t *input, tran_low_t *output); -void daala_idtx64(const tran_low_t *input, tran_low_t *output); -#endif - -void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride); -void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]); -void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride); -void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]); -void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride); -void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]); -void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride); -void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]); -void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride); -void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]); -void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride); -void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]); -void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride); -void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]); -#if CONFIG_TX64X64 -void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride); -void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]); -#endif -#endif diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c index 91f33d4e3..868f341b5 100644 --- a/third_party/aom/av1/common/debugmodes.c +++ b/third_party/aom/av1/common/debugmodes.c @@ -27,7 +27,7 @@ static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { int mi_row, mi_col; - MODE_INFO **mi = 
cm->mi_grid_visible; + MB_MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; char prefix = descriptor[0]; @@ -36,8 +36,7 @@ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(file, "%2d ", - *((char *)((char *)(&mi[0]->mbmi) + member_offset))); + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); @@ -50,7 +49,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { int mi_row; int mi_col; FILE *mvs = fopen(file, "a"); - MODE_INFO **mi = cm->mi_grid_visible; + MB_MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; @@ -65,7 +64,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "S "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[0]->mbmi.skip); + fprintf(mvs, "%2d ", mi[0]->skip); mi++; } fprintf(mvs, "\n"); @@ -79,8 +78,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row, - mi[0]->mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); mi++; } fprintf(mvs, "\n"); @@ -90,3 +88,20 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { fclose(mvs); } + +void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename) { + FILE *hdrFile = fopen(filename, "w"); + fwrite(data, size, sizeof(uint8_t), hdrFile); + fclose(hdrFile); +} + +void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) { + FILE *fcFile = fopen(filename, "w"); + const uint16_t *fcp = (uint16_t *)fc; 
+ const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t); + unsigned int i; + + for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++); + fclose(fcFile); +} diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c index 17a8f1356..4f95ef69b 100644 --- a/third_party/aom/av1/common/entropy.c +++ b/third_party/aom/av1/common/entropy.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" #include "av1/common/blockd.h" @@ -17,2442 +18,161 @@ #include "av1/common/entropymode.h" #include "av1/common/onyxc_int.h" #include "av1/common/scan.h" -#if CONFIG_Q_ADAPT_PROBS #include "av1/common/token_cdfs.h" -#endif // CONFIG_Q_ADAPT_PROBS -#if CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif - -// Unconstrained Node Tree -/* clang-format off */ -const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { - 2, 6, // 0 = LOW_VAL - -TWO_TOKEN, 4, // 1 = TWO - -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE - 8, 10, // 3 = HIGH_LOW - -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE - 12, 14, // 5 = CAT_THREEFOUR - -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE - -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE -}; -/* clang-format on */ - -#if CONFIG_NEW_MULTISYMBOL -/* Extra bits coded from LSB to MSB */ -const aom_cdf_prob av1_cat1_cdf0[CDF_SIZE(2)] = { AOM_ICDF(20352), - AOM_ICDF(32768), 0 }; -const aom_cdf_prob *av1_cat1_cdf[] = { av1_cat1_cdf0 }; - -const aom_cdf_prob av1_cat2_cdf0[CDF_SIZE(4)] = { - AOM_ICDF(11963), AOM_ICDF(21121), AOM_ICDF(27719), AOM_ICDF(32768), 0 -}; -const aom_cdf_prob *av1_cat2_cdf[] = { av1_cat2_cdf0 }; -const aom_cdf_prob av1_cat3_cdf0[CDF_SIZE(8)] = { - AOM_ICDF(7001), AOM_ICDF(12802), AOM_ICDF(17911), - AOM_ICDF(22144), AOM_ICDF(25503), AOM_ICDF(28286), - AOM_ICDF(30737), AOM_ICDF(32768), 0 -}; -const aom_cdf_prob 
*av1_cat3_cdf[] = { av1_cat3_cdf0 }; - -const aom_cdf_prob av1_cat4_cdf0[CDF_SIZE(16)] = { AOM_ICDF(3934), - AOM_ICDF(7460), - AOM_ICDF(10719), - AOM_ICDF(13640), - AOM_ICDF(16203), - AOM_ICDF(18500), - AOM_ICDF(20624), - AOM_ICDF(22528), - AOM_ICDF(24316), - AOM_ICDF(25919), - AOM_ICDF(27401), - AOM_ICDF(28729), - AOM_ICDF(29894), - AOM_ICDF(30938), - AOM_ICDF(31903), - AOM_ICDF(32768), - 0 }; -const aom_cdf_prob *av1_cat4_cdf[] = { av1_cat4_cdf0 }; - -const aom_cdf_prob av1_cat5_cdf0[CDF_SIZE(16)] = { AOM_ICDF(2942), - AOM_ICDF(5794), - AOM_ICDF(8473), - AOM_ICDF(11069), - AOM_ICDF(13469), - AOM_ICDF(15795), - AOM_ICDF(17980), - AOM_ICDF(20097), - AOM_ICDF(21952), - AOM_ICDF(23750), - AOM_ICDF(25439), - AOM_ICDF(27076), - AOM_ICDF(28589), - AOM_ICDF(30056), - AOM_ICDF(31434), - AOM_ICDF(32768), - 0 }; -const aom_cdf_prob av1_cat5_cdf1[CDF_SIZE(2)] = { AOM_ICDF(23040), - AOM_ICDF(32768), 0 }; -const aom_cdf_prob *av1_cat5_cdf[] = { av1_cat5_cdf0, av1_cat5_cdf1 }; - -const aom_cdf_prob av1_cat6_cdf0[CDF_SIZE(16)] = { - AOM_ICDF(2382), AOM_ICDF(4727), AOM_ICDF(7036), AOM_ICDF(9309), - AOM_ICDF(11512), AOM_ICDF(13681), AOM_ICDF(15816), AOM_ICDF(17918), - AOM_ICDF(19892), AOM_ICDF(21835), AOM_ICDF(23748), AOM_ICDF(25632), - AOM_ICDF(27458), AOM_ICDF(29255), AOM_ICDF(31024), AOM_ICDF(32768) -}; -const aom_cdf_prob av1_cat6_cdf1[CDF_SIZE(16)] = { - AOM_ICDF(9314), AOM_ICDF(15584), AOM_ICDF(19741), AOM_ICDF(22540), - AOM_ICDF(25391), AOM_ICDF(27310), AOM_ICDF(28583), AOM_ICDF(29440), - AOM_ICDF(30493), AOM_ICDF(31202), AOM_ICDF(31672), AOM_ICDF(31988), - AOM_ICDF(32310), AOM_ICDF(32527), AOM_ICDF(32671), AOM_ICDF(32768) -}; -const aom_cdf_prob av1_cat6_cdf2[CDF_SIZE(16)] = { - AOM_ICDF(29548), AOM_ICDF(31129), AOM_ICDF(31960), AOM_ICDF(32004), - AOM_ICDF(32473), AOM_ICDF(32498), AOM_ICDF(32511), AOM_ICDF(32512), - AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768) -}; -const 
aom_cdf_prob av1_cat6_cdf3[CDF_SIZE(16)] = { - AOM_ICDF(32006), AOM_ICDF(32258), AOM_ICDF(32510), AOM_ICDF(32512), - AOM_ICDF(32638), AOM_ICDF(32639), AOM_ICDF(32640), AOM_ICDF(32641), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768) -}; -const aom_cdf_prob av1_cat6_cdf4[CDF_SIZE(4)] = { - AOM_ICDF(32513), AOM_ICDF(32641), AOM_ICDF(32767), AOM_ICDF(32768) -}; -const aom_cdf_prob *av1_cat6_cdf[] = { - av1_cat6_cdf0, av1_cat6_cdf1, av1_cat6_cdf2, av1_cat6_cdf3, av1_cat6_cdf4 -}; -#endif -/* Extra bits coded from MSB to LSB */ -const aom_prob av1_cat1_prob[] = { 159 }; -const aom_prob av1_cat2_prob[] = { 165, 145 }; -const aom_prob av1_cat3_prob[] = { 173, 148, 140 }; -const aom_prob av1_cat4_prob[] = { 176, 155, 140, 135 }; -const aom_prob av1_cat5_prob[] = { 180, 157, 141, 134, 130 }; -const aom_prob av1_cat6_prob[] = { - 255, 255, 255, 255, 254, 254, 254, 252, 249, - 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - -const uint16_t band_count_table[TX_SIZES_ALL][8] = { -#if CONFIG_CHROMA_2X2 - { 1, 2, 2, 3, 0, 0, 0 }, -#endif - { 1, 2, 3, 4, 3, 16 - 13, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -#if CONFIG_TX64X64 - { 1, 2, 3, 4, 11, 4096 - 21, 0 }, -#endif // CONFIG_TX64X64 - { 1, 2, 3, 4, 8, 32 - 18, 0 }, { 1, 2, 3, 4, 8, 32 - 18, 0 }, - { 1, 2, 3, 4, 11, 128 - 21, 0 }, { 1, 2, 3, 4, 11, 128 - 21, 0 }, - { 1, 2, 3, 4, 11, 512 - 21, 0 }, { 1, 2, 3, 4, 11, 512 - 21, 0 }, -#if CONFIG_TX64X64 - { 1, 2, 3, 4, 11, 2048 - 21, 0 }, { 1, 2, 3, 4, 11, 2048 - 21, 0 }, -#endif // CONFIG_TX64X64 - { 1, 2, 3, 4, 11, 64 - 21, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 256 - 21, 0 }, -}; - -const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = { -#if CONFIG_CHROMA_2X2 - { 0, 1, 3, 6, 10, 13, 16, 0 }, -#endif - { 0, 1, 3, 6, 10, 13, 16, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 
3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 }, -#if CONFIG_TX64X64 - { 0, 1, 3, 6, 10, 21, 4096, 0 }, -#endif // CONFIG_TX64X64 - { 0, 1, 3, 6, 10, 18, 32, 0 }, { 0, 1, 3, 6, 10, 18, 32, 0 }, - { 0, 1, 3, 6, 10, 21, 128, 0 }, { 0, 1, 3, 6, 10, 21, 128, 0 }, - { 0, 1, 3, 6, 10, 21, 512, 0 }, { 0, 1, 3, 6, 10, 21, 512, 0 }, -#if CONFIG_TX64X64 - { 0, 1, 3, 6, 10, 21, 2048, 0 }, { 0, 1, 3, 6, 10, 21, 2048, 0 }, -#endif // CONFIG_TX64X64 - { 0, 1, 3, 6, 10, 21, 64, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 256, 0 }, -}; -const uint8_t av1_coefband_trans_8x8plus[MAX_TX_SQUARE] = { - 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, - // beyond MAXBAND_INDEX+1 all values are filled as 5 - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -#if CONFIG_TX64X64 - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5 -#endif // CONFIG_TX64X64 -}; - -const uint8_t av1_coefband_trans_4x8_8x4[32] = { - 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, - 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -}; - -const uint8_t av1_coefband_trans_4x4[16] = { - 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, -}; - -const uint8_t av1_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, - 4, 5, 5, 5, 5, 5 }; - -// Model obtained from a 2-sided zero-centered distribution derived -// from a Pareto distribution. 
The cdf of the distribution is: -// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] -// -// For a given beta and a given probablity of the 1-node, the alpha -// is first solved, and then the {alpha, beta} pair is used to generate -// the probabilities for the rest of the nodes. - -// beta = 8 - -// Every odd line in this table can be generated from the even lines -// by averaging : -// av1_pareto8_full[l][node] = (av1_pareto8_full[l-1][node] + -// av1_pareto8_full[l+1][node] ) >> 1; -// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here. -const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { - { 3, 86, 128, 6, 86, 23, 88, 29 }, - { 6, 86, 128, 11, 87, 42, 91, 52 }, - { 9, 86, 129, 17, 88, 61, 94, 76 }, - { 12, 86, 129, 22, 88, 77, 97, 93 }, - { 15, 87, 129, 28, 89, 93, 100, 110 }, - { 17, 87, 129, 33, 90, 105, 103, 123 }, - { 20, 88, 130, 38, 91, 118, 106, 136 }, - { 23, 88, 130, 43, 91, 128, 108, 146 }, - { 26, 89, 131, 48, 92, 139, 111, 156 }, - { 28, 89, 131, 53, 93, 147, 114, 163 }, - { 31, 90, 131, 58, 94, 156, 117, 171 }, - { 34, 90, 131, 62, 94, 163, 119, 177 }, - { 37, 90, 132, 66, 95, 171, 122, 184 }, - { 39, 90, 132, 70, 96, 177, 124, 189 }, - { 42, 91, 132, 75, 97, 183, 127, 194 }, - { 44, 91, 132, 79, 97, 188, 129, 198 }, - { 47, 92, 133, 83, 98, 193, 132, 202 }, - { 49, 92, 133, 86, 99, 197, 134, 205 }, - { 52, 93, 133, 90, 100, 201, 137, 208 }, - { 54, 93, 133, 94, 100, 204, 139, 211 }, - { 57, 94, 134, 98, 101, 208, 142, 214 }, - { 59, 94, 134, 101, 102, 211, 144, 216 }, - { 62, 94, 135, 105, 103, 214, 146, 218 }, - { 64, 94, 135, 108, 103, 216, 148, 220 }, - { 66, 95, 135, 111, 104, 219, 151, 222 }, - { 68, 95, 135, 114, 105, 221, 153, 223 }, - { 71, 96, 136, 117, 106, 224, 155, 225 }, - { 73, 96, 136, 120, 106, 225, 157, 226 }, - { 76, 97, 136, 123, 107, 227, 159, 228 }, - { 78, 97, 136, 126, 108, 229, 160, 229 }, - { 80, 98, 137, 129, 109, 231, 162, 231 }, - { 82, 98, 137, 131, 109, 232, 164, 232 }, - 
{ 84, 98, 138, 134, 110, 234, 166, 233 }, - { 86, 98, 138, 137, 111, 235, 168, 234 }, - { 89, 99, 138, 140, 112, 236, 170, 235 }, - { 91, 99, 138, 142, 112, 237, 171, 235 }, - { 93, 100, 139, 145, 113, 238, 173, 236 }, - { 95, 100, 139, 147, 114, 239, 174, 237 }, - { 97, 101, 140, 149, 115, 240, 176, 238 }, - { 99, 101, 140, 151, 115, 241, 177, 238 }, - { 101, 102, 140, 154, 116, 242, 179, 239 }, - { 103, 102, 140, 156, 117, 242, 180, 239 }, - { 105, 103, 141, 158, 118, 243, 182, 240 }, - { 107, 103, 141, 160, 118, 243, 183, 240 }, - { 109, 104, 141, 162, 119, 244, 185, 241 }, - { 111, 104, 141, 164, 119, 244, 186, 241 }, - { 113, 104, 142, 166, 120, 245, 187, 242 }, - { 114, 104, 142, 168, 121, 245, 188, 242 }, - { 116, 105, 143, 170, 122, 246, 190, 243 }, - { 118, 105, 143, 171, 122, 246, 191, 243 }, - { 120, 106, 143, 173, 123, 247, 192, 244 }, - { 121, 106, 143, 175, 124, 247, 193, 244 }, - { 123, 107, 144, 177, 125, 248, 195, 244 }, - { 125, 107, 144, 178, 125, 248, 196, 244 }, - { 127, 108, 145, 180, 126, 249, 197, 245 }, - { 128, 108, 145, 181, 127, 249, 198, 245 }, - { 130, 109, 145, 183, 128, 249, 199, 245 }, - { 132, 109, 145, 184, 128, 249, 200, 245 }, - { 134, 110, 146, 186, 129, 250, 201, 246 }, - { 135, 110, 146, 187, 130, 250, 202, 246 }, - { 137, 111, 147, 189, 131, 251, 203, 246 }, - { 138, 111, 147, 190, 131, 251, 204, 246 }, - { 140, 112, 147, 192, 132, 251, 205, 247 }, - { 141, 112, 147, 193, 132, 251, 206, 247 }, - { 143, 113, 148, 194, 133, 251, 207, 247 }, - { 144, 113, 148, 195, 134, 251, 207, 247 }, - { 146, 114, 149, 197, 135, 252, 208, 248 }, - { 147, 114, 149, 198, 135, 252, 209, 248 }, - { 149, 115, 149, 199, 136, 252, 210, 248 }, - { 150, 115, 149, 200, 137, 252, 210, 248 }, - { 152, 115, 150, 201, 138, 252, 211, 248 }, - { 153, 115, 150, 202, 138, 252, 212, 248 }, - { 155, 116, 151, 204, 139, 253, 213, 249 }, - { 156, 116, 151, 205, 139, 253, 213, 249 }, - { 158, 117, 151, 206, 140, 253, 214, 249 }, - { 159, 117, 151, 207, 141, 253, 
215, 249 }, - { 161, 118, 152, 208, 142, 253, 216, 249 }, - { 162, 118, 152, 209, 142, 253, 216, 249 }, - { 163, 119, 153, 210, 143, 253, 217, 249 }, - { 164, 119, 153, 211, 143, 253, 217, 249 }, - { 166, 120, 153, 212, 144, 254, 218, 250 }, - { 167, 120, 153, 212, 145, 254, 219, 250 }, - { 168, 121, 154, 213, 146, 254, 220, 250 }, - { 169, 121, 154, 214, 146, 254, 220, 250 }, - { 171, 122, 155, 215, 147, 254, 221, 250 }, - { 172, 122, 155, 216, 147, 254, 221, 250 }, - { 173, 123, 155, 217, 148, 254, 222, 250 }, - { 174, 123, 155, 217, 149, 254, 222, 250 }, - { 176, 124, 156, 218, 150, 254, 223, 250 }, - { 177, 124, 156, 219, 150, 254, 223, 250 }, - { 178, 125, 157, 220, 151, 254, 224, 251 }, - { 179, 125, 157, 220, 151, 254, 224, 251 }, - { 180, 126, 157, 221, 152, 254, 225, 251 }, - { 181, 126, 157, 221, 152, 254, 225, 251 }, - { 183, 127, 158, 222, 153, 254, 226, 251 }, - { 184, 127, 158, 223, 154, 254, 226, 251 }, - { 185, 128, 159, 224, 155, 255, 227, 251 }, - { 186, 128, 159, 224, 155, 255, 227, 251 }, - { 187, 129, 160, 225, 156, 255, 228, 251 }, - { 188, 130, 160, 225, 156, 255, 228, 251 }, - { 189, 131, 160, 226, 157, 255, 228, 251 }, - { 190, 131, 160, 226, 158, 255, 228, 251 }, - { 191, 132, 161, 227, 159, 255, 229, 251 }, - { 192, 132, 161, 227, 159, 255, 229, 251 }, - { 193, 133, 162, 228, 160, 255, 230, 252 }, - { 194, 133, 162, 229, 160, 255, 230, 252 }, - { 195, 134, 163, 230, 161, 255, 231, 252 }, - { 196, 134, 163, 230, 161, 255, 231, 252 }, - { 197, 135, 163, 231, 162, 255, 231, 252 }, - { 198, 135, 163, 231, 162, 255, 231, 252 }, - { 199, 136, 164, 232, 163, 255, 232, 252 }, - { 200, 136, 164, 232, 164, 255, 232, 252 }, - { 201, 137, 165, 233, 165, 255, 233, 252 }, - { 201, 137, 165, 233, 165, 255, 233, 252 }, - { 202, 138, 166, 233, 166, 255, 233, 252 }, - { 203, 138, 166, 233, 166, 255, 233, 252 }, - { 204, 139, 166, 234, 167, 255, 234, 252 }, - { 205, 139, 166, 234, 167, 255, 234, 252 }, - { 206, 140, 167, 235, 168, 255, 235, 252 }, - { 206, 
140, 167, 235, 168, 255, 235, 252 }, - { 207, 141, 168, 236, 169, 255, 235, 252 }, - { 208, 141, 168, 236, 170, 255, 235, 252 }, - { 209, 142, 169, 237, 171, 255, 236, 252 }, - { 209, 143, 169, 237, 171, 255, 236, 252 }, - { 210, 144, 169, 237, 172, 255, 236, 252 }, - { 211, 144, 169, 237, 172, 255, 236, 252 }, - { 212, 145, 170, 238, 173, 255, 237, 252 }, - { 213, 145, 170, 238, 173, 255, 237, 252 }, - { 214, 146, 171, 239, 174, 255, 237, 253 }, - { 214, 146, 171, 239, 174, 255, 237, 253 }, - { 215, 147, 172, 240, 175, 255, 238, 253 }, - { 215, 147, 172, 240, 175, 255, 238, 253 }, - { 216, 148, 173, 240, 176, 255, 238, 253 }, - { 217, 148, 173, 240, 176, 255, 238, 253 }, - { 218, 149, 173, 241, 177, 255, 239, 253 }, - { 218, 149, 173, 241, 178, 255, 239, 253 }, - { 219, 150, 174, 241, 179, 255, 239, 253 }, - { 219, 151, 174, 241, 179, 255, 239, 253 }, - { 220, 152, 175, 242, 180, 255, 240, 253 }, - { 221, 152, 175, 242, 180, 255, 240, 253 }, - { 222, 153, 176, 242, 181, 255, 240, 253 }, - { 222, 153, 176, 242, 181, 255, 240, 253 }, - { 223, 154, 177, 243, 182, 255, 240, 253 }, - { 223, 154, 177, 243, 182, 255, 240, 253 }, - { 224, 155, 178, 244, 183, 255, 241, 253 }, - { 224, 155, 178, 244, 183, 255, 241, 253 }, - { 225, 156, 178, 244, 184, 255, 241, 253 }, - { 225, 157, 178, 244, 184, 255, 241, 253 }, - { 226, 158, 179, 244, 185, 255, 242, 253 }, - { 227, 158, 179, 244, 185, 255, 242, 253 }, - { 228, 159, 180, 245, 186, 255, 242, 253 }, - { 228, 159, 180, 245, 186, 255, 242, 253 }, - { 229, 160, 181, 245, 187, 255, 242, 253 }, - { 229, 160, 181, 245, 187, 255, 242, 253 }, - { 230, 161, 182, 246, 188, 255, 243, 253 }, - { 230, 162, 182, 246, 188, 255, 243, 253 }, - { 231, 163, 183, 246, 189, 255, 243, 253 }, - { 231, 163, 183, 246, 189, 255, 243, 253 }, - { 232, 164, 184, 247, 190, 255, 243, 253 }, - { 232, 164, 184, 247, 190, 255, 243, 253 }, - { 233, 165, 185, 247, 191, 255, 244, 253 }, - { 233, 165, 185, 247, 191, 255, 244, 253 }, - { 234, 166, 185, 247, 192, 
255, 244, 253 }, - { 234, 167, 185, 247, 192, 255, 244, 253 }, - { 235, 168, 186, 248, 193, 255, 244, 253 }, - { 235, 168, 186, 248, 193, 255, 244, 253 }, - { 236, 169, 187, 248, 194, 255, 244, 253 }, - { 236, 169, 187, 248, 194, 255, 244, 253 }, - { 236, 170, 188, 248, 195, 255, 245, 253 }, - { 236, 170, 188, 248, 195, 255, 245, 253 }, - { 237, 171, 189, 249, 196, 255, 245, 254 }, - { 237, 172, 189, 249, 196, 255, 245, 254 }, - { 238, 173, 190, 249, 197, 255, 245, 254 }, - { 238, 173, 190, 249, 197, 255, 245, 254 }, - { 239, 174, 191, 249, 198, 255, 245, 254 }, - { 239, 174, 191, 249, 198, 255, 245, 254 }, - { 240, 175, 192, 249, 199, 255, 246, 254 }, - { 240, 176, 192, 249, 199, 255, 246, 254 }, - { 240, 177, 193, 250, 200, 255, 246, 254 }, - { 240, 177, 193, 250, 200, 255, 246, 254 }, - { 241, 178, 194, 250, 201, 255, 246, 254 }, - { 241, 178, 194, 250, 201, 255, 246, 254 }, - { 242, 179, 195, 250, 202, 255, 246, 254 }, - { 242, 180, 195, 250, 202, 255, 246, 254 }, - { 242, 181, 196, 250, 203, 255, 247, 254 }, - { 242, 181, 196, 250, 203, 255, 247, 254 }, - { 243, 182, 197, 251, 204, 255, 247, 254 }, - { 243, 183, 197, 251, 204, 255, 247, 254 }, - { 244, 184, 198, 251, 205, 255, 247, 254 }, - { 244, 184, 198, 251, 205, 255, 247, 254 }, - { 244, 185, 199, 251, 206, 255, 247, 254 }, - { 244, 185, 199, 251, 206, 255, 247, 254 }, - { 245, 186, 200, 251, 207, 255, 247, 254 }, - { 245, 187, 200, 251, 207, 255, 247, 254 }, - { 246, 188, 201, 252, 207, 255, 248, 254 }, - { 246, 188, 201, 252, 207, 255, 248, 254 }, - { 246, 189, 202, 252, 208, 255, 248, 254 }, - { 246, 190, 202, 252, 208, 255, 248, 254 }, - { 247, 191, 203, 252, 209, 255, 248, 254 }, - { 247, 191, 203, 252, 209, 255, 248, 254 }, - { 247, 192, 204, 252, 210, 255, 248, 254 }, - { 247, 193, 204, 252, 210, 255, 248, 254 }, - { 248, 194, 205, 252, 211, 255, 248, 254 }, - { 248, 194, 205, 252, 211, 255, 248, 254 }, - { 248, 195, 206, 252, 212, 255, 249, 254 }, - { 248, 196, 206, 252, 212, 255, 249, 254 }, - { 
249, 197, 207, 253, 213, 255, 249, 254 }, - { 249, 197, 207, 253, 213, 255, 249, 254 }, - { 249, 198, 208, 253, 214, 255, 249, 254 }, - { 249, 199, 209, 253, 214, 255, 249, 254 }, - { 250, 200, 210, 253, 215, 255, 249, 254 }, - { 250, 200, 210, 253, 215, 255, 249, 254 }, - { 250, 201, 211, 253, 215, 255, 249, 254 }, - { 250, 202, 211, 253, 215, 255, 249, 254 }, - { 250, 203, 212, 253, 216, 255, 249, 254 }, - { 250, 203, 212, 253, 216, 255, 249, 254 }, - { 251, 204, 213, 253, 217, 255, 250, 254 }, - { 251, 205, 213, 253, 217, 255, 250, 254 }, - { 251, 206, 214, 254, 218, 255, 250, 254 }, - { 251, 206, 215, 254, 218, 255, 250, 254 }, - { 252, 207, 216, 254, 219, 255, 250, 254 }, - { 252, 208, 216, 254, 219, 255, 250, 254 }, - { 252, 209, 217, 254, 220, 255, 250, 254 }, - { 252, 210, 217, 254, 220, 255, 250, 254 }, - { 252, 211, 218, 254, 221, 255, 250, 254 }, - { 252, 212, 218, 254, 221, 255, 250, 254 }, - { 253, 213, 219, 254, 222, 255, 250, 254 }, - { 253, 213, 220, 254, 222, 255, 250, 254 }, - { 253, 214, 221, 254, 223, 255, 250, 254 }, - { 253, 215, 221, 254, 223, 255, 250, 254 }, - { 253, 216, 222, 254, 224, 255, 251, 254 }, - { 253, 217, 223, 254, 224, 255, 251, 254 }, - { 253, 218, 224, 254, 225, 255, 251, 254 }, - { 253, 219, 224, 254, 225, 255, 251, 254 }, - { 254, 220, 225, 254, 225, 255, 251, 254 }, - { 254, 221, 226, 254, 225, 255, 251, 254 }, - { 254, 222, 227, 255, 226, 255, 251, 254 }, - { 254, 223, 227, 255, 226, 255, 251, 254 }, - { 254, 224, 228, 255, 227, 255, 251, 254 }, - { 254, 225, 229, 255, 227, 255, 251, 254 }, - { 254, 226, 230, 255, 228, 255, 251, 254 }, - { 254, 227, 230, 255, 229, 255, 251, 254 }, - { 255, 228, 231, 255, 230, 255, 251, 254 }, - { 255, 229, 232, 255, 230, 255, 251, 254 }, - { 255, 230, 233, 255, 231, 255, 252, 254 }, - { 255, 231, 234, 255, 231, 255, 252, 254 }, - { 255, 232, 235, 255, 232, 255, 252, 254 }, - { 255, 233, 236, 255, 232, 255, 252, 254 }, - { 255, 235, 237, 255, 233, 255, 252, 254 }, - { 255, 236, 238, 255, 
234, 255, 252, 254 }, - { 255, 238, 240, 255, 235, 255, 252, 255 }, - { 255, 239, 241, 255, 235, 255, 252, 254 }, - { 255, 241, 243, 255, 236, 255, 252, 254 }, - { 255, 243, 245, 255, 237, 255, 252, 254 }, - { 255, 246, 247, 255, 239, 255, 253, 255 }, -}; - -// Model obtained from a 2-sided zero-centered distribution derived -// from a Pareto distribution. The cdf of the distribution is: -// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] -// -// For a given beta and a given probability of the 1-node, the alpha -// is first solved, and then the {alpha, beta} pair is used to generate -// the probabilities for the rest of the nodes. -// -// The full source code of the generating program is available in: -// tools/gen_constrained_tokenset.py -// -// Values for tokens TWO_TOKEN through CATEGORY6_TOKEN included -// in the table here : the ONE_TOKEN probability is -// removed and the probabilities rescaled. -// -// ZERO_TOKEN and ONE_TOKEN are coded as one CDF, -// and EOB_TOKEN is coded as flags outside this coder. 
-const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS][TAIL_NODES] = { - { 128, 127, 127, 252, 497, 969, 1839, 3318, 25511 }, - { 256, 254, 251, 496, 966, 1834, 3308, 5408, 19995 }, - { 383, 378, 373, 732, 1408, 2605, 4470, 6646, 15773 }, - { 511, 502, 493, 961, 1824, 3289, 5373, 7298, 12517 }, - { 638, 625, 611, 1182, 2215, 3894, 6064, 7548, 9991 }, - { 766, 746, 726, 1396, 2582, 4428, 6578, 7529, 8017 }, - { 893, 866, 839, 1603, 2927, 4896, 6945, 7332, 6467 }, - { 1020, 984, 950, 1803, 3250, 5305, 7191, 7022, 5243 }, - { 1147, 1102, 1059, 1996, 3552, 5659, 7338, 6646, 4269 }, - { 1274, 1218, 1166, 2183, 3835, 5963, 7403, 6234, 3492 }, - { 1400, 1334, 1270, 2363, 4099, 6223, 7401, 5809, 2869 }, - { 1527, 1447, 1372, 2537, 4345, 6442, 7346, 5386, 2366 }, - { 1654, 1560, 1473, 2704, 4574, 6624, 7247, 4973, 1959 }, - { 1780, 1672, 1571, 2866, 4787, 6771, 7114, 4579, 1628 }, - { 1906, 1782, 1667, 3022, 4984, 6889, 6954, 4206, 1358 }, - { 2032, 1891, 1762, 3172, 5167, 6979, 6773, 3856, 1136 }, - { 2158, 2000, 1854, 3316, 5335, 7044, 6577, 3530, 954 }, - { 2284, 2106, 1944, 3455, 5490, 7087, 6370, 3229, 803 }, - { 2410, 2212, 2032, 3588, 5632, 7109, 6155, 2951, 679 }, - { 2535, 2317, 2119, 3717, 5761, 7113, 5936, 2695, 575 }, - { 2661, 2420, 2203, 3840, 5880, 7101, 5714, 2461, 488 }, - { 2786, 2522, 2286, 3958, 5987, 7074, 5493, 2246, 416 }, - { 2911, 2624, 2367, 4072, 6083, 7033, 5273, 2050, 355 }, - { 3037, 2724, 2446, 4180, 6170, 6981, 5055, 1871, 304 }, - { 3162, 2822, 2523, 4284, 6247, 6919, 4842, 1708, 261 }, - { 3286, 2920, 2599, 4384, 6315, 6848, 4633, 1559, 224 }, - { 3411, 3017, 2672, 4478, 6374, 6768, 4430, 1424, 194 }, - { 3536, 3112, 2745, 4569, 6426, 6681, 4232, 1300, 167 }, - { 3660, 3207, 2815, 4656, 6469, 6588, 4040, 1188, 145 }, - { 3785, 3300, 2883, 4738, 6505, 6490, 3855, 1086, 126 }, - { 3909, 3392, 2950, 4817, 6534, 6387, 3677, 993, 109 }, - { 4033, 3483, 3015, 4891, 6557, 6281, 3505, 908, 95 }, - { 4157, 3573, 3079, 4962, 6573, 6170, 3340, 
831, 83 }, - { 4281, 3662, 3141, 5029, 6584, 6058, 3181, 760, 72 }, - { 4405, 3750, 3201, 5093, 6588, 5943, 3029, 696, 63 }, - { 4529, 3837, 3260, 5152, 6587, 5826, 2883, 638, 56 }, - { 4652, 3922, 3317, 5209, 6582, 5709, 2744, 584, 49 }, - { 4775, 4007, 3373, 5262, 6572, 5590, 2610, 536, 43 }, - { 4899, 4090, 3427, 5312, 6557, 5470, 2483, 492, 38 }, - { 5022, 4173, 3480, 5359, 6538, 5351, 2361, 451, 33 }, - { 5145, 4254, 3531, 5403, 6515, 5231, 2246, 414, 29 }, - { 5268, 4334, 3581, 5443, 6489, 5112, 2135, 380, 26 }, - { 5391, 4414, 3629, 5481, 6458, 4993, 2029, 350, 23 }, - { 5514, 4492, 3676, 5515, 6425, 4875, 1929, 321, 21 }, - { 5637, 4569, 3721, 5548, 6388, 4758, 1833, 296, 18 }, - { 5759, 4645, 3766, 5577, 6349, 4642, 1742, 272, 16 }, - { 5881, 4720, 3808, 5604, 6307, 4528, 1656, 250, 14 }, - { 6004, 4794, 3849, 5628, 6262, 4414, 1573, 231, 13 }, - { 6126, 4867, 3890, 5649, 6215, 4302, 1495, 213, 11 }, - { 6248, 4939, 3928, 5669, 6166, 4192, 1420, 196, 10 }, - { 6370, 5010, 3966, 5686, 6114, 4083, 1349, 181, 9 }, - { 6492, 5080, 4002, 5700, 6061, 3976, 1282, 167, 8 }, - { 6614, 5149, 4037, 5712, 6006, 3871, 1218, 154, 7 }, - { 6735, 5217, 4070, 5723, 5950, 3767, 1157, 142, 7 }, - { 6857, 5284, 4103, 5731, 5891, 3666, 1099, 131, 6 }, - { 6978, 5351, 4134, 5737, 5832, 3566, 1044, 121, 5 }, - { 7099, 5415, 4164, 5741, 5771, 3469, 992, 112, 5 }, - { 7221, 5479, 4192, 5743, 5709, 3373, 943, 104, 4 }, - { 7342, 5542, 4220, 5743, 5646, 3279, 896, 96, 4 }, - { 7462, 5604, 4246, 5742, 5583, 3187, 851, 89, 4 }, - { 7584, 5665, 4272, 5739, 5518, 3097, 808, 82, 3 }, - { 7704, 5725, 4296, 5734, 5453, 3009, 768, 76, 3 }, - { 7825, 5784, 4318, 5727, 5386, 2924, 730, 71, 3 }, - { 7945, 5843, 4341, 5719, 5320, 2840, 693, 65, 2 }, - { 8066, 5900, 4361, 5709, 5252, 2758, 659, 61, 2 }, - { 8186, 5956, 4381, 5698, 5185, 2678, 626, 56, 2 }, - { 8306, 6011, 4400, 5685, 5117, 2600, 595, 52, 2 }, - { 8426, 6066, 4418, 5671, 5049, 2523, 565, 48, 2 }, - { 8547, 6119, 4434, 5655, 4981, 
2449, 537, 45, 1 }, - { 8666, 6171, 4450, 5638, 4912, 2377, 511, 42, 1 }, - { 8786, 6223, 4465, 5620, 4843, 2306, 485, 39, 1 }, - { 8906, 6274, 4478, 5600, 4775, 2237, 461, 36, 1 }, - { 9025, 6323, 4491, 5580, 4706, 2170, 438, 34, 1 }, - { 9144, 6372, 4503, 5558, 4637, 2105, 417, 31, 1 }, - { 9264, 6420, 4514, 5535, 4568, 2041, 396, 29, 1 }, - { 9383, 6467, 4524, 5511, 4500, 1979, 376, 27, 1 }, - { 9502, 6513, 4532, 5486, 4432, 1919, 358, 25, 1 }, - { 9621, 6558, 4541, 5460, 4364, 1860, 340, 23, 1 }, - { 9740, 6602, 4548, 5433, 4296, 1803, 323, 22, 1 }, - { 9859, 6645, 4554, 5405, 4229, 1748, 307, 20, 1 }, - { 9978, 6688, 4559, 5376, 4161, 1694, 292, 19, 1 }, - { 10096, 6729, 4564, 5347, 4094, 1641, 278, 18, 1 }, - { 10215, 6770, 4568, 5316, 4028, 1590, 264, 16, 1 }, - { 10333, 6809, 4571, 5285, 3962, 1541, 251, 15, 1 }, - { 10452, 6848, 4573, 5253, 3896, 1492, 239, 14, 1 }, - { 10570, 6886, 4574, 5220, 3831, 1446, 227, 13, 1 }, - { 10688, 6923, 4575, 5186, 3767, 1400, 216, 12, 1 }, - { 10806, 6959, 4575, 5152, 3702, 1356, 205, 12, 1 }, - { 10924, 6994, 4574, 5117, 3639, 1313, 195, 11, 1 }, - { 11041, 7029, 4572, 5082, 3576, 1271, 186, 10, 1 }, - { 11159, 7062, 4570, 5046, 3513, 1231, 177, 9, 1 }, - { 11277, 7095, 4566, 5009, 3451, 1192, 168, 9, 1 }, - { 11394, 7127, 4563, 4972, 3390, 1153, 160, 8, 1 }, - { 11512, 7158, 4558, 4934, 3329, 1116, 152, 8, 1 }, - { 11629, 7188, 4553, 4896, 3269, 1080, 145, 7, 1 }, - { 11746, 7217, 4547, 4857, 3210, 1045, 138, 7, 1 }, - { 11864, 7245, 4540, 4818, 3151, 1012, 131, 6, 1 }, - { 11980, 7273, 4533, 4779, 3093, 979, 124, 6, 1 }, - { 12097, 7300, 4525, 4739, 3035, 947, 118, 6, 1 }, - { 12215, 7326, 4516, 4698, 2978, 916, 113, 5, 1 }, - { 12331, 7351, 4507, 4658, 2922, 886, 107, 5, 1 }, - { 12448, 7375, 4497, 4617, 2866, 857, 102, 5, 1 }, - { 12564, 7398, 4487, 4576, 2812, 829, 97, 4, 1 }, - { 12681, 7421, 4476, 4534, 2757, 802, 92, 4, 1 }, - { 12797, 7443, 4464, 4492, 2704, 775, 88, 4, 1 }, - { 12914, 7464, 4452, 4450, 2651, 
749, 84, 3, 1 }, - { 13030, 7484, 4439, 4408, 2599, 725, 79, 3, 1 }, - { 13147, 7503, 4426, 4365, 2547, 700, 76, 3, 1 }, - { 13262, 7522, 4412, 4322, 2497, 677, 72, 3, 1 }, - { 13378, 7539, 4398, 4280, 2447, 654, 68, 3, 1 }, - { 13494, 7556, 4383, 4237, 2397, 632, 65, 3, 1 }, - { 13610, 7573, 4368, 4193, 2348, 611, 62, 2, 1 }, - { 13726, 7588, 4352, 4150, 2300, 590, 59, 2, 1 }, - { 13841, 7602, 4335, 4107, 2253, 571, 56, 2, 1 }, - { 13957, 7616, 4318, 4063, 2207, 551, 53, 2, 1 }, - { 14072, 7629, 4301, 4019, 2161, 532, 51, 2, 1 }, - { 14188, 7641, 4283, 3976, 2115, 514, 48, 2, 1 }, - { 14302, 7652, 4265, 3932, 2071, 497, 46, 2, 1 }, - { 14418, 7663, 4246, 3888, 2027, 480, 44, 1, 1 }, - { 14533, 7673, 4227, 3844, 1984, 463, 42, 1, 1 }, - { 14649, 7682, 4207, 3800, 1941, 447, 40, 1, 1 }, - { 14763, 7690, 4187, 3757, 1899, 432, 38, 1, 1 }, - { 14878, 7698, 4166, 3713, 1858, 417, 36, 1, 1 }, - { 14993, 7705, 4146, 3669, 1817, 402, 34, 1, 1 }, - { 15109, 7711, 4124, 3625, 1777, 388, 32, 1, 1 }, - { 15223, 7715, 4103, 3581, 1738, 375, 31, 1, 1 }, - { 15337, 7720, 4081, 3538, 1699, 362, 29, 1, 1 }, - { 15452, 7724, 4058, 3494, 1661, 349, 28, 1, 1 }, - { 15567, 7727, 4035, 3450, 1624, 337, 26, 1, 1 }, - { 15681, 7729, 4012, 3407, 1587, 325, 25, 1, 1 }, - { 15795, 7730, 3989, 3364, 1551, 313, 24, 1, 1 }, - { 15909, 7731, 3965, 3320, 1516, 302, 23, 1, 1 }, - { 16024, 7731, 3940, 3277, 1481, 291, 22, 1, 1 }, - { 16138, 7730, 3916, 3234, 1446, 281, 21, 1, 1 }, - { 16252, 7728, 3891, 3191, 1413, 271, 20, 1, 1 }, - { 16366, 7726, 3866, 3148, 1380, 261, 19, 1, 1 }, - { 16480, 7723, 3840, 3106, 1347, 252, 18, 1, 1 }, - { 16594, 7720, 3814, 3063, 1315, 243, 17, 1, 1 }, - { 16708, 7715, 3788, 3021, 1284, 234, 16, 1, 1 }, - { 16822, 7710, 3762, 2979, 1253, 225, 15, 1, 1 }, - { 16936, 7704, 3735, 2937, 1223, 217, 14, 1, 1 }, - { 17050, 7697, 3708, 2895, 1193, 209, 14, 1, 1 }, - { 17162, 7690, 3681, 2854, 1164, 202, 13, 1, 1 }, - { 17276, 7682, 3654, 2812, 1136, 194, 12, 1, 1 }, - { 
17389, 7673, 3626, 2771, 1108, 187, 12, 1, 1 }, - { 17504, 7663, 3598, 2730, 1080, 180, 11, 1, 1 }, - { 17617, 7653, 3570, 2689, 1053, 173, 11, 1, 1 }, - { 17730, 7642, 3541, 2649, 1027, 167, 10, 1, 1 }, - { 17843, 7630, 3513, 2608, 1001, 161, 10, 1, 1 }, - { 17957, 7618, 3484, 2569, 975, 154, 9, 1, 1 }, - { 18069, 7605, 3455, 2529, 950, 149, 9, 1, 1 }, - { 18183, 7591, 3426, 2489, 926, 143, 8, 1, 1 }, - { 18296, 7576, 3396, 2450, 902, 138, 8, 1, 1 }, - { 18410, 7562, 3366, 2411, 878, 132, 7, 1, 1 }, - { 18523, 7545, 3337, 2372, 855, 127, 7, 1, 1 }, - { 18636, 7529, 3306, 2333, 833, 122, 7, 1, 1 }, - { 18749, 7511, 3276, 2295, 811, 118, 6, 1, 1 }, - { 18862, 7493, 3246, 2257, 789, 113, 6, 1, 1 }, - { 18975, 7474, 3215, 2219, 768, 109, 6, 1, 1 }, - { 19088, 7455, 3185, 2182, 747, 104, 5, 1, 1 }, - { 19201, 7435, 3154, 2144, 727, 100, 5, 1, 1 }, - { 19314, 7414, 3123, 2107, 707, 96, 5, 1, 1 }, - { 19427, 7392, 3092, 2071, 687, 92, 5, 1, 1 }, - { 19541, 7370, 3060, 2034, 668, 89, 4, 1, 1 }, - { 19654, 7347, 3029, 1998, 649, 85, 4, 1, 1 }, - { 19766, 7323, 2997, 1963, 631, 82, 4, 1, 1 }, - { 19878, 7299, 2966, 1927, 613, 79, 4, 1, 1 }, - { 19991, 7274, 2934, 1892, 596, 75, 4, 1, 1 }, - { 20105, 7248, 2902, 1857, 579, 72, 3, 1, 1 }, - { 20218, 7222, 2870, 1822, 562, 69, 3, 1, 1 }, - { 20331, 7195, 2838, 1788, 545, 66, 3, 1, 1 }, - { 20443, 7167, 2806, 1754, 529, 64, 3, 1, 1 }, - { 20556, 7138, 2774, 1720, 514, 61, 3, 1, 1 }, - { 20670, 7109, 2741, 1687, 498, 58, 3, 1, 1 }, - { 20783, 7079, 2709, 1654, 483, 56, 2, 1, 1 }, - { 20895, 7049, 2676, 1621, 469, 54, 2, 1, 1 }, - { 21008, 7017, 2644, 1589, 455, 51, 2, 1, 1 }, - { 21121, 6985, 2611, 1557, 441, 49, 2, 1, 1 }, - { 21234, 6953, 2578, 1525, 427, 47, 2, 1, 1 }, - { 21347, 6919, 2545, 1494, 414, 45, 2, 1, 1 }, - { 21460, 6885, 2513, 1462, 401, 43, 2, 1, 1 }, - { 21573, 6850, 2480, 1432, 388, 41, 2, 1, 1 }, - { 21687, 6815, 2447, 1401, 375, 39, 2, 1, 1 }, - { 21801, 6778, 2414, 1371, 363, 38, 1, 1, 1 }, - { 21914, 6741, 
2381, 1341, 352, 36, 1, 1, 1 }, - { 22028, 6704, 2348, 1311, 340, 34, 1, 1, 1 }, - { 22141, 6665, 2315, 1282, 329, 33, 1, 1, 1 }, - { 22255, 6626, 2282, 1253, 318, 31, 1, 1, 1 }, - { 22368, 6586, 2249, 1225, 307, 30, 1, 1, 1 }, - { 22482, 6546, 2216, 1196, 297, 28, 1, 1, 1 }, - { 22595, 6505, 2183, 1169, 286, 27, 1, 1, 1 }, - { 22709, 6463, 2149, 1141, 277, 26, 1, 1, 1 }, - { 22823, 6420, 2116, 1114, 267, 25, 1, 1, 1 }, - { 22938, 6377, 2083, 1087, 257, 23, 1, 1, 1 }, - { 23053, 6332, 2050, 1060, 248, 22, 1, 1, 1 }, - { 23167, 6287, 2017, 1034, 239, 21, 1, 1, 1 }, - { 23280, 6242, 1984, 1008, 231, 20, 1, 1, 1 }, - { 23396, 6195, 1951, 982, 222, 19, 1, 1, 1 }, - { 23510, 6148, 1918, 957, 214, 18, 1, 1, 1 }, - { 23625, 6100, 1885, 932, 206, 17, 1, 1, 1 }, - { 23741, 6051, 1852, 907, 198, 16, 1, 1, 1 }, - { 23855, 6002, 1819, 883, 190, 16, 1, 1, 1 }, - { 23971, 5951, 1786, 859, 183, 15, 1, 1, 1 }, - { 24087, 5900, 1753, 835, 176, 14, 1, 1, 1 }, - { 24203, 5848, 1720, 812, 169, 13, 1, 1, 1 }, - { 24318, 5796, 1687, 789, 162, 13, 1, 1, 1 }, - { 24435, 5742, 1655, 766, 155, 12, 1, 1, 1 }, - { 24552, 5688, 1622, 743, 149, 11, 1, 1, 1 }, - { 24669, 5632, 1589, 721, 143, 11, 1, 1, 1 }, - { 24786, 5576, 1557, 699, 137, 10, 1, 1, 1 }, - { 24903, 5519, 1524, 678, 131, 10, 1, 1, 1 }, - { 25021, 5462, 1491, 657, 125, 9, 1, 1, 1 }, - { 25139, 5403, 1459, 636, 120, 8, 1, 1, 1 }, - { 25258, 5343, 1427, 615, 114, 8, 1, 1, 1 }, - { 25376, 5283, 1394, 595, 109, 8, 1, 1, 1 }, - { 25496, 5221, 1362, 575, 104, 7, 1, 1, 1 }, - { 25614, 5159, 1330, 556, 99, 7, 1, 1, 1 }, - { 25735, 5096, 1298, 536, 94, 6, 1, 1, 1 }, - { 25856, 5031, 1265, 517, 90, 6, 1, 1, 1 }, - { 25977, 4966, 1233, 499, 85, 5, 1, 1, 1 }, - { 26098, 4899, 1202, 480, 81, 5, 1, 1, 1 }, - { 26220, 4831, 1170, 462, 77, 5, 1, 1, 1 }, - { 26343, 4763, 1138, 444, 73, 4, 1, 1, 1 }, - { 26466, 4693, 1106, 427, 69, 4, 1, 1, 1 }, - { 26589, 4622, 1075, 410, 65, 4, 1, 1, 1 }, - { 26713, 4550, 1043, 393, 62, 4, 1, 1, 1 }, - { 26840, 
4476, 1012, 376, 58, 3, 1, 1, 1 }, - { 26966, 4401, 980, 360, 55, 3, 1, 1, 1 }, - { 27092, 4325, 949, 344, 52, 3, 1, 1, 1 }, - { 27220, 4248, 918, 328, 48, 3, 1, 1, 1 }, - { 27350, 4169, 886, 313, 45, 2, 1, 1, 1 }, - { 27480, 4088, 855, 298, 42, 2, 1, 1, 1 }, - { 27610, 4006, 824, 283, 40, 2, 1, 1, 1 }, - { 27743, 3922, 793, 268, 37, 2, 1, 1, 1 }, - { 27876, 3837, 762, 254, 34, 2, 1, 1, 1 }, - { 28011, 3749, 731, 240, 32, 2, 1, 1, 1 }, - { 28147, 3659, 701, 227, 30, 1, 1, 1, 1 }, - { 28286, 3568, 670, 213, 27, 1, 1, 1, 1 }, - { 28426, 3474, 639, 200, 25, 1, 1, 1, 1 }, - { 28569, 3377, 608, 187, 23, 1, 1, 1, 1 }, - { 28714, 3278, 577, 174, 21, 1, 1, 1, 1 }, - { 28860, 3176, 547, 162, 19, 1, 1, 1, 1 }, - { 29010, 3071, 516, 150, 17, 1, 1, 1, 1 }, - { 29163, 2962, 485, 138, 16, 1, 1, 1, 1 }, - { 29320, 2849, 454, 127, 14, 1, 1, 1, 1 }, - { 29483, 2731, 423, 115, 12, 1, 1, 1, 1 }, - { 29650, 2608, 391, 104, 11, 1, 1, 1, 1 }, - { 29823, 2479, 360, 93, 9, 1, 1, 1, 1 }, - { 30002, 2343, 328, 83, 8, 1, 1, 1, 1 }, - { 30192, 2198, 295, 72, 7, 1, 1, 1, 1 }, - { 30393, 2041, 262, 62, 6, 1, 1, 1, 1 }, - { 30612, 1869, 227, 52, 4, 1, 1, 1, 1 }, - { 30853, 1676, 191, 41, 3, 1, 1, 1, 1 }, - { 31131, 1448, 152, 31, 2, 1, 1, 1, 1 }, - { 31486, 1150, 107, 20, 1, 1, 1, 1, 1 }, -}; - -#if !CONFIG_Q_ADAPT_PROBS -static const coeff_cdf_model default_coef_head_cdf_4x4[PLANE_TYPES] = { - { // Y plane - { // Intra - { // Band 0 - { AOM_ICDF(25024), AOM_ICDF(25863), AOM_ICDF(27361), AOM_ICDF(29796), - AOM_ICDF(30374), AOM_ICDF(32768) }, - { AOM_ICDF(10816), AOM_ICDF(14127), AOM_ICDF(17116), AOM_ICDF(23516), - AOM_ICDF(24999), AOM_ICDF(32768) }, - { AOM_ICDF(1088), AOM_ICDF(6358), AOM_ICDF(8428), AOM_ICDF(16648), - AOM_ICDF(18276), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(14529), AOM_ICDF(18769), AOM_ICDF(29100), AOM_ICDF(29634), - AOM_ICDF(32768) }, - { AOM_ICDF(12993), AOM_ICDF(17117), AOM_ICDF(28404), AOM_ICDF(28988), - AOM_ICDF(32768) }, - { AOM_ICDF(11201), AOM_ICDF(14084), 
AOM_ICDF(25818), AOM_ICDF(26504), - AOM_ICDF(32768) }, - { AOM_ICDF(9793), AOM_ICDF(11267), AOM_ICDF(21775), AOM_ICDF(22451), - AOM_ICDF(32768) }, - { AOM_ICDF(7105), AOM_ICDF(7562), AOM_ICDF(15777), AOM_ICDF(16225), - AOM_ICDF(32768) }, - { AOM_ICDF(3905), AOM_ICDF(3966), AOM_ICDF(8359), AOM_ICDF(8526), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(20033), AOM_ICDF(23643), AOM_ICDF(31102), AOM_ICDF(31374), - AOM_ICDF(32768) }, - { AOM_ICDF(16321), AOM_ICDF(20350), AOM_ICDF(30167), AOM_ICDF(30546), - AOM_ICDF(32768) }, - { AOM_ICDF(12993), AOM_ICDF(15512), AOM_ICDF(26859), AOM_ICDF(27396), - AOM_ICDF(32768) }, - { AOM_ICDF(10305), AOM_ICDF(11659), AOM_ICDF(21669), AOM_ICDF(22330), - AOM_ICDF(32768) }, - { AOM_ICDF(7361), AOM_ICDF(7819), AOM_ICDF(15450), AOM_ICDF(15940), - AOM_ICDF(32768) }, - { AOM_ICDF(3521), AOM_ICDF(3580), AOM_ICDF(7805), AOM_ICDF(7976), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(21057), AOM_ICDF(25460), AOM_ICDF(31740), AOM_ICDF(31952), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(21173), AOM_ICDF(30761), AOM_ICDF(31092), - AOM_ICDF(32768) }, - { AOM_ICDF(11841), AOM_ICDF(14615), AOM_ICDF(26188), AOM_ICDF(26824), - AOM_ICDF(32768) }, - { AOM_ICDF(7745), AOM_ICDF(8991), AOM_ICDF(18937), AOM_ICDF(19707), - AOM_ICDF(32768) }, - { AOM_ICDF(4417), AOM_ICDF(4706), AOM_ICDF(10342), AOM_ICDF(10890), - AOM_ICDF(32768) }, - { AOM_ICDF(7617), AOM_ICDF(8392), AOM_ICDF(17295), AOM_ICDF(17915), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(20417), AOM_ICDF(26452), AOM_ICDF(32166), AOM_ICDF(32321), - AOM_ICDF(32768) }, - { AOM_ICDF(15809), AOM_ICDF(21634), AOM_ICDF(30947), AOM_ICDF(31298), - AOM_ICDF(32768) }, - { AOM_ICDF(10049), AOM_ICDF(12176), AOM_ICDF(23495), AOM_ICDF(24229), - AOM_ICDF(32768) }, - { AOM_ICDF(5953), AOM_ICDF(6731), AOM_ICDF(16166), AOM_ICDF(16798), - AOM_ICDF(32768) }, - { AOM_ICDF(6081), AOM_ICDF(6188), AOM_ICDF(8114), AOM_ICDF(8764), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2291), AOM_ICDF(4448), 
AOM_ICDF(5527), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(9153), AOM_ICDF(25905), AOM_ICDF(31431), AOM_ICDF(31934), - AOM_ICDF(32768) }, - { AOM_ICDF(9025), AOM_ICDF(23345), AOM_ICDF(30033), AOM_ICDF(30965), - AOM_ICDF(32768) }, - { AOM_ICDF(5953), AOM_ICDF(13835), AOM_ICDF(22032), AOM_ICDF(24664), - AOM_ICDF(32768) }, - { AOM_ICDF(6337), AOM_ICDF(11435), AOM_ICDF(18366), AOM_ICDF(21418), - AOM_ICDF(32768) }, - { AOM_ICDF(3137), AOM_ICDF(4871), AOM_ICDF(8519), AOM_ICDF(12426), - AOM_ICDF(32768) }, - { AOM_ICDF(1857), AOM_ICDF(2727), AOM_ICDF(5540), AOM_ICDF(8757), - AOM_ICDF(32768) } } }, - { // Intra - { // Band 0 - { AOM_ICDF(24512), AOM_ICDF(26673), AOM_ICDF(28962), AOM_ICDF(31929), - AOM_ICDF(32126), AOM_ICDF(32768) }, - { AOM_ICDF(15936), AOM_ICDF(21711), AOM_ICDF(25569), AOM_ICDF(30899), - AOM_ICDF(31305), AOM_ICDF(32768) }, - { AOM_ICDF(3264), AOM_ICDF(14756), AOM_ICDF(20107), AOM_ICDF(29407), - AOM_ICDF(30032), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(21313), AOM_ICDF(26020), AOM_ICDF(32523), AOM_ICDF(32575), - AOM_ICDF(32768) }, - { AOM_ICDF(18369), AOM_ICDF(24215), AOM_ICDF(32291), AOM_ICDF(32391), - AOM_ICDF(32768) }, - { AOM_ICDF(15297), AOM_ICDF(19637), AOM_ICDF(30414), AOM_ICDF(30752), - AOM_ICDF(32768) }, - { AOM_ICDF(11713), AOM_ICDF(14040), AOM_ICDF(25408), AOM_ICDF(26033), - AOM_ICDF(32768) }, - { AOM_ICDF(9537), AOM_ICDF(10173), AOM_ICDF(18839), AOM_ICDF(19315), - AOM_ICDF(32768) }, - { AOM_ICDF(9025), AOM_ICDF(9093), AOM_ICDF(13987), AOM_ICDF(14115), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(22721), AOM_ICDF(27599), AOM_ICDF(32592), AOM_ICDF(32636), - AOM_ICDF(32768) }, - { AOM_ICDF(19009), AOM_ICDF(24676), AOM_ICDF(32258), AOM_ICDF(32367), - AOM_ICDF(32768) }, - { AOM_ICDF(12737), AOM_ICDF(16769), AOM_ICDF(28739), AOM_ICDF(29247), - AOM_ICDF(32768) }, - { AOM_ICDF(8769), AOM_ICDF(10956), AOM_ICDF(21941), AOM_ICDF(22840), - AOM_ICDF(32768) }, - { AOM_ICDF(6721), AOM_ICDF(7678), AOM_ICDF(15319), AOM_ICDF(16290), - 
AOM_ICDF(32768) }, - { AOM_ICDF(4417), AOM_ICDF(4430), AOM_ICDF(4583), AOM_ICDF(5712), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(22849), AOM_ICDF(28333), AOM_ICDF(32633), AOM_ICDF(32671), - AOM_ICDF(32768) }, - { AOM_ICDF(18497), AOM_ICDF(24619), AOM_ICDF(32184), AOM_ICDF(32315), - AOM_ICDF(32768) }, - { AOM_ICDF(11841), AOM_ICDF(14640), AOM_ICDF(27251), AOM_ICDF(27752), - AOM_ICDF(32768) }, - { AOM_ICDF(8385), AOM_ICDF(10154), AOM_ICDF(18339), AOM_ICDF(19621), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(6977), AOM_ICDF(13787), AOM_ICDF(15289), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(20417), AOM_ICDF(28167), AOM_ICDF(32552), AOM_ICDF(32621), - AOM_ICDF(32768) }, - { AOM_ICDF(16833), AOM_ICDF(23968), AOM_ICDF(31991), AOM_ICDF(32174), - AOM_ICDF(32768) }, - { AOM_ICDF(10433), AOM_ICDF(13387), AOM_ICDF(26356), AOM_ICDF(26951), - AOM_ICDF(32768) }, - { AOM_ICDF(5057), AOM_ICDF(6823), AOM_ICDF(18967), AOM_ICDF(19843), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(6479), AOM_ICDF(11672), AOM_ICDF(13052), - AOM_ICDF(32768) }, - { AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(12097), AOM_ICDF(28717), AOM_ICDF(32406), AOM_ICDF(32555), - AOM_ICDF(32768) }, - { AOM_ICDF(10433), AOM_ICDF(26113), AOM_ICDF(31504), AOM_ICDF(31975), - AOM_ICDF(32768) }, - { AOM_ICDF(5825), AOM_ICDF(14284), AOM_ICDF(21349), AOM_ICDF(24461), - AOM_ICDF(32768) }, - { AOM_ICDF(4545), AOM_ICDF(8454), AOM_ICDF(12648), AOM_ICDF(17501), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(7173), AOM_ICDF(15272), AOM_ICDF(19322), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377), - AOM_ICDF(32768) } } } }, - { // UV plane - { // Inter - { // Band 0 - { AOM_ICDF(27456), AOM_ICDF(28244), AOM_ICDF(31289), AOM_ICDF(32358), - AOM_ICDF(32534), AOM_ICDF(32768) }, - { AOM_ICDF(16960), AOM_ICDF(21207), 
AOM_ICDF(26511), AOM_ICDF(30539), - AOM_ICDF(31190), AOM_ICDF(32768) }, - { AOM_ICDF(5440), AOM_ICDF(13412), AOM_ICDF(18469), AOM_ICDF(26423), - AOM_ICDF(27669), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(17857), AOM_ICDF(26327), AOM_ICDF(31983), AOM_ICDF(32219), - AOM_ICDF(32768) }, - { AOM_ICDF(16065), AOM_ICDF(24198), AOM_ICDF(31431), AOM_ICDF(31785), - AOM_ICDF(32768) }, - { AOM_ICDF(12865), AOM_ICDF(18011), AOM_ICDF(28454), AOM_ICDF(29166), - AOM_ICDF(32768) }, - { AOM_ICDF(9665), AOM_ICDF(12501), AOM_ICDF(24331), AOM_ICDF(25147), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(3121), AOM_ICDF(12661), AOM_ICDF(13034), - AOM_ICDF(32768) }, - { AOM_ICDF(4033), AOM_ICDF(4140), AOM_ICDF(11834), AOM_ICDF(11977), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(21185), AOM_ICDF(28338), AOM_ICDF(32249), AOM_ICDF(32417), - AOM_ICDF(32768) }, - { AOM_ICDF(18497), AOM_ICDF(25227), AOM_ICDF(31905), AOM_ICDF(32122), - AOM_ICDF(32768) }, - { AOM_ICDF(12097), AOM_ICDF(16516), AOM_ICDF(28610), AOM_ICDF(29166), - AOM_ICDF(32768) }, - { AOM_ICDF(9281), AOM_ICDF(11157), AOM_ICDF(21438), AOM_ICDF(22312), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(6566), AOM_ICDF(15585), AOM_ICDF(16340), - AOM_ICDF(32768) }, - { AOM_ICDF(9409), AOM_ICDF(9659), AOM_ICDF(11827), AOM_ICDF(12911), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(22337), AOM_ICDF(29459), AOM_ICDF(32382), AOM_ICDF(32519), - AOM_ICDF(32768) }, - { AOM_ICDF(16961), AOM_ICDF(25262), AOM_ICDF(31874), AOM_ICDF(32123), - AOM_ICDF(32768) }, - { AOM_ICDF(12353), AOM_ICDF(17748), AOM_ICDF(29300), AOM_ICDF(29852), - AOM_ICDF(32768) }, - { AOM_ICDF(9025), AOM_ICDF(11528), AOM_ICDF(24468), AOM_ICDF(25141), - AOM_ICDF(32768) }, - { AOM_ICDF(6209), AOM_ICDF(6565), AOM_ICDF(15806), AOM_ICDF(16121), - AOM_ICDF(32768) }, - { AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(20417), AOM_ICDF(29779), AOM_ICDF(32552), AOM_ICDF(32636), - AOM_ICDF(32768) }, - { 
AOM_ICDF(15553), AOM_ICDF(26420), AOM_ICDF(32063), AOM_ICDF(32295), - AOM_ICDF(32768) }, - { AOM_ICDF(9665), AOM_ICDF(17946), AOM_ICDF(29385), AOM_ICDF(30096), - AOM_ICDF(32768) }, - { AOM_ICDF(5569), AOM_ICDF(10207), AOM_ICDF(22410), AOM_ICDF(23836), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593), - AOM_ICDF(32768) }, - { AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(10177), AOM_ICDF(30567), AOM_ICDF(32725), AOM_ICDF(32745), - AOM_ICDF(32768) }, - { AOM_ICDF(9537), AOM_ICDF(28243), AOM_ICDF(32179), AOM_ICDF(32423), - AOM_ICDF(32768) }, - { AOM_ICDF(13377), AOM_ICDF(23187), AOM_ICDF(29322), AOM_ICDF(30382), - AOM_ICDF(32768) }, - { AOM_ICDF(13121), AOM_ICDF(21346), AOM_ICDF(29507), AOM_ICDF(30326), - AOM_ICDF(32768) }, - { AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338), - AOM_ICDF(32768) } } }, - { // Inter - { // Band 0 - { AOM_ICDF(29376), AOM_ICDF(30098), AOM_ICDF(32421), AOM_ICDF(32766), - AOM_ICDF(32767), AOM_ICDF(32768) }, - { AOM_ICDF(18368), AOM_ICDF(22916), AOM_ICDF(30116), AOM_ICDF(32541), - AOM_ICDF(32650), AOM_ICDF(32768) }, - { AOM_ICDF(5952), AOM_ICDF(16505), AOM_ICDF(25955), AOM_ICDF(32163), - AOM_ICDF(32365), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(19649), AOM_ICDF(30160), AOM_ICDF(32743), AOM_ICDF(32753), - AOM_ICDF(32768) }, - { AOM_ICDF(18881), AOM_ICDF(28724), AOM_ICDF(32688), AOM_ICDF(32717), - AOM_ICDF(32768) }, - { AOM_ICDF(16833), AOM_ICDF(23053), AOM_ICDF(31244), AOM_ICDF(31573), - AOM_ICDF(32768) }, - { AOM_ICDF(14657), AOM_ICDF(17714), AOM_ICDF(26083), AOM_ICDF(26978), - AOM_ICDF(32768) }, - { AOM_ICDF(14657), AOM_ICDF(16618), AOM_ICDF(24597), AOM_ICDF(25403), - AOM_ICDF(32768) }, - { AOM_ICDF(4289), AOM_ICDF(4326), AOM_ICDF(10686), AOM_ICDF(10751), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(21953), 
AOM_ICDF(30956), AOM_ICDF(32748), AOM_ICDF(32757), - AOM_ICDF(32768) }, - { AOM_ICDF(20929), AOM_ICDF(29412), AOM_ICDF(32700), AOM_ICDF(32725), - AOM_ICDF(32768) }, - { AOM_ICDF(13377), AOM_ICDF(21495), AOM_ICDF(31216), AOM_ICDF(31569), - AOM_ICDF(32768) }, - { AOM_ICDF(9153), AOM_ICDF(15097), AOM_ICDF(28295), AOM_ICDF(28990), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(20289), AOM_ICDF(31164), AOM_ICDF(32745), AOM_ICDF(32755), - AOM_ICDF(32768) }, - { AOM_ICDF(17601), AOM_ICDF(29635), AOM_ICDF(32739), AOM_ICDF(32751), - AOM_ICDF(32768) }, - { AOM_ICDF(18241), AOM_ICDF(24284), AOM_ICDF(32116), AOM_ICDF(32258), - AOM_ICDF(32768) }, - { AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32739), AOM_ICDF(32740), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792), - AOM_ICDF(32768) }, - { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(19009), AOM_ICDF(31481), AOM_ICDF(32742), AOM_ICDF(32754), - AOM_ICDF(32768) }, - { AOM_ICDF(15809), AOM_ICDF(30521), AOM_ICDF(32736), AOM_ICDF(32750), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(32705), AOM_ICDF(32737), AOM_ICDF(32753), - AOM_ICDF(32768) }, - { AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(11841), AOM_ICDF(32116), AOM_ICDF(32728), AOM_ICDF(32748), - AOM_ICDF(32768) }, - { AOM_ICDF(12353), AOM_ICDF(32132), AOM_ICDF(32729), AOM_ICDF(32748), - AOM_ICDF(32768) }, - { AOM_ICDF(7489), AOM_ICDF(12435), AOM_ICDF(25708), AOM_ICDF(26666), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(7486), 
AOM_ICDF(20238), AOM_ICDF(21009), - AOM_ICDF(32768) }, - { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866), - AOM_ICDF(32768) }, - { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533), - AOM_ICDF(32768) } } } } -}; -static const coeff_cdf_model default_coef_head_cdf_8x8[PLANE_TYPES] = { - { // Y plane - { // Intra - { // Band 0 - { AOM_ICDF(16064), AOM_ICDF(18127), AOM_ICDF(22153), AOM_ICDF(27289), - AOM_ICDF(28507), AOM_ICDF(32768) }, - { AOM_ICDF(6720), AOM_ICDF(10545), AOM_ICDF(13491), AOM_ICDF(20948), - AOM_ICDF(22631), AOM_ICDF(32768) }, - { AOM_ICDF(832), AOM_ICDF(5270), AOM_ICDF(5918), AOM_ICDF(12645), - AOM_ICDF(13532), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(14017), AOM_ICDF(16139), AOM_ICDF(26799), AOM_ICDF(27295), - AOM_ICDF(32768) }, - { AOM_ICDF(12737), AOM_ICDF(15136), AOM_ICDF(26235), AOM_ICDF(26816), - AOM_ICDF(32768) }, - { AOM_ICDF(10817), AOM_ICDF(12445), AOM_ICDF(23637), AOM_ICDF(24217), - AOM_ICDF(32768) }, - { AOM_ICDF(8897), AOM_ICDF(9702), AOM_ICDF(20040), AOM_ICDF(20500), - AOM_ICDF(32768) }, - { AOM_ICDF(5953), AOM_ICDF(6156), AOM_ICDF(13966), AOM_ICDF(14205), - AOM_ICDF(32768) }, - { AOM_ICDF(2497), AOM_ICDF(2519), AOM_ICDF(6222), AOM_ICDF(6300), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(19777), AOM_ICDF(21403), AOM_ICDF(30054), AOM_ICDF(30269), - AOM_ICDF(32768) }, - { AOM_ICDF(16193), AOM_ICDF(17913), AOM_ICDF(28593), AOM_ICDF(28883), - AOM_ICDF(32768) }, - { AOM_ICDF(12609), AOM_ICDF(13572), AOM_ICDF(25248), AOM_ICDF(25534), - AOM_ICDF(32768) }, - { AOM_ICDF(9665), AOM_ICDF(10118), AOM_ICDF(20721), AOM_ICDF(20968), - AOM_ICDF(32768) }, - { AOM_ICDF(6849), AOM_ICDF(7028), AOM_ICDF(15202), AOM_ICDF(15391), - AOM_ICDF(32768) }, - { AOM_ICDF(3009), AOM_ICDF(3036), AOM_ICDF(7601), AOM_ICDF(7675), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(22593), AOM_ICDF(23915), AOM_ICDF(31159), AOM_ICDF(31283), - AOM_ICDF(32768) }, - { AOM_ICDF(17345), AOM_ICDF(18690), AOM_ICDF(29425), AOM_ICDF(29611), 
- AOM_ICDF(32768) }, - { AOM_ICDF(11969), AOM_ICDF(12540), AOM_ICDF(24685), AOM_ICDF(24867), - AOM_ICDF(32768) }, - { AOM_ICDF(8129), AOM_ICDF(8355), AOM_ICDF(18668), AOM_ICDF(18819), - AOM_ICDF(32768) }, - { AOM_ICDF(4673), AOM_ICDF(4714), AOM_ICDF(11752), AOM_ICDF(11814), - AOM_ICDF(32768) }, - { AOM_ICDF(1857), AOM_ICDF(1876), AOM_ICDF(5057), AOM_ICDF(5138), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(24513), AOM_ICDF(25718), AOM_ICDF(31947), AOM_ICDF(32014), - AOM_ICDF(32768) }, - { AOM_ICDF(18881), AOM_ICDF(20029), AOM_ICDF(30409), AOM_ICDF(30527), - AOM_ICDF(32768) }, - { AOM_ICDF(12481), AOM_ICDF(12953), AOM_ICDF(25201), AOM_ICDF(25341), - AOM_ICDF(32768) }, - { AOM_ICDF(8385), AOM_ICDF(8528), AOM_ICDF(18815), AOM_ICDF(18910), - AOM_ICDF(32768) }, - { AOM_ICDF(4289), AOM_ICDF(4327), AOM_ICDF(10797), AOM_ICDF(10861), - AOM_ICDF(32768) }, - { AOM_ICDF(1857), AOM_ICDF(1872), AOM_ICDF(4332), AOM_ICDF(4415), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(26049), AOM_ICDF(27752), AOM_ICDF(32415), AOM_ICDF(32462), - AOM_ICDF(32768) }, - { AOM_ICDF(20417), AOM_ICDF(22100), AOM_ICDF(31056), AOM_ICDF(31192), - AOM_ICDF(32768) }, - { AOM_ICDF(12481), AOM_ICDF(13075), AOM_ICDF(24646), AOM_ICDF(24844), - AOM_ICDF(32768) }, - { AOM_ICDF(7489), AOM_ICDF(7696), AOM_ICDF(17117), AOM_ICDF(17285), - AOM_ICDF(32768) }, - { AOM_ICDF(3777), AOM_ICDF(3814), AOM_ICDF(10062), AOM_ICDF(10129), - AOM_ICDF(32768) }, - { AOM_ICDF(1473), AOM_ICDF(1486), AOM_ICDF(3735), AOM_ICDF(3820), - AOM_ICDF(32768) } } }, - { // Intra - { // Band 0 - { AOM_ICDF(25920), AOM_ICDF(27743), AOM_ICDF(29455), AOM_ICDF(32147), - AOM_ICDF(32280), AOM_ICDF(32768) }, - { AOM_ICDF(13888), AOM_ICDF(19845), AOM_ICDF(23350), AOM_ICDF(30219), - AOM_ICDF(30660), AOM_ICDF(32768) }, - { AOM_ICDF(2368), AOM_ICDF(12781), AOM_ICDF(16196), AOM_ICDF(27232), - AOM_ICDF(27894), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(21697), AOM_ICDF(24758), AOM_ICDF(32358), AOM_ICDF(32417), - AOM_ICDF(32768) }, - { 
AOM_ICDF(20289), AOM_ICDF(23960), AOM_ICDF(32111), AOM_ICDF(32213), - AOM_ICDF(32768) }, - { AOM_ICDF(17345), AOM_ICDF(19966), AOM_ICDF(30630), AOM_ICDF(30841), - AOM_ICDF(32768) }, - { AOM_ICDF(14529), AOM_ICDF(16070), AOM_ICDF(27461), AOM_ICDF(27777), - AOM_ICDF(32768) }, - { AOM_ICDF(9793), AOM_ICDF(10613), AOM_ICDF(21146), AOM_ICDF(21566), - AOM_ICDF(32768) }, - { AOM_ICDF(6977), AOM_ICDF(7162), AOM_ICDF(15591), AOM_ICDF(15776), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(23617), AOM_ICDF(26783), AOM_ICDF(32572), AOM_ICDF(32607), - AOM_ICDF(32768) }, - { AOM_ICDF(20801), AOM_ICDF(24292), AOM_ICDF(32185), AOM_ICDF(32275), - AOM_ICDF(32768) }, - { AOM_ICDF(15169), AOM_ICDF(17905), AOM_ICDF(29916), AOM_ICDF(30181), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(12972), AOM_ICDF(25565), AOM_ICDF(26064), - AOM_ICDF(32768) }, - { AOM_ICDF(6849), AOM_ICDF(8334), AOM_ICDF(18543), AOM_ICDF(19446), - AOM_ICDF(32768) }, - { AOM_ICDF(3649), AOM_ICDF(4346), AOM_ICDF(12351), AOM_ICDF(13169), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(25281), AOM_ICDF(28440), AOM_ICDF(32667), AOM_ICDF(32689), - AOM_ICDF(32768) }, - { AOM_ICDF(22081), AOM_ICDF(25694), AOM_ICDF(32414), AOM_ICDF(32476), - AOM_ICDF(32768) }, - { AOM_ICDF(15297), AOM_ICDF(18341), AOM_ICDF(30141), AOM_ICDF(30410), - AOM_ICDF(32768) }, - { AOM_ICDF(10305), AOM_ICDF(12381), AOM_ICDF(24477), AOM_ICDF(25084), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(6673), AOM_ICDF(16325), AOM_ICDF(17080), - AOM_ICDF(32768) }, - { AOM_ICDF(2369), AOM_ICDF(2393), AOM_ICDF(6466), AOM_ICDF(6543), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(25921), AOM_ICDF(29445), AOM_ICDF(32729), AOM_ICDF(32739), - AOM_ICDF(32768) }, - { AOM_ICDF(22465), AOM_ICDF(26834), AOM_ICDF(32588), AOM_ICDF(32627), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(20062), AOM_ICDF(31016), AOM_ICDF(31233), - AOM_ICDF(32768) }, - { AOM_ICDF(11073), AOM_ICDF(13165), AOM_ICDF(25353), AOM_ICDF(25896), - AOM_ICDF(32768) }, - { 
AOM_ICDF(11713), AOM_ICDF(13837), AOM_ICDF(20144), AOM_ICDF(21734), - AOM_ICDF(32768) }, - { AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(26177), AOM_ICDF(29403), AOM_ICDF(32705), AOM_ICDF(32721), - AOM_ICDF(32768) }, - { AOM_ICDF(22337), AOM_ICDF(26344), AOM_ICDF(32545), AOM_ICDF(32589), - AOM_ICDF(32768) }, - { AOM_ICDF(19009), AOM_ICDF(21527), AOM_ICDF(31775), AOM_ICDF(31873), - AOM_ICDF(32768) }, - { AOM_ICDF(11585), AOM_ICDF(12685), AOM_ICDF(22632), AOM_ICDF(23137), - AOM_ICDF(32768) }, - { AOM_ICDF(8257), AOM_ICDF(8305), AOM_ICDF(16444), AOM_ICDF(16492), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377), - AOM_ICDF(32768) } } } }, - { // UV plane - { // Inter - { // Band 0 - { AOM_ICDF(27200), AOM_ICDF(27981), AOM_ICDF(31389), AOM_ICDF(32444), - AOM_ICDF(32592), AOM_ICDF(32768) }, - { AOM_ICDF(14528), AOM_ICDF(19068), AOM_ICDF(24887), AOM_ICDF(29901), - AOM_ICDF(30688), AOM_ICDF(32768) }, - { AOM_ICDF(3776), AOM_ICDF(11778), AOM_ICDF(14700), AOM_ICDF(23745), - AOM_ICDF(24854), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(20289), AOM_ICDF(25202), AOM_ICDF(31672), AOM_ICDF(31909), - AOM_ICDF(32768) }, - { AOM_ICDF(18369), AOM_ICDF(23493), AOM_ICDF(31166), AOM_ICDF(31487), - AOM_ICDF(32768) }, - { AOM_ICDF(15425), AOM_ICDF(18619), AOM_ICDF(28941), AOM_ICDF(29393), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(12535), AOM_ICDF(24287), AOM_ICDF(24792), - AOM_ICDF(32768) }, - { AOM_ICDF(6465), AOM_ICDF(6810), AOM_ICDF(15764), AOM_ICDF(16080), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2137), AOM_ICDF(6125), AOM_ICDF(6203), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(23745), AOM_ICDF(27041), AOM_ICDF(31976), AOM_ICDF(32135), - AOM_ICDF(32768) }, - { AOM_ICDF(19521), AOM_ICDF(22766), AOM_ICDF(31139), AOM_ICDF(31367), - AOM_ICDF(32768) }, - { AOM_ICDF(14273), AOM_ICDF(15834), AOM_ICDF(27820), AOM_ICDF(28105), - AOM_ICDF(32768) }, - { 
AOM_ICDF(9537), AOM_ICDF(10445), AOM_ICDF(22106), AOM_ICDF(22491), - AOM_ICDF(32768) }, - { AOM_ICDF(7233), AOM_ICDF(7386), AOM_ICDF(15961), AOM_ICDF(16109), - AOM_ICDF(32768) }, - { AOM_ICDF(2369), AOM_ICDF(2401), AOM_ICDF(7891), AOM_ICDF(7964), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(26305), AOM_ICDF(28703), AOM_ICDF(32352), AOM_ICDF(32435), - AOM_ICDF(32768) }, - { AOM_ICDF(20673), AOM_ICDF(23490), AOM_ICDF(31517), AOM_ICDF(31680), - AOM_ICDF(32768) }, - { AOM_ICDF(14017), AOM_ICDF(15251), AOM_ICDF(27458), AOM_ICDF(27702), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(11374), AOM_ICDF(22496), AOM_ICDF(22687), - AOM_ICDF(32768) }, - { AOM_ICDF(9153), AOM_ICDF(9435), AOM_ICDF(22299), AOM_ICDF(22411), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(269), AOM_ICDF(13236), AOM_ICDF(13293), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(27713), AOM_ICDF(29770), AOM_ICDF(32522), AOM_ICDF(32575), - AOM_ICDF(32768) }, - { AOM_ICDF(21569), AOM_ICDF(24342), AOM_ICDF(31785), AOM_ICDF(31919), - AOM_ICDF(32768) }, - { AOM_ICDF(15297), AOM_ICDF(16497), AOM_ICDF(28367), AOM_ICDF(28569), - AOM_ICDF(32768) }, - { AOM_ICDF(17601), AOM_ICDF(17828), AOM_ICDF(24444), AOM_ICDF(24582), - AOM_ICDF(32768) }, - { AOM_ICDF(6977), AOM_ICDF(7035), AOM_ICDF(16901), AOM_ICDF(16947), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(28737), AOM_ICDF(30879), AOM_ICDF(32667), AOM_ICDF(32695), - AOM_ICDF(32768) }, - { AOM_ICDF(22593), AOM_ICDF(26241), AOM_ICDF(32073), AOM_ICDF(32207), - AOM_ICDF(32768) }, - { AOM_ICDF(16577), AOM_ICDF(19148), AOM_ICDF(28436), AOM_ICDF(28906), - AOM_ICDF(32768) }, - { AOM_ICDF(12993), AOM_ICDF(14005), AOM_ICDF(23151), AOM_ICDF(23630), - AOM_ICDF(32768) }, - { AOM_ICDF(7617), AOM_ICDF(9188), AOM_ICDF(22797), AOM_ICDF(23313), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338), - AOM_ICDF(32768) } } }, - { // Inter - { // 
Band 0 - { AOM_ICDF(28864), AOM_ICDF(29988), AOM_ICDF(32423), AOM_ICDF(32766), - AOM_ICDF(32767), AOM_ICDF(32768) }, - { AOM_ICDF(18496), AOM_ICDF(24572), AOM_ICDF(30167), AOM_ICDF(32687), - AOM_ICDF(32737), AOM_ICDF(32768) }, - { AOM_ICDF(5440), AOM_ICDF(19618), AOM_ICDF(25332), AOM_ICDF(32393), - AOM_ICDF(32491), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(23745), AOM_ICDF(29427), AOM_ICDF(32751), AOM_ICDF(32757), - AOM_ICDF(32768) }, - { AOM_ICDF(23745), AOM_ICDF(28704), AOM_ICDF(32716), AOM_ICDF(32731), - AOM_ICDF(32768) }, - { AOM_ICDF(23105), AOM_ICDF(27943), AOM_ICDF(32524), AOM_ICDF(32587), - AOM_ICDF(32768) }, - { AOM_ICDF(21057), AOM_ICDF(24773), AOM_ICDF(29589), AOM_ICDF(30282), - AOM_ICDF(32768) }, - { AOM_ICDF(12609), AOM_ICDF(14823), AOM_ICDF(23831), AOM_ICDF(24713), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(25025), AOM_ICDF(30203), AOM_ICDF(32754), AOM_ICDF(32759), - AOM_ICDF(32768) }, - { AOM_ICDF(23617), AOM_ICDF(28361), AOM_ICDF(32715), AOM_ICDF(32729), - AOM_ICDF(32768) }, - { AOM_ICDF(17985), AOM_ICDF(21562), AOM_ICDF(31354), AOM_ICDF(31543), - AOM_ICDF(32768) }, - { AOM_ICDF(12353), AOM_ICDF(18915), AOM_ICDF(28742), AOM_ICDF(29548), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(289), AOM_ICDF(16545), AOM_ICDF(16593), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(26433), AOM_ICDF(30892), AOM_ICDF(32757), AOM_ICDF(32761), - AOM_ICDF(32768) }, - { AOM_ICDF(24513), AOM_ICDF(29274), AOM_ICDF(32721), AOM_ICDF(32735), - AOM_ICDF(32768) }, - { AOM_ICDF(20161), AOM_ICDF(24040), AOM_ICDF(32055), AOM_ICDF(32171), - AOM_ICDF(32768) }, - { AOM_ICDF(21953), AOM_ICDF(24678), AOM_ICDF(27382), AOM_ICDF(28734), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792), - AOM_ICDF(32768) }, - { AOM_ICDF(2881), AOM_ICDF(2913), 
AOM_ICDF(8427), AOM_ICDF(8498), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(27457), AOM_ICDF(31485), AOM_ICDF(32759), AOM_ICDF(32763), - AOM_ICDF(32768) }, - { AOM_ICDF(24129), AOM_ICDF(29502), AOM_ICDF(32752), AOM_ICDF(32757), - AOM_ICDF(32768) }, - { AOM_ICDF(19009), AOM_ICDF(25452), AOM_ICDF(32473), AOM_ICDF(32544), - AOM_ICDF(32768) }, - { AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32737), AOM_ICDF(32738), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(27841), AOM_ICDF(32288), AOM_ICDF(32759), AOM_ICDF(32764), - AOM_ICDF(32768) }, - { AOM_ICDF(19137), AOM_ICDF(30271), AOM_ICDF(32742), AOM_ICDF(32753), - AOM_ICDF(32768) }, - { AOM_ICDF(18625), AOM_ICDF(27739), AOM_ICDF(29979), AOM_ICDF(31099), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009), - AOM_ICDF(32768) }, - { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866), - AOM_ICDF(32768) }, - { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533), - AOM_ICDF(32768) } } } } -}; -static const coeff_cdf_model default_coef_head_cdf_16x16[PLANE_TYPES] = { - { // Y plane - { // Intra - { // Band 0 - { AOM_ICDF(960), AOM_ICDF(4882), AOM_ICDF(9467), AOM_ICDF(17710), - AOM_ICDF(20412), AOM_ICDF(32768) }, - { AOM_ICDF(704), AOM_ICDF(4657), AOM_ICDF(6561), AOM_ICDF(14507), - AOM_ICDF(16279), AOM_ICDF(32768) }, - { AOM_ICDF(192), AOM_ICDF(3443), AOM_ICDF(3759), AOM_ICDF(9011), - AOM_ICDF(9685), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(12481), AOM_ICDF(13958), AOM_ICDF(24487), AOM_ICDF(24997), - AOM_ICDF(32768) }, - { AOM_ICDF(11457), AOM_ICDF(13075), AOM_ICDF(23820), AOM_ICDF(24406), - AOM_ICDF(32768) }, - { AOM_ICDF(9793), AOM_ICDF(11127), AOM_ICDF(21775), AOM_ICDF(22387), - AOM_ICDF(32768) }, - { AOM_ICDF(7745), AOM_ICDF(8457), AOM_ICDF(18155), AOM_ICDF(18655), - 
AOM_ICDF(32768) }, - { AOM_ICDF(5441), AOM_ICDF(5668), AOM_ICDF(13180), AOM_ICDF(13467), - AOM_ICDF(32768) }, - { AOM_ICDF(2497), AOM_ICDF(2520), AOM_ICDF(6340), AOM_ICDF(6417), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(19521), AOM_ICDF(20572), AOM_ICDF(28965), AOM_ICDF(29177), - AOM_ICDF(32768) }, - { AOM_ICDF(15425), AOM_ICDF(16741), AOM_ICDF(27247), AOM_ICDF(27554), - AOM_ICDF(32768) }, - { AOM_ICDF(11969), AOM_ICDF(12690), AOM_ICDF(23872), AOM_ICDF(24141), - AOM_ICDF(32768) }, - { AOM_ICDF(9281), AOM_ICDF(9678), AOM_ICDF(19970), AOM_ICDF(20207), - AOM_ICDF(32768) }, - { AOM_ICDF(6081), AOM_ICDF(6266), AOM_ICDF(14682), AOM_ICDF(14876), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(2779), AOM_ICDF(7150), AOM_ICDF(7225), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(22337), AOM_ICDF(23293), AOM_ICDF(30630), AOM_ICDF(30753), - AOM_ICDF(32768) }, - { AOM_ICDF(16321), AOM_ICDF(17427), AOM_ICDF(28368), AOM_ICDF(28570), - AOM_ICDF(32768) }, - { AOM_ICDF(11457), AOM_ICDF(11907), AOM_ICDF(23570), AOM_ICDF(23741), - AOM_ICDF(32768) }, - { AOM_ICDF(7233), AOM_ICDF(7331), AOM_ICDF(17258), AOM_ICDF(17334), - AOM_ICDF(32768) }, - { AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441), - AOM_ICDF(32768) }, - { AOM_ICDF(1601), AOM_ICDF(1619), AOM_ICDF(4706), AOM_ICDF(4788), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(24769), AOM_ICDF(25536), AOM_ICDF(31660), AOM_ICDF(31722), - AOM_ICDF(32768) }, - { AOM_ICDF(18113), AOM_ICDF(18886), AOM_ICDF(29420), AOM_ICDF(29534), - AOM_ICDF(32768) }, - { AOM_ICDF(11201), AOM_ICDF(11412), AOM_ICDF(23207), AOM_ICDF(23291), - AOM_ICDF(32768) }, - { AOM_ICDF(6977), AOM_ICDF(7033), AOM_ICDF(16599), AOM_ICDF(16646), - AOM_ICDF(32768) }, - { AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441), - AOM_ICDF(32768) }, - { AOM_ICDF(1601), AOM_ICDF(1620), AOM_ICDF(4827), AOM_ICDF(4909), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(28353), AOM_ICDF(28831), AOM_ICDF(32502), AOM_ICDF(32517), - 
AOM_ICDF(32768) }, - { AOM_ICDF(21441), AOM_ICDF(21869), AOM_ICDF(30977), AOM_ICDF(31017), - AOM_ICDF(32768) }, - { AOM_ICDF(11969), AOM_ICDF(12088), AOM_ICDF(24116), AOM_ICDF(24158), - AOM_ICDF(32768) }, - { AOM_ICDF(7489), AOM_ICDF(7547), AOM_ICDF(17413), AOM_ICDF(17458), - AOM_ICDF(32768) }, - { AOM_ICDF(4545), AOM_ICDF(4585), AOM_ICDF(11325), AOM_ICDF(11388), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2133), AOM_ICDF(5526), AOM_ICDF(5606), - AOM_ICDF(32768) } } }, - { // Intra - { // Band 0 - { AOM_ICDF(2496), AOM_ICDF(8717), AOM_ICDF(17280), AOM_ICDF(28922), - AOM_ICDF(29751), AOM_ICDF(32768) }, - { AOM_ICDF(2496), AOM_ICDF(9665), AOM_ICDF(15235), AOM_ICDF(26542), - AOM_ICDF(27580), AOM_ICDF(32768) }, - { AOM_ICDF(448), AOM_ICDF(9240), AOM_ICDF(11886), AOM_ICDF(24124), - AOM_ICDF(24898), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(21057), AOM_ICDF(22896), AOM_ICDF(31877), AOM_ICDF(31953), - AOM_ICDF(32768) }, - { AOM_ICDF(20673), AOM_ICDF(23151), AOM_ICDF(31706), AOM_ICDF(31825), - AOM_ICDF(32768) }, - { AOM_ICDF(18753), AOM_ICDF(20519), AOM_ICDF(30497), AOM_ICDF(30668), - AOM_ICDF(32768) }, - { AOM_ICDF(15425), AOM_ICDF(16608), AOM_ICDF(27789), AOM_ICDF(28027), - AOM_ICDF(32768) }, - { AOM_ICDF(10305), AOM_ICDF(10977), AOM_ICDF(21405), AOM_ICDF(21749), - AOM_ICDF(32768) }, - { AOM_ICDF(3649), AOM_ICDF(3812), AOM_ICDF(11213), AOM_ICDF(11445), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(24001), AOM_ICDF(25899), AOM_ICDF(32307), AOM_ICDF(32360), - AOM_ICDF(32768) }, - { AOM_ICDF(20929), AOM_ICDF(22941), AOM_ICDF(31775), AOM_ICDF(31867), - AOM_ICDF(32768) }, - { AOM_ICDF(15169), AOM_ICDF(16734), AOM_ICDF(29228), AOM_ICDF(29425), - AOM_ICDF(32768) }, - { AOM_ICDF(10561), AOM_ICDF(12047), AOM_ICDF(24918), AOM_ICDF(25324), - AOM_ICDF(32768) }, - { AOM_ICDF(6977), AOM_ICDF(7929), AOM_ICDF(18311), AOM_ICDF(18918), - AOM_ICDF(32768) }, - { AOM_ICDF(3649), AOM_ICDF(3760), AOM_ICDF(9962), AOM_ICDF(10162), - AOM_ICDF(32768) } }, - { // Band 3 - { 
AOM_ICDF(25793), AOM_ICDF(27526), AOM_ICDF(32565), AOM_ICDF(32591), - AOM_ICDF(32768) }, - { AOM_ICDF(21825), AOM_ICDF(23885), AOM_ICDF(32064), AOM_ICDF(32135), - AOM_ICDF(32768) }, - { AOM_ICDF(15041), AOM_ICDF(16286), AOM_ICDF(29203), AOM_ICDF(29360), - AOM_ICDF(32768) }, - { AOM_ICDF(10433), AOM_ICDF(11058), AOM_ICDF(24349), AOM_ICDF(24538), - AOM_ICDF(32768) }, - { AOM_ICDF(5569), AOM_ICDF(6016), AOM_ICDF(16460), AOM_ICDF(16794), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(26433), AOM_ICDF(28398), AOM_ICDF(32682), AOM_ICDF(32696), - AOM_ICDF(32768) }, - { AOM_ICDF(22977), AOM_ICDF(25086), AOM_ICDF(32367), AOM_ICDF(32412), - AOM_ICDF(32768) }, - { AOM_ICDF(16577), AOM_ICDF(17928), AOM_ICDF(30144), AOM_ICDF(30275), - AOM_ICDF(32768) }, - { AOM_ICDF(12481), AOM_ICDF(13352), AOM_ICDF(25993), AOM_ICDF(26211), - AOM_ICDF(32768) }, - { AOM_ICDF(7745), AOM_ICDF(8069), AOM_ICDF(20501), AOM_ICDF(20657), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(27841), AOM_ICDF(29700), AOM_ICDF(32721), AOM_ICDF(32730), - AOM_ICDF(32768) }, - { AOM_ICDF(23873), AOM_ICDF(26202), AOM_ICDF(32578), AOM_ICDF(32604), - AOM_ICDF(32768) }, - { AOM_ICDF(17729), AOM_ICDF(19046), AOM_ICDF(30448), AOM_ICDF(30568), - AOM_ICDF(32768) }, - { AOM_ICDF(13505), AOM_ICDF(14508), AOM_ICDF(26034), AOM_ICDF(26304), - AOM_ICDF(32768) }, - { AOM_ICDF(10049), AOM_ICDF(10494), AOM_ICDF(19945), AOM_ICDF(20233), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377), - AOM_ICDF(32768) } } } }, - { // UV plane - { // Inter - { // Band 0 - { AOM_ICDF(27072), AOM_ICDF(27916), AOM_ICDF(31095), AOM_ICDF(32400), - AOM_ICDF(32553), AOM_ICDF(32768) }, - { AOM_ICDF(12352), AOM_ICDF(16792), AOM_ICDF(22516), AOM_ICDF(28853), - AOM_ICDF(29797), AOM_ICDF(32768) }, - { AOM_ICDF(2880), AOM_ICDF(9023), 
AOM_ICDF(11126), AOM_ICDF(20602), - AOM_ICDF(21713), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(20161), AOM_ICDF(24785), AOM_ICDF(31070), AOM_ICDF(31430), - AOM_ICDF(32768) }, - { AOM_ICDF(17985), AOM_ICDF(22773), AOM_ICDF(30430), AOM_ICDF(30880), - AOM_ICDF(32768) }, - { AOM_ICDF(15937), AOM_ICDF(18802), AOM_ICDF(28265), AOM_ICDF(28788), - AOM_ICDF(32768) }, - { AOM_ICDF(11841), AOM_ICDF(13587), AOM_ICDF(24798), AOM_ICDF(25335), - AOM_ICDF(32768) }, - { AOM_ICDF(8769), AOM_ICDF(9160), AOM_ICDF(19316), AOM_ICDF(19566), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5357), AOM_ICDF(12874), AOM_ICDF(12932), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(24129), AOM_ICDF(26501), AOM_ICDF(31672), AOM_ICDF(31844), - AOM_ICDF(32768) }, - { AOM_ICDF(19649), AOM_ICDF(21553), AOM_ICDF(30130), AOM_ICDF(30370), - AOM_ICDF(32768) }, - { AOM_ICDF(11713), AOM_ICDF(13134), AOM_ICDF(25983), AOM_ICDF(26321), - AOM_ICDF(32768) }, - { AOM_ICDF(9409), AOM_ICDF(9948), AOM_ICDF(21408), AOM_ICDF(21663), - AOM_ICDF(32768) }, - { AOM_ICDF(5569), AOM_ICDF(5757), AOM_ICDF(14335), AOM_ICDF(14533), - AOM_ICDF(32768) }, - { AOM_ICDF(2241), AOM_ICDF(2305), AOM_ICDF(13152), AOM_ICDF(13209), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(26817), AOM_ICDF(28135), AOM_ICDF(32130), AOM_ICDF(32209), - AOM_ICDF(32768) }, - { AOM_ICDF(20161), AOM_ICDF(21412), AOM_ICDF(30331), AOM_ICDF(30481), - AOM_ICDF(32768) }, - { AOM_ICDF(13377), AOM_ICDF(13798), AOM_ICDF(26065), AOM_ICDF(26176), - AOM_ICDF(32768) }, - { AOM_ICDF(8129), AOM_ICDF(8290), AOM_ICDF(19920), AOM_ICDF(20008), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(5751), AOM_ICDF(14950), AOM_ICDF(15002), - AOM_ICDF(32768) }, - { AOM_ICDF(5569), AOM_ICDF(5601), AOM_ICDF(11041), AOM_ICDF(11105), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(28225), AOM_ICDF(29079), AOM_ICDF(32387), AOM_ICDF(32426), - AOM_ICDF(32768) }, - { AOM_ICDF(21185), AOM_ICDF(22046), AOM_ICDF(30982), AOM_ICDF(31061), - AOM_ICDF(32768) }, - { 
AOM_ICDF(13377), AOM_ICDF(13595), AOM_ICDF(25762), AOM_ICDF(25824), - AOM_ICDF(32768) }, - { AOM_ICDF(8001), AOM_ICDF(8123), AOM_ICDF(20530), AOM_ICDF(20590), - AOM_ICDF(32768) }, - { AOM_ICDF(4289), AOM_ICDF(4322), AOM_ICDF(9907), AOM_ICDF(9974), - AOM_ICDF(32768) }, - { AOM_ICDF(3393), AOM_ICDF(3412), AOM_ICDF(6663), AOM_ICDF(6739), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(30529), AOM_ICDF(31014), AOM_ICDF(32651), AOM_ICDF(32664), - AOM_ICDF(32768) }, - { AOM_ICDF(23489), AOM_ICDF(24268), AOM_ICDF(31627), AOM_ICDF(31682), - AOM_ICDF(32768) }, - { AOM_ICDF(14017), AOM_ICDF(14239), AOM_ICDF(26653), AOM_ICDF(26707), - AOM_ICDF(32768) }, - { AOM_ICDF(11201), AOM_ICDF(11317), AOM_ICDF(23122), AOM_ICDF(23169), - AOM_ICDF(32768) }, - { AOM_ICDF(6721), AOM_ICDF(6768), AOM_ICDF(14810), AOM_ICDF(14863), - AOM_ICDF(32768) }, - { AOM_ICDF(6593), AOM_ICDF(6632), AOM_ICDF(13188), AOM_ICDF(13245), - AOM_ICDF(32768) } } }, - { // Inter - { // Band 0 - { AOM_ICDF(29888), AOM_ICDF(30492), AOM_ICDF(32500), AOM_ICDF(32766), - AOM_ICDF(32767), AOM_ICDF(32768) }, - { AOM_ICDF(18752), AOM_ICDF(23235), AOM_ICDF(29846), AOM_ICDF(32214), - AOM_ICDF(32442), AOM_ICDF(32768) }, - { AOM_ICDF(5568), AOM_ICDF(17762), AOM_ICDF(25039), AOM_ICDF(31213), - AOM_ICDF(31651), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(26433), AOM_ICDF(29681), AOM_ICDF(32757), AOM_ICDF(32760), - AOM_ICDF(32768) }, - { AOM_ICDF(24769), AOM_ICDF(28761), AOM_ICDF(32722), AOM_ICDF(32734), - AOM_ICDF(32768) }, - { AOM_ICDF(22209), AOM_ICDF(26975), AOM_ICDF(32418), AOM_ICDF(32500), - AOM_ICDF(32768) }, - { AOM_ICDF(16321), AOM_ICDF(21333), AOM_ICDF(28368), AOM_ICDF(29283), - AOM_ICDF(32768) }, - { AOM_ICDF(12865), AOM_ICDF(14775), AOM_ICDF(22545), AOM_ICDF(23553), - AOM_ICDF(32768) }, - { AOM_ICDF(12353), AOM_ICDF(12354), AOM_ICDF(12473), AOM_ICDF(12532), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(27457), AOM_ICDF(30005), AOM_ICDF(32738), AOM_ICDF(32745), - AOM_ICDF(32768) }, - { AOM_ICDF(24897), 
AOM_ICDF(27541), AOM_ICDF(32723), AOM_ICDF(32731), - AOM_ICDF(32768) }, - { AOM_ICDF(15297), AOM_ICDF(19106), AOM_ICDF(30414), AOM_ICDF(30711), - AOM_ICDF(32768) }, - { AOM_ICDF(6593), AOM_ICDF(8826), AOM_ICDF(19732), AOM_ICDF(20840), - AOM_ICDF(32768) }, - { AOM_ICDF(4161), AOM_ICDF(4233), AOM_ICDF(16509), AOM_ICDF(16557), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(28609), AOM_ICDF(30482), AOM_ICDF(32761), AOM_ICDF(32763), - AOM_ICDF(32768) }, - { AOM_ICDF(25665), AOM_ICDF(27830), AOM_ICDF(32727), AOM_ICDF(32733), - AOM_ICDF(32768) }, - { AOM_ICDF(21057), AOM_ICDF(23803), AOM_ICDF(30367), AOM_ICDF(30721), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(21878), AOM_ICDF(32726), AOM_ICDF(32737), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792), - AOM_ICDF(32768) }, - { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(28993), AOM_ICDF(30944), AOM_ICDF(32762), AOM_ICDF(32764), - AOM_ICDF(32768) }, - { AOM_ICDF(26561), AOM_ICDF(28695), AOM_ICDF(32733), AOM_ICDF(32739), - AOM_ICDF(32768) }, - { AOM_ICDF(17985), AOM_ICDF(19028), AOM_ICDF(31008), AOM_ICDF(31079), - AOM_ICDF(32768) }, - { AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(30273), AOM_ICDF(32029), AOM_ICDF(32764), AOM_ICDF(32766), - AOM_ICDF(32768) }, - { AOM_ICDF(28609), AOM_ICDF(30847), AOM_ICDF(32745), AOM_ICDF(32751), - AOM_ICDF(32768) }, - { AOM_ICDF(21313), AOM_ICDF(24377), AOM_ICDF(31986), AOM_ICDF(32098), - AOM_ICDF(32768) }, - { AOM_ICDF(32705), AOM_ICDF(32709), AOM_ICDF(32739), AOM_ICDF(32741), - AOM_ICDF(32768) }, - { AOM_ICDF(4929), AOM_ICDF(5579), 
AOM_ICDF(16402), AOM_ICDF(16866), - AOM_ICDF(32768) }, - { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533), - AOM_ICDF(32768) } } } } -}; -static const coeff_cdf_model default_coef_head_cdf_32x32[PLANE_TYPES] = { - { // Y plane - { // Intra - { // Band 0 - { AOM_ICDF(2240), AOM_ICDF(5407), AOM_ICDF(18304), AOM_ICDF(25601), - AOM_ICDF(27911), AOM_ICDF(32768) }, - { AOM_ICDF(960), AOM_ICDF(4633), AOM_ICDF(8197), AOM_ICDF(16254), - AOM_ICDF(18796), AOM_ICDF(32768) }, - { AOM_ICDF(192), AOM_ICDF(3061), AOM_ICDF(3557), AOM_ICDF(8701), - AOM_ICDF(9762), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(11969), AOM_ICDF(15846), AOM_ICDF(25660), AOM_ICDF(26667), - AOM_ICDF(32768) }, - { AOM_ICDF(11713), AOM_ICDF(15794), AOM_ICDF(25737), AOM_ICDF(26760), - AOM_ICDF(32768) }, - { AOM_ICDF(9281), AOM_ICDF(12675), AOM_ICDF(23181), AOM_ICDF(24351), - AOM_ICDF(32768) }, - { AOM_ICDF(7105), AOM_ICDF(8757), AOM_ICDF(18383), AOM_ICDF(19437), - AOM_ICDF(32768) }, - { AOM_ICDF(4289), AOM_ICDF(4579), AOM_ICDF(11353), AOM_ICDF(11792), - AOM_ICDF(32768) }, - { AOM_ICDF(1857), AOM_ICDF(1874), AOM_ICDF(4695), AOM_ICDF(4777), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(20929), AOM_ICDF(22297), AOM_ICDF(29370), AOM_ICDF(29646), - AOM_ICDF(32768) }, - { AOM_ICDF(17473), AOM_ICDF(18985), AOM_ICDF(28079), AOM_ICDF(28413), - AOM_ICDF(32768) }, - { AOM_ICDF(13121), AOM_ICDF(14064), AOM_ICDF(24902), AOM_ICDF(25217), - AOM_ICDF(32768) }, - { AOM_ICDF(9793), AOM_ICDF(10214), AOM_ICDF(20069), AOM_ICDF(20329), - AOM_ICDF(32768) }, - { AOM_ICDF(5825), AOM_ICDF(5987), AOM_ICDF(13350), AOM_ICDF(13559), - AOM_ICDF(32768) }, - { AOM_ICDF(2241), AOM_ICDF(2260), AOM_ICDF(5520), AOM_ICDF(5600), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(25921), AOM_ICDF(26891), AOM_ICDF(31632), AOM_ICDF(31729), - AOM_ICDF(32768) }, - { AOM_ICDF(18241), AOM_ICDF(19463), AOM_ICDF(29222), AOM_ICDF(29419), - AOM_ICDF(32768) }, - { AOM_ICDF(11585), AOM_ICDF(12065), AOM_ICDF(23294), AOM_ICDF(23488), - 
AOM_ICDF(32768) }, - { AOM_ICDF(6593), AOM_ICDF(6686), AOM_ICDF(16153), AOM_ICDF(16234), - AOM_ICDF(32768) }, - { AOM_ICDF(3137), AOM_ICDF(3170), AOM_ICDF(8751), AOM_ICDF(8821), - AOM_ICDF(32768) }, - { AOM_ICDF(1345), AOM_ICDF(1359), AOM_ICDF(3739), AOM_ICDF(3824), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(27713), AOM_ICDF(28504), AOM_ICDF(32068), AOM_ICDF(32132), - AOM_ICDF(32768) }, - { AOM_ICDF(19265), AOM_ICDF(20354), AOM_ICDF(29789), AOM_ICDF(29943), - AOM_ICDF(32768) }, - { AOM_ICDF(11201), AOM_ICDF(11538), AOM_ICDF(22701), AOM_ICDF(22848), - AOM_ICDF(32768) }, - { AOM_ICDF(6337), AOM_ICDF(6424), AOM_ICDF(15268), AOM_ICDF(15353), - AOM_ICDF(32768) }, - { AOM_ICDF(3649), AOM_ICDF(3681), AOM_ICDF(9052), AOM_ICDF(9121), - AOM_ICDF(32768) }, - { AOM_ICDF(1601), AOM_ICDF(1618), AOM_ICDF(4584), AOM_ICDF(4667), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(30913), AOM_ICDF(31044), AOM_ICDF(32635), AOM_ICDF(32640), - AOM_ICDF(32768) }, - { AOM_ICDF(22081), AOM_ICDF(22261), AOM_ICDF(30452), AOM_ICDF(30477), - AOM_ICDF(32768) }, - { AOM_ICDF(10561), AOM_ICDF(10625), AOM_ICDF(21535), AOM_ICDF(21568), - AOM_ICDF(32768) }, - { AOM_ICDF(6081), AOM_ICDF(6130), AOM_ICDF(14369), AOM_ICDF(14423), - AOM_ICDF(32768) }, - { AOM_ICDF(3777), AOM_ICDF(3809), AOM_ICDF(9156), AOM_ICDF(9225), - AOM_ICDF(32768) }, - { AOM_ICDF(1857), AOM_ICDF(1875), AOM_ICDF(4936), AOM_ICDF(5018), - AOM_ICDF(32768) } } }, - { // Intra - { // Band 0 - { AOM_ICDF(4672), AOM_ICDF(6927), AOM_ICDF(23534), AOM_ICDF(29846), - AOM_ICDF(30928), AOM_ICDF(32768) }, - { AOM_ICDF(3776), AOM_ICDF(6784), AOM_ICDF(18075), AOM_ICDF(25863), - AOM_ICDF(27926), AOM_ICDF(32768) }, - { AOM_ICDF(1344), AOM_ICDF(5588), AOM_ICDF(12166), AOM_ICDF(20966), - AOM_ICDF(23504), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(19393), AOM_ICDF(22016), AOM_ICDF(31280), AOM_ICDF(31444), - AOM_ICDF(32768) }, - { AOM_ICDF(21185), AOM_ICDF(24329), AOM_ICDF(31706), AOM_ICDF(31865), - AOM_ICDF(32768) }, - { AOM_ICDF(20673), 
AOM_ICDF(23240), AOM_ICDF(31186), AOM_ICDF(31379), - AOM_ICDF(32768) }, - { AOM_ICDF(17857), AOM_ICDF(20035), AOM_ICDF(29594), AOM_ICDF(29889), - AOM_ICDF(32768) }, - { AOM_ICDF(13633), AOM_ICDF(14929), AOM_ICDF(24883), AOM_ICDF(25337), - AOM_ICDF(32768) }, - { AOM_ICDF(7873), AOM_ICDF(8416), AOM_ICDF(17452), AOM_ICDF(17886), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(25665), AOM_ICDF(27145), AOM_ICDF(32256), AOM_ICDF(32314), - AOM_ICDF(32768) }, - { AOM_ICDF(21057), AOM_ICDF(22826), AOM_ICDF(31465), AOM_ICDF(31576), - AOM_ICDF(32768) }, - { AOM_ICDF(13633), AOM_ICDF(14885), AOM_ICDF(27873), AOM_ICDF(28088), - AOM_ICDF(32768) }, - { AOM_ICDF(8769), AOM_ICDF(9515), AOM_ICDF(21941), AOM_ICDF(22248), - AOM_ICDF(32768) }, - { AOM_ICDF(6209), AOM_ICDF(6594), AOM_ICDF(15598), AOM_ICDF(15950), - AOM_ICDF(32768) }, - { AOM_ICDF(1985), AOM_ICDF(2014), AOM_ICDF(6855), AOM_ICDF(6931), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(26817), AOM_ICDF(27824), AOM_ICDF(32362), AOM_ICDF(32399), - AOM_ICDF(32768) }, - { AOM_ICDF(21185), AOM_ICDF(22321), AOM_ICDF(31389), AOM_ICDF(31466), - AOM_ICDF(32768) }, - { AOM_ICDF(13761), AOM_ICDF(14154), AOM_ICDF(27163), AOM_ICDF(27245), - AOM_ICDF(32768) }, - { AOM_ICDF(8897), AOM_ICDF(9011), AOM_ICDF(20600), AOM_ICDF(20659), - AOM_ICDF(32768) }, - { AOM_ICDF(4673), AOM_ICDF(4774), AOM_ICDF(15044), AOM_ICDF(15131), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(28865), AOM_ICDF(29687), AOM_ICDF(32655), AOM_ICDF(32667), - AOM_ICDF(32768) }, - { AOM_ICDF(23233), AOM_ICDF(24218), AOM_ICDF(32080), AOM_ICDF(32118), - AOM_ICDF(32768) }, - { AOM_ICDF(15041), AOM_ICDF(15444), AOM_ICDF(28787), AOM_ICDF(28845), - AOM_ICDF(32768) }, - { AOM_ICDF(9921), AOM_ICDF(10248), AOM_ICDF(22818), AOM_ICDF(22944), - AOM_ICDF(32768) }, - { AOM_ICDF(7745), AOM_ICDF(7866), AOM_ICDF(16591), AOM_ICDF(16702), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), 
AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(31169), AOM_ICDF(31559), AOM_ICDF(32741), AOM_ICDF(32744), - AOM_ICDF(32768) }, - { AOM_ICDF(24769), AOM_ICDF(25583), AOM_ICDF(32347), AOM_ICDF(32370), - AOM_ICDF(32768) }, - { AOM_ICDF(15937), AOM_ICDF(16169), AOM_ICDF(29120), AOM_ICDF(29152), - AOM_ICDF(32768) }, - { AOM_ICDF(7489), AOM_ICDF(7578), AOM_ICDF(22647), AOM_ICDF(22677), - AOM_ICDF(32768) }, - { AOM_ICDF(7617), AOM_ICDF(7689), AOM_ICDF(19849), AOM_ICDF(19887), - AOM_ICDF(32768) }, - { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377), - AOM_ICDF(32768) } } } }, - { // UV plane - { // Inter - { // Band 0 - { AOM_ICDF(23232), AOM_ICDF(24301), AOM_ICDF(30231), AOM_ICDF(31582), - AOM_ICDF(32091), AOM_ICDF(32768) }, - { AOM_ICDF(7872), AOM_ICDF(11041), AOM_ICDF(22542), AOM_ICDF(27086), - AOM_ICDF(29145), AOM_ICDF(32768) }, - { AOM_ICDF(1344), AOM_ICDF(3989), AOM_ICDF(18125), AOM_ICDF(25340), - AOM_ICDF(27820), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(15937), AOM_ICDF(29000), AOM_ICDF(32210), AOM_ICDF(32434), - AOM_ICDF(32768) }, - { AOM_ICDF(12353), AOM_ICDF(26626), AOM_ICDF(31533), AOM_ICDF(31993), - AOM_ICDF(32768) }, - { AOM_ICDF(11457), AOM_ICDF(29187), AOM_ICDF(30896), AOM_ICDF(31750), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(21278), AOM_ICDF(28169), AOM_ICDF(29764), - AOM_ICDF(32768) }, - { AOM_ICDF(7489), AOM_ICDF(8855), AOM_ICDF(13365), AOM_ICDF(15620), - AOM_ICDF(32768) }, - { AOM_ICDF(4289), AOM_ICDF(4833), AOM_ICDF(8572), AOM_ICDF(10108), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(25025), AOM_ICDF(30783), AOM_ICDF(32603), AOM_ICDF(32666), - AOM_ICDF(32768) }, - { AOM_ICDF(24385), AOM_ICDF(29586), AOM_ICDF(31803), AOM_ICDF(32142), - AOM_ICDF(32768) }, - { AOM_ICDF(22337), AOM_ICDF(23002), AOM_ICDF(27573), AOM_ICDF(27903), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(12336), AOM_ICDF(21900), AOM_ICDF(22590), - AOM_ICDF(32768) }, - { AOM_ICDF(8257), AOM_ICDF(8830), 
AOM_ICDF(19986), AOM_ICDF(20298), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(10990), AOM_ICDF(18660), AOM_ICDF(18701), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(29761), AOM_ICDF(31473), AOM_ICDF(32693), AOM_ICDF(32715), - AOM_ICDF(32768) }, - { AOM_ICDF(20417), AOM_ICDF(24512), AOM_ICDF(31394), AOM_ICDF(31650), - AOM_ICDF(32768) }, - { AOM_ICDF(11713), AOM_ICDF(13283), AOM_ICDF(25819), AOM_ICDF(26206), - AOM_ICDF(32768) }, - { AOM_ICDF(13121), AOM_ICDF(14099), AOM_ICDF(21909), AOM_ICDF(22514), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(248), AOM_ICDF(9546), AOM_ICDF(9614), - AOM_ICDF(32768) }, - { AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(30657), AOM_ICDF(31885), AOM_ICDF(32691), AOM_ICDF(32715), - AOM_ICDF(32768) }, - { AOM_ICDF(19393), AOM_ICDF(26050), AOM_ICDF(31698), AOM_ICDF(31988), - AOM_ICDF(32768) }, - { AOM_ICDF(15809), AOM_ICDF(15863), AOM_ICDF(24985), AOM_ICDF(25008), - AOM_ICDF(32768) }, - { AOM_ICDF(23489), AOM_ICDF(28138), AOM_ICDF(32751), AOM_ICDF(32756), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593), - AOM_ICDF(32768) }, - { AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(32705), AOM_ICDF(32744), AOM_ICDF(32766), AOM_ICDF(32767), - AOM_ICDF(32768) }, - { AOM_ICDF(21953), AOM_ICDF(24962), AOM_ICDF(32156), AOM_ICDF(32246), - AOM_ICDF(32768) }, - { AOM_ICDF(13121), AOM_ICDF(15358), AOM_ICDF(26284), AOM_ICDF(26835), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(7417), AOM_ICDF(20132), AOM_ICDF(20885), - AOM_ICDF(32768) }, - { AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338), - AOM_ICDF(32768) } } }, - { // Inter - { // Band 0 - { AOM_ICDF(25280), AOM_ICDF(25678), AOM_ICDF(32446), AOM_ICDF(32622), - AOM_ICDF(32724), AOM_ICDF(32768) }, - { 
AOM_ICDF(10560), AOM_ICDF(11822), AOM_ICDF(28682), AOM_ICDF(29919), - AOM_ICDF(31276), AOM_ICDF(32768) }, - { AOM_ICDF(3264), AOM_ICDF(5170), AOM_ICDF(21779), AOM_ICDF(24026), - AOM_ICDF(27905), AOM_ICDF(32768) } }, - { // Band 1 - { AOM_ICDF(24257), AOM_ICDF(30554), AOM_ICDF(32719), AOM_ICDF(32738), - AOM_ICDF(32768) }, - { AOM_ICDF(17217), AOM_ICDF(27413), AOM_ICDF(32617), AOM_ICDF(32667), - AOM_ICDF(32768) }, - { AOM_ICDF(22977), AOM_ICDF(27600), AOM_ICDF(32482), AOM_ICDF(32552), - AOM_ICDF(32768) }, - { AOM_ICDF(16833), AOM_ICDF(24360), AOM_ICDF(30746), AOM_ICDF(31293), - AOM_ICDF(32768) }, - { AOM_ICDF(17089), AOM_ICDF(20060), AOM_ICDF(28880), AOM_ICDF(29370), - AOM_ICDF(32768) }, - { AOM_ICDF(10945), AOM_ICDF(11009), AOM_ICDF(21900), AOM_ICDF(21932), - AOM_ICDF(32768) } }, - { // Band 2 - { AOM_ICDF(27201), AOM_ICDF(30217), AOM_ICDF(32736), AOM_ICDF(32745), - AOM_ICDF(32768) }, - { AOM_ICDF(22721), AOM_ICDF(27676), AOM_ICDF(32749), AOM_ICDF(32754), - AOM_ICDF(32768) }, - { AOM_ICDF(5057), AOM_ICDF(12431), AOM_ICDF(25246), AOM_ICDF(26620), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048), - AOM_ICDF(32768) }, - { AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894), - AOM_ICDF(32768) }, - { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351), - AOM_ICDF(32768) } }, - { // Band 3 - { AOM_ICDF(27713), AOM_ICDF(30739), AOM_ICDF(32759), AOM_ICDF(32762), - AOM_ICDF(32768) }, - { AOM_ICDF(26177), AOM_ICDF(30430), AOM_ICDF(32756), AOM_ICDF(32760), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707), - AOM_ICDF(32768) }, - { AOM_ICDF(9409), AOM_ICDF(9528), AOM_ICDF(21591), AOM_ICDF(21646), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) }, - { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498), - AOM_ICDF(32768) } }, - { // Band 4 - { AOM_ICDF(28993), AOM_ICDF(31156), AOM_ICDF(32747), AOM_ICDF(32753), - 
AOM_ICDF(32768) }, - { AOM_ICDF(25153), AOM_ICDF(28701), AOM_ICDF(32754), AOM_ICDF(32758), - AOM_ICDF(32768) }, - { AOM_ICDF(16449), AOM_ICDF(16544), AOM_ICDF(32737), AOM_ICDF(32738), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048), - AOM_ICDF(32768) }, - { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479), - AOM_ICDF(32768) }, - { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979), - AOM_ICDF(32768) } }, - { // Band 5 - { AOM_ICDF(30785), AOM_ICDF(32088), AOM_ICDF(32765), AOM_ICDF(32766), - AOM_ICDF(32768) }, - { AOM_ICDF(22977), AOM_ICDF(26623), AOM_ICDF(32750), AOM_ICDF(32754), - AOM_ICDF(32768) }, - { AOM_ICDF(21953), AOM_ICDF(21954), AOM_ICDF(22017), AOM_ICDF(22049), - AOM_ICDF(32768) }, - { AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009), - AOM_ICDF(32768) }, - { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866), - AOM_ICDF(32768) }, - { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533), - AOM_ICDF(32768) } } } } -}; - -/* clang-format on */ -#endif // !CONFIG_Q_ADAPT_PROBS - -static void build_tail_cdfs(aom_cdf_prob cdf_tail[CDF_SIZE(ENTROPY_TOKENS)], - aom_cdf_prob cdf_head[CDF_SIZE(ENTROPY_TOKENS)], - int band_zero) { - int probNZ, prob1, prob_idx, i; - int phead[HEAD_TOKENS + 1], sum; - const int is_dc = !!band_zero; - aom_cdf_prob prev_cdf; - prev_cdf = 0; - for (i = 0; i < HEAD_TOKENS + is_dc; ++i) { - phead[i] = AOM_ICDF(cdf_head[i]) - prev_cdf; - prev_cdf = AOM_ICDF(cdf_head[i]); - } - // Do the tail - probNZ = CDF_PROB_TOP - phead[ZERO_TOKEN + is_dc] - (is_dc ? phead[0] : 0); - prob1 = phead[is_dc + ONE_TOKEN_EOB] + phead[is_dc + ONE_TOKEN_NEOB]; - prob_idx = - AOMMIN(COEFF_PROB_MODELS - 1, AOMMAX(0, ((256 * prob1) / probNZ) - 1)); - - sum = 0; - for (i = 0; i < TAIL_TOKENS; ++i) { - sum += av1_pareto8_tail_probs[prob_idx][i]; - cdf_tail[i] = AOM_ICDF(sum); - } -} - -#if !CONFIG_Q_ADAPT_PROBS -// FIXME. Optimize for TX_2X2 and TX_64X64. 
-static void av1_default_coef_cdfs(FRAME_CONTEXT *fc) { -#if CONFIG_CHROMA_2X2 - av1_copy(fc->coef_head_cdfs[TX_2X2], default_coef_head_cdf_4x4); -#endif // CONFIG_CHROMA_2X2 - av1_copy(fc->coef_head_cdfs[TX_4X4], default_coef_head_cdf_4x4); - av1_copy(fc->coef_head_cdfs[TX_8X8], default_coef_head_cdf_8x8); - av1_copy(fc->coef_head_cdfs[TX_16X16], default_coef_head_cdf_16x16); - av1_copy(fc->coef_head_cdfs[TX_32X32], default_coef_head_cdf_32x32); -#if CONFIG_TX64X64 - av1_copy(fc->coef_head_cdfs[TX_64X64], default_coef_head_cdf_32x32); -#endif // CONFIG_TX64X64 -} -#endif // !CONFIG_Q_ADAPT_PROBS - -void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) { - /* Build the tail based on a Pareto distribution */ - TX_SIZE t; - int i, j, k, l; - for (t = 0; t < TX_SIZES; ++t) - for (i = 0; i < PLANE_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) - build_tail_cdfs(fc->coef_tail_cdfs[t][i][j][k][l], - fc->coef_head_cdfs[t][i][j][k][l], k == 0); +static int get_q_ctx(int q) { + if (q <= 20) return 0; + if (q <= 60) return 1; + if (q <= 120) return 2; + return 3; } void av1_default_coef_probs(AV1_COMMON *cm) { -#if CONFIG_Q_ADAPT_PROBS - const int index = AOMMIN(TOKEN_CDF_Q_CTXS - 1, cm->base_qindex / 64); -#if CONFIG_CHROMA_2X2 - av1_copy(cm->fc->coef_head_cdfs[TX_2X2], - (*av1_default_qctx_coef_cdfs[index])[TX_4X4]); -#endif // CONFIG_CHROMA_2X2 - av1_copy(cm->fc->coef_head_cdfs[TX_4X4], - (*av1_default_qctx_coef_cdfs[index])[TX_4X4]); - av1_copy(cm->fc->coef_head_cdfs[TX_8X8], - (*av1_default_qctx_coef_cdfs[index])[TX_8X8]); - av1_copy(cm->fc->coef_head_cdfs[TX_16X16], - (*av1_default_qctx_coef_cdfs[index])[TX_16X16]); - av1_copy(cm->fc->coef_head_cdfs[TX_32X32], - (*av1_default_qctx_coef_cdfs[index])[TX_32X32]); -#if CONFIG_TX64X64 - av1_copy(cm->fc->coef_head_cdfs[TX_64X64], - (*av1_default_qctx_coef_cdfs[index])[TX_32X32]); -#endif // CONFIG_TX64X64 -#else - /* Load the head tokens */ - 
av1_default_coef_cdfs(cm->fc); -#endif // CONFIG_Q_ADAPT_PROBS - av1_coef_pareto_cdfs(cm->fc); + const int index = get_q_ctx(cm->base_qindex); +#if CONFIG_ENTROPY_STATS + cm->coef_cdf_category = index; +#endif + + av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); + av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); + av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); + av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_eob_cdf, + av1_default_coeff_base_eob_multi_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); } -#if CONFIG_LV_MAP -void av1_adapt_coef_probs(AV1_COMMON *cm) { - unsigned int count_sat, update_factor; - if (!frame_is_intra_only(cm) && cm->last_frame_type == KEY_FRAME) { - update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ - count_sat = COEF_COUNT_SAT_AFTER_KEY; - } else { - update_factor = COEF_MAX_UPDATE_FACTOR; - count_sat = COEF_COUNT_SAT; +static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, + int cdf_stride, int nsymbs) { + for (int i = 0; i < num_cdfs; i++) { + cdf_ptr[i * cdf_stride + nsymbs] = 0; } - av1_adapt_txb_probs(cm, count_sat, update_factor); } -#endif // CONFIG_LV_MAP -static void av1_average_cdf(aom_cdf_prob *cdf_ptr[], aom_cdf_prob *fc_cdf_ptr, - int cdf_size, const int num_tiles) { - int i; - for (i = 0; i < cdf_size;) { - do { - int sum = 0; 
- int j; - assert(i < cdf_size); - for (j = 0; j < num_tiles; ++j) sum += AOM_ICDF(cdf_ptr[j][i]); - fc_cdf_ptr[i] = AOM_ICDF(sum / num_tiles); - } while (fc_cdf_ptr[i++] != AOM_ICDF(CDF_PROB_TOP)); - // Zero symbol counts for the next frame - assert(i < cdf_size); - fc_cdf_ptr[i++] = 0; - // Skip trailing zeros until the start of the next CDF. - for (; i < cdf_size && fc_cdf_ptr[i] == 0; ++i) { - } +#define RESET_CDF_COUNTER(cname, nsymbs) \ + RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) + +#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ + int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ + } while (0) + +static void reset_nmv_counter(nmv_context *nmv) { + RESET_CDF_COUNTER(nmv->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); + RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); } } -#define AVERAGE_TILE_CDFS(cname) \ - for (i = 0; i < num_tiles; ++i) \ - cdf_ptr[i] = (aom_cdf_prob *)&ec_ctxs[i]->cname; \ - fc_cdf_ptr = (aom_cdf_prob *)&fc->cname; \ - cdf_size = (int)sizeof(fc->cname) / sizeof(aom_cdf_prob); \ - av1_average_cdf(cdf_ptr, fc_cdf_ptr, cdf_size, num_tiles); - -void av1_average_tile_coef_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[], - aom_cdf_prob *cdf_ptr[], int num_tiles) { - int i, cdf_size; - - aom_cdf_prob *fc_cdf_ptr; - -#if CONFIG_LV_MAP - AVERAGE_TILE_CDFS(txb_skip_cdf) - AVERAGE_TILE_CDFS(nz_map_cdf) - AVERAGE_TILE_CDFS(eob_flag_cdf) - 
AVERAGE_TILE_CDFS(dc_sign_cdf) - AVERAGE_TILE_CDFS(coeff_base_cdf) - AVERAGE_TILE_CDFS(coeff_lps_cdf) -#if BR_NODE - AVERAGE_TILE_CDFS(coeff_br_cdf) -#endif -#if CONFIG_CTX1D - AVERAGE_TILE_CDFS(eob_mode_cdf) - AVERAGE_TILE_CDFS(empty_line_cdf) - AVERAGE_TILE_CDFS(hv_eob_cdf) -#endif -#else - AVERAGE_TILE_CDFS(coef_head_cdfs) - AVERAGE_TILE_CDFS(coef_tail_cdfs) -#endif -} - -void av1_average_tile_mv_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[], - aom_cdf_prob *cdf_ptr[], int num_tiles) { - int i, k, cdf_size; - - aom_cdf_prob *fc_cdf_ptr; - - int j; - for (j = 0; j < NMV_CONTEXTS; ++j) { - AVERAGE_TILE_CDFS(nmvc[j].joint_cdf) - - for (k = 0; k < 2; ++k) { - AVERAGE_TILE_CDFS(nmvc[j].comps[k].class_cdf) - AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_fp_cdf) - AVERAGE_TILE_CDFS(nmvc[j].comps[k].fp_cdf) -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(nmvc[j].comps[k].hp_cdf) - AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_hp_cdf) - AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_cdf) - AVERAGE_TILE_CDFS(nmvc[j].comps[k].bits_cdf) -#endif - } +void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { + RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); + RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); + RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); + RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); + RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); + RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); + RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); + RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); + RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); + RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); + RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); + RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); + RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); + RESET_CDF_COUNTER(fc->newmv_cdf, 2); + RESET_CDF_COUNTER(fc->zeromv_cdf, 2); + RESET_CDF_COUNTER(fc->refmv_cdf, 2); + RESET_CDF_COUNTER(fc->drl_cdf, 2); + RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1); + 
RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); + RESET_CDF_COUNTER(fc->interintra_cdf, 2); + RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); + RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); + RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); + RESET_CDF_COUNTER(fc->obmc_cdf, 2); + RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); + RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); } -} - -void av1_average_tile_intra_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[], - aom_cdf_prob *cdf_ptr[], int num_tiles) { - int i, cdf_size; - - aom_cdf_prob *fc_cdf_ptr; - - AVERAGE_TILE_CDFS(tx_size_cdf) - - AVERAGE_TILE_CDFS(intra_ext_tx_cdf) - AVERAGE_TILE_CDFS(inter_ext_tx_cdf) - - AVERAGE_TILE_CDFS(seg.tree_cdf) -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(seg.pred_cdf) -#endif - AVERAGE_TILE_CDFS(uv_mode_cdf) - -#if CONFIG_CFL - AVERAGE_TILE_CDFS(cfl_sign_cdf) - AVERAGE_TILE_CDFS(cfl_alpha_cdf) -#endif - - AVERAGE_TILE_CDFS(partition_cdf) - - AVERAGE_TILE_CDFS(delta_q_cdf) -#if CONFIG_EXT_DELTA_Q - AVERAGE_TILE_CDFS(delta_lf_cdf) -#endif -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - AVERAGE_TILE_CDFS(intra_filter_cdf) -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(skip_cdfs) -#if CONFIG_VAR_TX - AVERAGE_TILE_CDFS(txfm_partition_cdf) -#endif -#endif // CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(palette_y_size_cdf) - AVERAGE_TILE_CDFS(palette_uv_size_cdf) - AVERAGE_TILE_CDFS(palette_y_color_index_cdf) - AVERAGE_TILE_CDFS(palette_uv_color_index_cdf) -#if CONFIG_MRC_TX - AVERAGE_TILE_CDFS(mrc_mask_intra_cdf) -#endif // CONFIG_MRC_TX -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(palette_y_mode_cdf) - 
AVERAGE_TILE_CDFS(palette_uv_mode_cdf) -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - AVERAGE_TILE_CDFS(quarter_tx_size_cdf) -#endif -#endif -#if CONFIG_LPF_SB - AVERAGE_TILE_CDFS(lpf_reuse_cdf); - AVERAGE_TILE_CDFS(lpf_delta_cdf); - AVERAGE_TILE_CDFS(lpf_sign_cdf); -#endif // CONFIG_LPF_SB -} - -void av1_average_tile_inter_cdfs(AV1_COMMON *cm, FRAME_CONTEXT *fc, - FRAME_CONTEXT *ec_ctxs[], - aom_cdf_prob *cdf_ptr[], int num_tiles) { - int i, cdf_size; - - aom_cdf_prob *fc_cdf_ptr; - -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(comp_inter_cdf) -#if CONFIG_EXT_REFS - AVERAGE_TILE_CDFS(comp_ref_cdf) - AVERAGE_TILE_CDFS(comp_bwdref_cdf) -#endif -#endif - -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(single_ref_cdf) - - AVERAGE_TILE_CDFS(newmv_cdf) - AVERAGE_TILE_CDFS(zeromv_cdf) - AVERAGE_TILE_CDFS(refmv_cdf) - AVERAGE_TILE_CDFS(drl_cdf) -#if CONFIG_EXT_COMP_REFS - AVERAGE_TILE_CDFS(uni_comp_ref_cdf) - AVERAGE_TILE_CDFS(comp_ref_type_cdf) -#endif -#endif - - // FIXME: cdfs not defined for super_tx - - AVERAGE_TILE_CDFS(inter_compound_mode_cdf) - -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - AVERAGE_TILE_CDFS(compound_type_cdf) -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - AVERAGE_TILE_CDFS(inter_singleref_comp_mode_cdf) -#endif - -#if CONFIG_INTERINTRA -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(interintra_cdf) - AVERAGE_TILE_CDFS(wedge_interintra_cdf) -#endif - AVERAGE_TILE_CDFS(interintra_mode_cdf) -#endif - - /* NB: kf_y_cdf is discarded after use, so no need - for backwards update */ - AVERAGE_TILE_CDFS(y_mode_cdf) - - if (cm->interp_filter == SWITCHABLE) { - AVERAGE_TILE_CDFS(switchable_interp_cdf) - } -#if CONFIG_NEW_MULTISYMBOL - AVERAGE_TILE_CDFS(intra_inter_cdf) -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - AVERAGE_TILE_CDFS(motion_mode_cdf) -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - AVERAGE_TILE_CDFS(obmc_cdf) -#endif -#endif -#endif -#if CONFIG_MRC_TX - 
AVERAGE_TILE_CDFS(mrc_mask_inter_cdf) -#endif // CONFIG_MRC_TX -#if CONFIG_LPF_SB - AVERAGE_TILE_CDFS(lpf_reuse_cdf); - AVERAGE_TILE_CDFS(lpf_delta_cdf); - AVERAGE_TILE_CDFS(lpf_sign_cdf); -#endif // CONFIG_LPF_SB -} - -#if CONFIG_PVQ -// Averaging PVQ's expected values for symbol coding -static void av1_average_pvq_ex(int *cxt_ptr[], int *fc_cxt_ptr, int cxt_size, - const int num_tiles) { - int i, j; - for (i = 0; i < cxt_size; ++i) { - int sum = 0; - for (j = 0; j < num_tiles; ++j) sum += cxt_ptr[j][i]; - fc_cxt_ptr[i] = sum / num_tiles; - } -} - -#define AVERAGE_TILE_PVQ_EX(cname) \ - for (i = 0; i < num_tiles; ++i) cxt_ptr[i] = (int *)&ec_ctxs[i]->cname; \ - fc_cxt_ptr = (int *)&fc->cname; \ - cxt_size = (int)sizeof(fc->cname) / sizeof(int); \ - av1_average_pvq_ex(cxt_ptr, fc_cxt_ptr, cxt_size, num_tiles); - -void av1_default_pvq_probs(AV1_COMMON *cm) { - od_adapt_ctx *adapt = &cm->fc->pvq_context; - - // Init with flat probabilities. - od_adapt_ctx_reset(adapt, 0); - - // TODO(yushin): Prepare offline cdf and context table for PVQ, - // i.e. od_adapt_ctx, then load them from table, - // for example od_adapt_ctx default_pvq_context. 
- // Then do sth like this: - // av1_copy(cm->fc->pvq_context, default_pvq_context); -} - -void av1_average_tile_pvq_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[], - const int num_tiles) { - int i, j, cdf_size, cxt_size; - - aom_cdf_prob *cdf_ptr[MAX_TILE_ROWS * MAX_TILE_COLS]; - aom_cdf_prob *fc_cdf_ptr; - int *cxt_ptr[MAX_TILE_ROWS * MAX_TILE_COLS]; - int *fc_cxt_ptr; - - AVERAGE_TILE_PVQ_EX(pvq_context.ex_dc) - AVERAGE_TILE_PVQ_EX(pvq_context.ex_g) - - for (j = 0; j < OD_NPLANES_MAX; j++) { - AVERAGE_TILE_CDFS(pvq_context.model_dc[j].cdf) + RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); + RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); + RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); + RESET_CDF_COUNTER(fc->single_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); + RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); + RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); + RESET_CDF_COUNTER(fc->compound_index_cdf, 2); + RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); + RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); + RESET_CDF_COUNTER(fc->skip_cdfs, 2); + RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); + reset_nmv_counter(&fc->nmvc); + reset_nmv_counter(&fc->ndvc); + RESET_CDF_COUNTER(fc->intrabc_cdf, 2); + RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); + RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); + RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); + RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); + RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); + RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); + RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); + RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, + CDF_SIZE(UV_INTRA_MODES)); + RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i 
< 4) { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); + } else if (i < 16) { + RESET_CDF_COUNTER(fc->partition_cdf[i], 10); + } else { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); + } } - - AVERAGE_TILE_CDFS(pvq_context.skip_cdf) - - AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_codeword_ctx.pvq_adapt) - AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_k1_cdf) - AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_split_cdf) - - for (j = 0; j < 3; j++) { - AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_param_model[j].cdf) + RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); + RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); + RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); + RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); + RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); } - - AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_ext) - AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_exg) - AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_gaintheta_cdf) - AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_skip_dir_cdf) + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); + RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); } -#endif // CONFIG_PVQ diff --git 
a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h index 679aae837..ef944c5a0 100644 --- a/third_party/aom/av1/common/entropy.h +++ b/third_party/aom/av1/common/entropy.h @@ -12,7 +12,8 @@ #ifndef AV1_COMMON_ENTROPY_H_ #define AV1_COMMON_ENTROPY_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/prob.h" @@ -24,82 +25,35 @@ extern "C" { #endif -#define DIFF_UPDATE_PROB 252 -#define GROUP_DIFF_UPDATE_PROB 252 - -#if CONFIG_Q_ADAPT_PROBS #define TOKEN_CDF_Q_CTXS 4 -#endif // CONFIG_Q_ADAPT_PROBS - -// Coefficient token alphabet -#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 -#define ONE_TOKEN 1 // 1 Extra Bits 0+1 -#define TWO_TOKEN 2 // 2 Extra Bits 0+1 -#define THREE_TOKEN 3 // 3 Extra Bits 0+1 -#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 -#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 -#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 -#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 -#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 -#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 -#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 -#define EOB_TOKEN 11 // EOB Extra Bits 0+0 -#define NO_EOB 0 // Not an end-of-block -#define EARLY_EOB 1 // End of block before the last position -#define LAST_EOB 2 // End of block in the last position (implicit) -#define BLOCK_Z_TOKEN 255 // block zero -#define HEAD_TOKENS 5 -#define TAIL_TOKENS 9 -#define ONE_TOKEN_EOB 1 -#define ONE_TOKEN_NEOB 2 -#define TWO_TOKEN_PLUS_EOB 3 -#define TWO_TOKEN_PLUS_NEOB 4 -#define ENTROPY_TOKENS 12 - -#define ENTROPY_NODES 11 -#if CONFIG_LV_MAP #define TXB_SKIP_CONTEXTS 13 -#if CONFIG_CTX1D -#define EOB_COEF_CONTEXTS_2D 25 -#define EOB_COEF_CONTEXTS_1D 25 -#define EOB_COEF_CONTEXTS \ - (EOB_COEF_CONTEXTS_2D + EOB_COEF_CONTEXTS_1D + EOB_COEF_CONTEXTS_1D) -#else // CONFIG_CTX1D -#define EOB_COEF_CONTEXTS 25 -#endif // CONFIG_CTX1D +#define EOB_COEF_CONTEXTS 9 -#if CONFIG_EXT_TX -#define SIG_COEF_CONTEXTS_2D 16 +#define 
SIG_COEF_CONTEXTS_2D 26 #define SIG_COEF_CONTEXTS_1D 16 -#define SIG_COEF_CONTEXTS \ - (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D + SIG_COEF_CONTEXTS_1D) -#else // CONFIG_EXT_TX -#define SIG_COEF_CONTEXTS_2D 16 -#define SIG_COEF_CONTEXTS 16 -#endif // CONFIG_EXT_TX -#define COEFF_BASE_CONTEXTS 42 +#define SIG_COEF_CONTEXTS_EOB 4 +#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) + +#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) #define DC_SIGN_CONTEXTS 3 #define BR_TMP_OFFSET 12 #define BR_REF_CAT 4 -#define LEVEL_CONTEXTS (BR_TMP_OFFSET * BR_REF_CAT) +#define LEVEL_CONTEXTS 21 #define NUM_BASE_LEVELS 2 -#define COEFF_BASE_RANGE (16 - NUM_BASE_LEVELS) -#define BASE_RANGE_SETS 3 + +#define BR_CDF_SIZE (4) +#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) #define COEFF_CONTEXT_BITS 6 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) +#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) #define BASE_CONTEXT_POSITION_NUM 12 -#if CONFIG_CTX1D -#define EMPTY_LINE_CONTEXTS 5 -#define HV_EOB_CONTEXTS 24 -#endif // CONFIG_CTX1D - typedef enum TX_CLASS { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, @@ -107,161 +61,21 @@ typedef enum TX_CLASS { TX_CLASSES = 3, } TX_CLASS; -#endif - -DECLARE_ALIGNED(16, extern const uint8_t, av1_pt_energy_class[ENTROPY_TOKENS]); - -#define CAT1_MIN_VAL 5 -#define CAT2_MIN_VAL 7 -#define CAT3_MIN_VAL 11 -#define CAT4_MIN_VAL 19 -#define CAT5_MIN_VAL 35 -#define CAT6_MIN_VAL 67 - -// Extra bit probabilities. 
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat1_prob[1]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_cat2_prob[2]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_cat3_prob[3]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_cat4_prob[4]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_cat5_prob[5]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_cat6_prob[18]); -#if CONFIG_NEW_MULTISYMBOL -extern const aom_cdf_prob *av1_cat1_cdf[]; -extern const aom_cdf_prob *av1_cat2_cdf[]; -extern const aom_cdf_prob *av1_cat3_cdf[]; -extern const aom_cdf_prob *av1_cat4_cdf[]; -extern const aom_cdf_prob *av1_cat5_cdf[]; -extern const aom_cdf_prob *av1_cat6_cdf[]; -#endif - -#define EOB_MODEL_TOKEN 3 - -typedef struct { -#if CONFIG_NEW_MULTISYMBOL - const aom_cdf_prob **cdf; -#else - const aom_prob *prob; -#endif - int len; - int base_val; - const int16_t *cost; -} av1_extra_bit; - -// indexed by token value -extern const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS]; - -static INLINE int av1_get_cat6_extrabits_size(TX_SIZE tx_size, - aom_bit_depth_t bit_depth) { - tx_size = txsize_sqr_up_map[tx_size]; -#if CONFIG_TX64X64 - // TODO(debargha): Does TX_64X64 require an additional extrabit? - if (tx_size > TX_32X32) tx_size = TX_32X32; -#endif -#if CONFIG_CHROMA_2X2 - int tx_offset = (tx_size < TX_4X4) ? 0 : (int)(tx_size - TX_4X4); -#else - int tx_offset = (int)(tx_size - TX_4X4); -#endif - int bits = (int)bit_depth + 3 + tx_offset; -#if CONFIG_NEW_MULTISYMBOL - // Round up - bits = AOMMIN((int)sizeof(av1_cat6_prob), ((bits + 3) & ~3)); -#endif - assert(bits <= (int)sizeof(av1_cat6_prob)); - return bits; -} - #define DCT_MAX_VALUE 16384 -#if CONFIG_HIGHBITDEPTH #define DCT_MAX_VALUE_HIGH10 65536 #define DCT_MAX_VALUE_HIGH12 262144 -#endif // CONFIG_HIGHBITDEPTH - -/* Coefficients are predicted via a 3-dimensional probability table. */ +/* Coefficients are predicted via a 3-dimensional probability table indexed on + * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ #define REF_TYPES 2 // intra=0, inter=1 -/* Middle dimension reflects the coefficient position within the transform. */ -#define COEF_BANDS 6 - -/* Inside dimension is measure of nearby complexity, that reflects the energy - of nearby coefficients are nonzero. For the first coefficient (DC, unless - block type is 0), we look at the (already encoded) blocks above and to the - left of the current block. The context index is then the number (0,1,or 2) - of these blocks having nonzero coefficients. - After decoding a coefficient, the measure is determined by the size of the - most recently decoded coefficient. - Note that the intuitive meaning of this measure changes as coefficients - are decoded, e.g., prior to the first token, a zero means that my neighbors - are empty while, after the first token, because of the use of end-of-block, - a zero means we just decoded a zero and hence guarantees that a non-zero - coefficient will appear later in this block. However, this shift - in meaning is perfectly OK because our context depends also on the - coefficient band (and since zigzag positions 0, 1, and 2 are in - distinct bands). */ - -#define COEFF_CONTEXTS 6 -#define COEFF_CONTEXTS0 3 // for band 0 -#define BAND_COEFF_CONTEXTS(band) \ - ((band) == 0 ? COEFF_CONTEXTS0 : COEFF_CONTEXTS) - -#define SUBEXP_PARAM 4 /* Subexponential code parameter */ -#define MODULUS_PARAM 13 /* Modulus parameter */ - struct AV1Common; struct frame_contexts; +void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); void av1_default_coef_probs(struct AV1Common *cm); -#if CONFIG_LV_MAP -void av1_adapt_coef_probs(struct AV1Common *cm); -#endif // CONFIG_LV_MAP - -// This is the index in the scan order beyond which all coefficients for -// 8x8 transform and above are in the top band. 
-// This macro is currently unused but may be used by certain implementations -#define MAXBAND_INDEX 21 -DECLARE_ALIGNED(16, extern const uint8_t, - av1_coefband_trans_8x8plus[MAX_TX_SQUARE]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x8_8x4[32]); -DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x4[16]); - -DECLARE_ALIGNED(16, extern const uint16_t, band_count_table[TX_SIZES_ALL][8]); -DECLARE_ALIGNED(16, extern const uint16_t, - band_cum_count_table[TX_SIZES_ALL][8]); - -static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: return av1_coefband_trans_4x4; - case TX_8X4: - case TX_4X8: return av1_coefband_trans_4x8_8x4; - default: return av1_coefband_trans_8x8plus; - } -} - -// 128 lists of probabilities are stored for the following ONE node probs: -// 1, 3, 5, 7, ..., 253, 255 -// In between probabilities are interpolated linearly - -#define COEFF_PROB_MODELS 255 - -#define UNCONSTRAINED_NODES 3 - -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) -#define TAIL_NODES (MODEL_NODES + 1) -extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)]; -extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; - -typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] - [CDF_SIZE(ENTROPY_TOKENS)]; -extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS] - [ENTROPY_TOKENS - 2]; -extern const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS] - [ENTROPY_TOKENS - 3]; struct frame_contexts; -void av1_coef_head_cdfs(struct frame_contexts *fc); -void av1_coef_pareto_cdfs(struct frame_contexts *fc); - typedef char ENTROPY_CONTEXT; static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, @@ -273,93 +87,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; -#if CONFIG_CHROMA_2X2 - switch (tx_size) { - case TX_2X2: - 
above_ec = a[0] != 0; - left_ec = l[0] != 0; - break; - case TX_4X4: - above_ec = !!*(const uint16_t *)a; - left_ec = !!*(const uint16_t *)l; - break; - case TX_4X8: - above_ec = !!*(const uint16_t *)a; - left_ec = !!*(const uint32_t *)l; - break; - case TX_8X4: - above_ec = !!*(const uint32_t *)a; - left_ec = !!*(const uint16_t *)l; - break; - case TX_8X8: - above_ec = !!*(const uint32_t *)a; - left_ec = !!*(const uint32_t *)l; - break; - case TX_8X16: - above_ec = !!*(const uint32_t *)a; - left_ec = !!*(const uint64_t *)l; - break; - case TX_16X8: - above_ec = !!*(const uint64_t *)a; - left_ec = !!*(const uint32_t *)l; - break; - case TX_16X16: - above_ec = !!*(const uint64_t *)a; - left_ec = !!*(const uint64_t *)l; - break; - case TX_16X32: - above_ec = !!*(const uint64_t *)a; - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); - break; - case TX_32X16: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); - left_ec = !!*(const uint64_t *)l; - break; - case TX_32X32: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); - break; -#if CONFIG_TX64X64 - case TX_64X64: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8) | - *(const uint64_t *)(a + 16) | *(const uint64_t *)(a + 24)); - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8) | - *(const uint64_t *)(l + 16) | *(const uint64_t *)(l + 24)); - break; - case TX_32X64: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8) | - *(const uint64_t *)(l + 16) | *(const uint64_t *)(l + 24)); - break; - case TX_64X32: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8) | - *(const uint64_t *)(a + 16) | *(const uint64_t *)(a + 24)); - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); - break; -#endif // CONFIG_TX64X64 -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - 
case TX_4X16: - above_ec = !!*(const uint16_t *)a; - left_ec = !!*(const uint64_t *)l; - break; - case TX_16X4: - above_ec = !!*(const uint64_t *)a; - left_ec = !!*(const uint16_t *)l; - break; - case TX_8X32: - above_ec = !!*(const uint32_t *)a; - left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); - break; - case TX_32X8: - above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); - left_ec = !!*(const uint32_t *)l; - break; -#endif - default: assert(0 && "Invalid transform size."); break; - } - return combine_entropy_contexts(above_ec, left_ec); -#endif // CONFIG_CHROMA_2X2 - switch (tx_size) { case TX_4X4: above_ec = a[0] != 0; @@ -401,7 +128,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, above_ec = !!*(const uint64_t *)a; left_ec = !!*(const uint64_t *)l; break; -#if CONFIG_TX64X64 case TX_64X64: above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); @@ -414,8 +140,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); left_ec = !!*(const uint64_t *)l; break; -#endif // CONFIG_TX64X64 -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) case TX_4X16: above_ec = a[0] != 0; left_ec = !!*(const uint32_t *)l; @@ -432,55 +156,24 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, above_ec = !!*(const uint64_t *)a; left_ec = !!*(const uint16_t *)l; break; -#endif + case TX_16X64: + above_ec = !!*(const uint32_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X16: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint32_t *)l; + break; default: assert(0 && "Invalid transform size."); break; } return combine_entropy_contexts(above_ec, left_ec); } -#define COEF_COUNT_SAT 24 -#define COEF_MAX_UPDATE_FACTOR 112 -#define 
COEF_COUNT_SAT_AFTER_KEY 24 -#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 - -#if CONFIG_ADAPT_SCAN -#define ADAPT_SCAN_PROB_PRECISION 10 -// 1/8 update rate -#define ADAPT_SCAN_UPDATE_LOG_RATE 3 -#define ADAPT_SCAN_UPDATE_RATE \ - (1 << (ADAPT_SCAN_PROB_PRECISION - ADAPT_SCAN_UPDATE_LOG_RATE)) -#endif - -static INLINE aom_prob av1_merge_probs(aom_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - return merge_probs(pre_prob, ct, count_sat, max_update_factor); -} - -static INLINE aom_prob av1_mode_mv_merge_probs(aom_prob pre_prob, - const unsigned int ct[2]) { - return mode_mv_merge_probs(pre_prob, ct); +static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { + return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> + 1); } -void av1_average_tile_coef_cdfs(struct frame_contexts *fc, - struct frame_contexts *ec_ctxs[], - aom_cdf_prob *cdf_ptrs[], int num_tiles); -void av1_average_tile_mv_cdfs(struct frame_contexts *fc, - struct frame_contexts *ec_ctxs[], - aom_cdf_prob *cdf_ptrs[], int num_tiles); -void av1_average_tile_intra_cdfs(struct frame_contexts *fc, - struct frame_contexts *ec_ctxs[], - aom_cdf_prob *cdf_ptrs[], int num_tiles); -void av1_average_tile_inter_cdfs(struct AV1Common *cm, - struct frame_contexts *fc, - struct frame_contexts *ec_ctxs[], - aom_cdf_prob *cdf_ptrs[], int num_tiles); -#if CONFIG_PVQ -void av1_default_pvq_probs(struct AV1Common *cm); -void av1_average_tile_pvq_cdfs(struct frame_contexts *fc, - struct frame_contexts *ec_ctxs[], int num_tiles); -#endif // CONFIG_PVQ #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c index 207f1e245..41dc30ddb 100644 --- a/third_party/aom/av1/common/entropymode.c +++ b/third_party/aom/av1/common/entropymode.c @@ -15,6203 +15,1089 @@ #include "av1/common/scan.h" #include "av1/common/onyxc_int.h" #include "av1/common/seg_common.h" -#if 
CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif -#if CONFIG_LV_MAP -#include "av1/common/txb_common.h" -const aom_prob default_txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 }, -#endif - { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 }, - { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 }, - { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 }, - { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 }, -}; -const aom_prob default_dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS] = { - { 125, 102, 147 }, { 119, 101, 135 }, +static const aom_cdf_prob + default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( + INTRA_MODES)] = { + { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, + 24189, 28165, 29093, 30466) }, + { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, + 24434, 28658, 30172, 31409) }, + { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, + 26160, 29336, 29929, 31567) }, + { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, + 24746, 29585, 30958, 32462) }, + { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, + 26437, 30261, 31073, 32475) } }, + { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, + 25381, 29014, 30482, 31436) }, + { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, + 27610, 29905, 31276, 31794) }, + { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, + 24469, 27915, 29090, 30492) }, + { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, + 24649, 29153, 31096, 32210) }, + { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, + 26001, 29675, 30981, 31994) } }, + { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, + 25729, 29538, 30305, 32077) }, + { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, + 23219, 27743, 
29211, 30907) }, + { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, + 30467, 30794, 32086) }, + { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, + 23878, 28975, 30287, 32252) }, + { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, + 30072, 30737, 32463) } }, + { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, + 25060, 29696, 30917, 32409) }, + { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, + 25225, 29485, 31158, 32342) }, + { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, + 29118, 30078, 32018) }, + { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, + 30389, 31536, 32528) }, + { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, + 25769, 29953, 30983, 32485) } }, + { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, + 26219, 30214, 31150, 32477) }, + { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, + 25380, 29653, 31143, 32277) }, + { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, + 29900, 30523, 32261) }, + { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, + 24615, 29489, 30883, 32482) }, + { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, + 31355, 31802, 32593) } } + }; + +static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( + 2 * MAX_ANGLE_DELTA + 1)] = { + { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, + { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, + { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, + { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, + { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, + { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, + { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, + { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } }; -const aom_prob default_coeff_base - 
[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { // TX_2X2 - { - { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51, - 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153, - 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46, - 135, 92, 166, 129, 190, 157, 217, 128, 128 }, - - { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110, - 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178, - 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104, - 164, 154, 195, 187, 216, 205, 230, 128, 128 }, - }, - { - { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51, - 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153, - 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46, - 135, 92, 166, 129, 190, 157, 217, 128, 128 }, +static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) } }; - { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110, - 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178, - 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104, - 164, 154, 195, 187, 216, 205, 230, 128, 128 }, - } }, -#endif - { // TX_4X4 - { - // PLANE_Y - { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51, - 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153, - 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46, - 135, 92, 166, 129, 190, 157, 217, 128, 128 }, - - { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110, - 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178, - 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104, - 164, 154, 195, 187, 216, 205, 230, 128, 128 }, - }, - { - // PLANE_UV - { 47, 128, 
100, 176, 140, 207, 150, 223, 11, 128, 35, - 133, 79, 165, 115, 186, 129, 210, 8, 128, 30, 114, - 80, 159, 116, 187, 146, 214, 2, 128, 9, 59, 28, - 86, 71, 131, 117, 165, 149, 188, 128, 128 }, +static const aom_cdf_prob + default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( + UV_INTRA_MODES)] = { + { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, + 28244, 30059, 30941, 31961) }, + { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, + 28359, 29505, 29800, 31796) }, + { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, + 30764, 31777, 32029) }, + { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, + 28577, 30612, 31355, 32493) }, + { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, + 31101, 31744, 32363) }, + { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, + 29711, 31161, 31441, 32550) }, + { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, + 30245, 31837, 32342, 32667) }, + { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, + 29267, 30643, 31961, 32461) }, + { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, + 28443, 30388, 30767, 32416) }, + { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, + 23174, 28861, 30379, 32175) }, + { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, + 23527, 27053, 31397, 32148) }, + { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, + 22482, 25896, 26541, 31819) }, + { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, + 15255, 15753, 16039, 16606) } }, + { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, + 15986, 20086, 20995, 22455, 24212) }, + { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, + 22099, 24228, 24693, 27032, 29472) }, + { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, + 23138, 24256, 24703, 26679) }, + { AOM_CDF14(6740, 7167, 
7662, 14152, 14536, 14785, 15034, 16741, 18371, + 21520, 22206, 23389, 24182) }, + { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, + 24911, 25380, 26027, 26376) }, + { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, + 24780, 25386, 26517, 27176) }, + { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, + 23188, 23763, 24455, 24940) }, + { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, + 22336, 23204, 23964, 24793) }, + { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, + 22494, 23139, 24764, 25989) }, + { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, + 15534, 20714, 21789, 23443, 24861) }, + { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, + 15902, 20102, 22696, 23774, 25838) }, + { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, + 15636, 19676, 20474, 23519, 25208) }, + { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, + 9875, 10521, 29048) } } + }; + +static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( + EXT_PARTITION_TYPES)] = { + { AOM_CDF4(19132, 25510, 30392) }, + { AOM_CDF4(13928, 19855, 28540) }, + { AOM_CDF4(12522, 23679, 28629) }, + { AOM_CDF4(9896, 18783, 25853) }, + { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, + { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, + { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, + { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) }, + { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, + { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, + { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, + { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, + { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) 
}, + { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, + { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, + { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) }, + { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, +}; - { 83, 128, 152, 205, 168, 227, 192, 238, 42, 128, 92, - 169, 138, 193, 165, 209, 128, 206, 36, 128, 86, 159, - 141, 198, 181, 213, 102, 223, 18, 128, 50, 132, 90, - 144, 141, 169, 180, 191, 128, 217, 128, 128 }, - } }, +static const aom_cdf_prob default_intra_ext_tx_cdf + [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { { - // TX_8X8 { - // PLANE_Y - { 82, 128, 143, 203, 177, 225, 186, 237, 7, 128, 37, - 109, 78, 151, 110, 182, 139, 213, 25, 128, 51, 115, - 86, 146, 111, 175, 125, 205, 3, 128, 12, 55, 32, - 78, 63, 111, 96, 148, 123, 185, 146, 206 }, - - { 136, 128, 182, 220, 201, 236, 205, 243, 46, 128, 101, - 164, 147, 194, 170, 218, 177, 234, 62, 128, 104, 146, - 143, 183, 165, 207, 183, 228, 30, 128, 60, 95, 95, - 128, 135, 163, 166, 196, 175, 219, 192, 231 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, }, { - // PLANE_UV - { 47, 128, 112, 189, 164, 202, 163, 218, 8, 128, 32, - 110, 68, 151, 102, 179, 134, 195, 5, 128, 22, 76, - 54, 103, 80, 146, 101, 182, 1, 128, 5, 39, 17, - 53, 46, 93, 79, 127, 112, 161, 64, 195 }, - - { 90, 128, 156, 210, 183, 225, 128, 236, 39, 128, 98, - 164, 146, 201, 209, 219, 171, 208, 32, 128, 68, 123, - 119, 169, 154, 184, 128, 213, 15, 128, 38, 111, 83, - 112, 120, 163, 180, 170, 154, 213, 128, 205 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, }, - }, - - { - // TX_16X16 { - // PLANE_Y - 
{ 96, 128, 169, 218, 208, 233, 187, 244, 10, 128, 34, - 101, 82, 153, 113, 184, 137, 212, 6, 128, 34, 104, - 81, 145, 109, 176, 147, 202, 1, 128, 3, 43, 15, - 53, 43, 89, 79, 129, 108, 168, 110, 194 }, - - { 156, 128, 206, 232, 218, 240, 128, 251, 39, 128, 108, - 161, 156, 202, 187, 216, 179, 234, 40, 128, 103, 152, - 144, 185, 159, 208, 205, 227, 14, 128, 39, 84, 76, - 110, 121, 151, 157, 187, 201, 206, 64, 216 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, }, { - // PLANE_UV - { 42, 128, 139, 211, 180, 230, 199, 238, 3, 128, 32, - 96, 69, 145, 102, 186, 117, 212, 4, 128, 25, 72, - 55, 111, 81, 159, 116, 198, 1, 128, 4, 22, 16, - 34, 35, 68, 63, 116, 89, 165, 102, 199 }, - - { 135, 128, 193, 227, 182, 239, 128, 246, 42, 128, 115, - 156, 146, 203, 188, 216, 128, 229, 32, 128, 82, 127, - 120, 178, 165, 203, 213, 229, 11, 128, 32, 73, 79, - 111, 129, 158, 162, 187, 156, 209, 85, 222 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, }, }, - { - // TX_32X32 { - // PLANE_Y - { 97, 128, 163, 232, 191, 246, 219, 252, 3, 128, 41, - 108, 91, 147, 104, 183, 118, 225, 6, 128, 45, 91, - 83, 125, 92, 160, 99, 215, 1, 128, 11, 36, 28, - 46, 43, 59, 57, 86, 73, 145, 91, 210 }, - - { 127, 128, 201, 239, 247, 248, 128, 254, 40, 128, 103, - 152, 158, 199, 186, 225, 181, 242, 38, 128, 92, 112, - 146, 189, 162, 217, 112, 239, 17, 128, 30, 47, 63, - 89, 113, 146, 147, 187, 168, 217, 150, 233 }, + { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, + { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, + { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, + { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, + { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, + { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, + { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, + { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, + { AOM_CDF7(1045, 
4312, 8647, 10159, 18644, 29335) }, + { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, + { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, + { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, + { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, }, { - // PLANE_UV - { 65, 128, 155, 223, 166, 235, 154, 244, 15, 128, 57, - 154, 110, 199, 159, 224, 149, 239, 9, 128, 57, 140, - 97, 185, 148, 218, 176, 236, 1, 128, 3, 43, 19, - 42, 64, 98, 117, 167, 154, 199, 128, 158 }, - - { 130, 128, 189, 231, 171, 247, 128, 246, 63, 128, 132, - 222, 186, 224, 199, 244, 128, 247, 55, 128, 113, 211, - 164, 230, 225, 243, 128, 239, 7, 128, 31, 102, 106, - 138, 147, 183, 171, 223, 171, 224, 128, 128 }, + { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, + { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, + { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, + { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, + { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, + { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, + { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, + { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, + { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, + { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, + { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, + { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, + { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 
14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, }, }, - }; - -const aom_prob default_nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] = { -#if CONFIG_EXT_TX -#if CONFIG_CHROMA_2X2 - { { 56, 137, 82, 136, 83, 187, 124, 65, - 215, 118, 155, 97, 160, 111, 71, 55, - - 142, 156, 91, 226, 107, 231, 146, 65, - 105, 91, 232, 97, 185, 121, 90, 74, - - 153, 195, 123, 154, 106, 196, 143, 67, - 232, 125, 121, 105, 159, 113, 88, 66 }, - { 50, 124, 89, 135, 116, 189, 150, 81, - 202, 126, 130, 107, 149, 110, 85, 67, - - 139, 174, 112, 200, 94, 206, 146, 71, - 163, 164, 212, 99, 177, 143, 125, 85, - - 151, 181, 126, 168, 135, 186, 143, 94, - 207, 129, 142, 135, 145, 112, 98, 81 } }, -#endif - { { 56, 137, 82, 136, 83, 187, 124, 65, - 215, 118, 155, 97, 160, 111, 71, 55, - - 142, 156, 91, 226, 107, 231, 146, 65, - 105, 91, 232, 97, 185, 121, 90, 74, - - 153, 195, 123, 154, 106, 196, 143, 67, - 232, 125, 121, 105, 159, 113, 88, 66 }, - { 50, 124, 89, 135, 116, 189, 150, 81, - 202, 126, 130, 107, 149, 110, 85, 67, - - 139, 174, 112, 200, 94, 206, 146, 71, - 163, 164, 212, 99, 177, 143, 125, 85, - - 151, 181, 126, 168, 135, 186, 143, 94, - 207, 129, 142, 135, 145, 112, 
98, 81 } }, - { { 57, 156, 91, 162, 99, 212, 149, 81, - 223, 128, 182, 121, 216, 163, 119, 94, - - 139, 183, 100, 206, 98, 242, 163, 79, - 200, 127, 234, 112, 230, 169, 115, 90, - - 156, 190, 130, 172, 117, 209, 163, 80, - 217, 145, 182, 135, 204, 163, 120, 88 }, - { 48, 133, 102, 143, 119, 190, 170, 109, - 197, 127, 176, 137, 214, 168, 130, 119, - - 139, 185, 129, 210, 84, 237, 177, 75, - 182, 165, 216, 121, 206, 177, 147, 102, - - 159, 192, 153, 182, 139, 203, 160, 125, - 193, 161, 176, 142, 173, 145, 131, 114 } }, - { { 33, 148, 81, 149, 84, 219, 152, 76, - 229, 127, 205, 120, 234, 170, 123, 88, - - 134, 197, 101, 213, 91, 244, 169, 85, - 220, 141, 234, 123, 242, 183, 130, 94, - - 141, 184, 121, 173, 98, 213, 156, 85, - 204, 156, 197, 119, 212, 174, 127, 92 }, - { 14, 75, 45, 98, 83, 197, 150, 90, - 235, 124, 242, 155, 246, 187, 143, 103, - - 78, 185, 111, 255, 116, 255, 224, 171, - 185, 157, 255, 85, 219, 122, 128, 128, - - 117, 187, 102, 181, 132, 233, 197, 93, - 207, 135, 191, 107, 222, 175, 130, 47 } }, - { - { 14, 79, 44, 86, 59, 178, 124, 63, - 244, 106, 233, 117, 252, 185, 132, 92, - - 85, 225, 47, 236, 103, 255, 190, 116, - 235, 114, 247, 123, 250, 174, 122, 110, - - 109, 197, 78, 177, 76, 242, 148, 68, - 236, 123, 231, 103, 247, 171, 122, 91 }, - { 11, 40, 27, 92, 78, 183, 171, 70, - 216, 74, 251, 146, 252, 213, 171, 148, - - 85, 225, 47, 236, 103, 255, 190, 116, - 235, 114, 247, 123, 250, 174, 122, 110, - - 109, 197, 78, 177, 76, 242, 148, 68, - 236, 123, 231, 103, 247, 171, 122, 91 }, - }, -#else // CONFIG_EXT_TX -#if CONFIG_CHROMA_2X2 - { - { - 34, 103, 61, 106, 62, 160, 112, 54, 173, 121, 157, 92, 157, 129, 94, - 65, - }, - - { - 52, 124, 84, 136, 107, 197, 161, 82, 183, 151, 153, 140, 152, 134, - 109, 81, - }, - }, -#endif - { - { - 34, 103, 61, 106, 62, 160, 112, 54, 173, 121, 157, 92, 157, 129, 94, - 65, - }, - - { - 52, 124, 84, 136, 107, 197, 161, 82, 183, 151, 153, 140, 152, 134, - 109, 81, - }, - }, - { - { - 34, 127, 74, 124, 74, 204, 153, 
76, 226, 162, 207, 126, 227, 192, 149, - 108, - }, - - { - 43, 136, 115, 158, 130, 212, 187, 112, 231, 180, 202, 164, 236, 204, - 168, 139, - }, - }, - { - { - 25, 117, 70, 120, 77, 215, 171, 102, 234, 156, 235, 155, 247, 220, - 176, 127, - }, - - { - 24, 88, 49, 100, 62, 202, 148, 62, 237, 178, 233, 168, 244, 198, 162, - 127, - }, - }, - { - { - 11, 54, 17, 69, 26, 128, 125, 56, 232, 130, 237, 121, 250, 168, 134, - 114, - }, - - { - 21, 52, 32, 95, 64, 171, 152, 70, 247, 159, 252, 177, 252, 221, 192, - 143, - }, - }, -#endif // CONFIG_EXT_TX -}; - -#if CONFIG_CTX1D -const aom_prob default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { { 220, 225, 220, 216, 233, 225, 189, 178, 222, 199, 164, 112, 207, - 171, 115, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 147, 125, 104, 36, 117, 107, 26, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 156, 124, 128, 128, 146, 68, 128, 128, 131, 17, 128, 128, 64, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - - { 146, 150, 142, 144, 178, 167, 131, 116, 150, 123, 107, 63, 119, - 89, 74, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 117, 127, 105, 69, 53, 56, 30, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 128, 86, 128, 128, 140, 72, 128, 128, 120, 44, 128, 128, 80, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, - { { 237, 242, 242, 219, 192, 246, 246, 243, 233, 184, 155, 234, 217, - 188, 152, 195, 167, 114, 89, 128, 128, 128, 128, 128, 128, - - 180, 173, 154, 133, 112, 147, 145, 142, 102, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 198, 173, 130, 200, 128, 208, 182, 160, 106, 171, 128, 144, 128, - 128, 128, 124, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - - { 140, 170, 162, 111, 94, 182, 195, 165, 153, 110, 81, 178, 169, - 158, 83, 133, 85, 85, 38, 128, 128, 128, 128, 128, 128, - - 112, 127, 107, 
87, 31, 57, 49, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 160, 143, 99, 126, 128, 164, 133, 126, 59, 71, 128, 138, 128, - 128, 128, 99, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, -#endif - { { 220, 225, 220, 216, 233, 225, 189, 178, 222, 199, 164, 112, 207, - 171, 115, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 147, 125, 104, 36, 117, 107, 26, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 156, 124, 128, 128, 146, 68, 128, 128, 131, 17, 128, 128, 64, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - - { 146, 150, 142, 144, 178, 167, 131, 116, 150, 123, 107, 63, 119, - 89, 74, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 117, 127, 105, 69, 53, 56, 30, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 128, 86, 128, 128, 140, 72, 128, 128, 120, 44, 128, 128, 80, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, - { { 237, 242, 242, 219, 192, 246, 246, 243, 233, 184, 155, 234, 217, - 188, 152, 195, 167, 114, 89, 128, 128, 128, 128, 128, 128, - - 180, 173, 154, 133, 112, 147, 145, 142, 102, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 198, 173, 130, 200, 128, 208, 182, 160, 106, 171, 128, 144, 128, - 128, 128, 124, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - - { 140, 170, 162, 111, 94, 182, 195, 165, 153, 110, 81, 178, 169, - 158, 83, 133, 85, 85, 38, 128, 128, 128, 128, 128, 128, - - 112, 127, 107, 87, 31, 57, 49, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 160, 143, 99, 126, 128, 164, 133, 126, 59, 71, 128, 138, 128, - 128, 128, 99, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, - { { 229, 241, 243, 245, 247, 247, 251, 248, 235, 210, 247, 235, 208, - 166, 245, 247, 244, 182, 236, 229, 180, 136, 128, 128, 128, - - 191, 197, 96, 70, 199, 128, 128, 191, 174, 117, 128, 
128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 211, 183, 215, 188, 138, 209, 136, 128, 170, 128, 191, 128, 161, - 128, 182, 128, 128, 128, 164, 128, 128, 128, 128, 128, 128 }, - - { 106, 153, 182, 191, 186, 202, 211, 203, 166, 147, 205, 205, 195, - 128, 206, 212, 182, 109, 192, 154, 139, 79, 128, 128, 128, - - 112, 133, 128, 255, 128, 128, 128, 130, 154, 98, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 144, 185, 169, 199, 85, 183, 128, 128, 64, 128, 146, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, - { { 169, 203, 224, 222, 220, 228, 229, 223, 234, 247, 242, 230, 222, - 238, 246, 234, 196, 245, 249, 245, 192, 240, 235, 199, 161, - - 176, 148, 158, 77, 178, 128, 128, 158, 128, 128, 196, 208, 155, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 232, 187, 191, 221, 116, 217, 154, 128, 203, 128, 128, 192, 128, - 201, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - - { 133, 182, 215, 204, 176, 220, 182, 168, 187, 197, 181, 145, 75, - 164, 136, 51, 57, 156, 128, 128, 128, 85, 128, 128, 128, - - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, -}; -#else // CONFIG_CTX1D -const aom_prob default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { - { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228, - 210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 65, 170, - 134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, -#endif - { - { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228, - 210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 
65, 170, - 134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - { 225, 234, 244, 236, 205, 242, 246, 247, 246, 234, 191, 242, 237, - 215, 142, 224, 206, 142, 73, 128, 128, 128, 128, 128, 128 }, - { 154, 171, 187, 175, 62, 199, 202, 206, 215, 200, 111, 197, 199, - 174, 100, 135, 105, 104, 45, 128, 128, 128, 128, 128, 128 }, - }, - { - { 180, 213, 216, 229, 233, 232, 240, 235, 220, 178, 239, 238, 225, - 187, 229, 214, 226, 200, 183, 141, 158, 179, 128, 128, 128 }, - { 190, 225, 234, 248, 249, 248, 253, 251, 232, 110, 254, 252, 236, - 57, 253, 248, 232, 85, 244, 189, 112, 64, 128, 128, 128 }, - }, - { - { 248, 224, 246, 244, 239, 245, 251, 246, 251, 255, 255, 255, 249, - 255, 255, 255, 229, 255, 255, 255, 228, 255, 255, 247, 137 }, - { 204, 207, 233, 215, 193, 228, 239, 221, 227, 250, 236, 207, 135, - 236, 186, 182, 57, 209, 140, 128, 85, 184, 110, 128, 128 }, - }, -}; -#endif // CONFIG_CTX1D - -const aom_prob default_coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { { 96, 128, 86, 122, 128, 84, 125, 128, 88, 99, 126, 128, - 135, 159, 99, 130, 134, 100, 128, 144, 70, 97, 128, 139, - 157, 168, 127, 148, 162, 121, 149, 157, 118, 127, 143, 157, - 178, 186, 168, 171, 183, 165, 169, 180, 180, 169, 166, 177 }, - { 81, 128, 72, 95, 128, 64, 98, 128, 42, 66, 101, 128, - 129, 163, 97, 122, 130, 91, 119, 141, 70, 94, 118, 166, - 157, 168, 117, 143, 151, 111, 144, 154, 76, 113, 128, 158, - 177, 185, 165, 167, 179, 155, 166, 179, 110, 137, 115, 165 } }, -#endif - { { 96, 128, 86, 122, 128, 84, 125, 128, 88, 99, 126, 128, - 135, 159, 99, 130, 134, 100, 128, 144, 70, 97, 128, 139, - 157, 168, 127, 148, 162, 121, 149, 157, 118, 127, 143, 157, - 178, 186, 168, 171, 183, 165, 169, 180, 180, 169, 166, 177 }, - { 81, 128, 72, 95, 128, 64, 98, 128, 42, 66, 101, 128, - 129, 163, 97, 122, 130, 91, 119, 141, 70, 94, 118, 166, - 157, 168, 117, 143, 151, 111, 144, 154, 76, 113, 128, 158, - 177, 185, 165, 167, 179, 155, 166, 179, 110, 137, 115, 
165 } }, - { { 102, 128, 79, 125, 128, 74, 121, 128, 61, 98, 128, 128, - 141, 164, 96, 132, 150, 90, 128, 153, 62, 100, 128, 153, - 162, 172, 120, 146, 162, 113, 142, 154, 96, 113, 138, 155, - 181, 188, 151, 170, 179, 147, 167, 181, 158, 157, 163, 176 }, - { 103, 128, 80, 116, 128, 66, 94, 128, 35, 65, 109, 128, - 134, 163, 104, 137, 154, 92, 128, 104, 58, 94, 129, 132, - 156, 173, 137, 149, 165, 104, 143, 143, 112, 101, 133, 159, - 176, 186, 134, 172, 175, 155, 169, 177, 255, 107, 137, 168 } }, - { { 125, 128, 85, 157, 128, 82, 155, 128, 42, 83, 116, 128, - 155, 174, 101, 144, 155, 93, 140, 155, 57, 92, 124, 149, - 173, 178, 114, 148, 161, 111, 145, 161, 77, 101, 131, 153, - 190, 191, 140, 169, 183, 140, 169, 179, 108, 122, 150, 171 }, - { 136, 128, 108, 163, 128, 96, 140, 128, 48, 90, 85, 128, - 144, 164, 113, 158, 179, 107, 159, 128, 43, 75, 133, 160, - 157, 184, 144, 160, 189, 154, 152, 184, 128, 124, 137, 140, - 188, 196, 148, 170, 178, 128, 177, 159, 128, 179, 135, 135 } }, - { { 133, 128, 110, 153, 128, 101, 157, 128, 49, 91, 134, 128, - 151, 168, 129, 158, 162, 112, 154, 168, 63, 99, 130, 158, - 171, 178, 128, 160, 173, 111, 155, 171, 86, 108, 143, 159, - 194, 196, 162, 177, 185, 123, 172, 181, 101, 132, 156, 178 }, - { 133, 128, 129, 144, 128, 116, 135, 128, 43, 101, 100, 128, - 140, 163, 158, 173, 205, 128, 165, 171, 128, 128, 210, 163, - 172, 184, 192, 176, 201, 183, 177, 190, 128, 192, 199, 144, - 192, 192, 1, 196, 192, 255, 171, 178, 255, 128, 171, 179 } } -}; -#if BR_NODE -const aom_prob - default_coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { { { 62, 128, 54, 116, 128, 51, 97, 128, 59, 68, 107, 128, - 119, 158, 68, 115, 131, 65, 112, 138, 34, 71, 118, 137, - 171, 184, 110, 152, 178, 105, 146, 172, 89, 111, 145, 173, - 214, 226, 201, 198, 214, 196, 193, 210, 239, 196, 186, 202 }, - { 41, 128, 58, 52, 128, 51, 61, 128, 92, 54, 48, 128, - 67, 113, 36, 55, 75, 30, 56, 72, 12, 25, 50, 79, - 94, 131, 37, 75, 
108, 42, 78, 103, 5, 31, 67, 103, - 172, 192, 131, 135, 167, 129, 136, 165, 149, 144, 120, 149 }, - { 35, 128, 74, 50, 128, 63, 59, 128, 87, 74, 38, 128, - 32, 53, 23, 34, 50, 18, 30, 41, 15, 13, 18, 18, - 52, 74, 18, 29, 36, 18, 31, 47, 51, 9, 15, 27, - 96, 134, 85, 70, 93, 96, 79, 100, 108, 100, 55, 65 } }, - { { 52, 128, 35, 79, 128, 29, 66, 128, 12, 30, 57, 128, - 113, 156, 64, 107, 172, 54, 103, 145, 23, 57, 96, 110, - 165, 184, 95, 138, 166, 95, 141, 184, 55, 80, 133, 165, - 212, 222, 134, 175, 206, 158, 177, 197, 102, 61, 154, 190 }, - { 36, 128, 18, 26, 128, 15, 29, 128, 4, 6, 30, 128, - 63, 113, 25, 44, 66, 22, 40, 67, 9, 14, 34, 55, - 90, 125, 26, 66, 82, 29, 73, 88, 1, 26, 34, 67, - 158, 179, 70, 121, 134, 69, 111, 129, 1, 85, 54, 105 }, - { 24, 128, 8, 31, 128, 15, 16, 128, 1, 1, 1, 128, - 32, 39, 16, 18, 43, 5, 17, 13, 1, 1, 22, 1, - 37, 65, 26, 20, 28, 16, 15, 24, 128, 1, 1, 1, - 83, 107, 57, 56, 74, 34, 29, 73, 128, 1, 37, 47 } } }, -#endif - { { { 62, 128, 54, 116, 128, 51, 97, 128, 59, 68, 107, 128, - 119, 158, 68, 115, 131, 65, 112, 138, 34, 71, 118, 137, - 171, 184, 110, 152, 178, 105, 146, 172, 89, 111, 145, 173, - 214, 226, 201, 198, 214, 196, 193, 210, 239, 196, 186, 202 }, - { 41, 128, 58, 52, 128, 51, 61, 128, 92, 54, 48, 128, - 67, 113, 36, 55, 75, 30, 56, 72, 12, 25, 50, 79, - 94, 131, 37, 75, 108, 42, 78, 103, 5, 31, 67, 103, - 172, 192, 131, 135, 167, 129, 136, 165, 149, 144, 120, 149 }, - { 35, 128, 74, 50, 128, 63, 59, 128, 87, 74, 38, 128, - 32, 53, 23, 34, 50, 18, 30, 41, 15, 13, 18, 18, - 52, 74, 18, 29, 36, 18, 31, 47, 51, 9, 15, 27, - 96, 134, 85, 70, 93, 96, 79, 100, 108, 100, 55, 65 } }, - { { 52, 128, 35, 79, 128, 29, 66, 128, 12, 30, 57, 128, - 113, 156, 64, 107, 172, 54, 103, 145, 23, 57, 96, 110, - 165, 184, 95, 138, 166, 95, 141, 184, 55, 80, 133, 165, - 212, 222, 134, 175, 206, 158, 177, 197, 102, 61, 154, 190 }, - { 36, 128, 18, 26, 128, 15, 29, 128, 4, 6, 30, 128, - 63, 113, 25, 44, 66, 22, 40, 67, 9, 14, 34, 55, - 90, 
125, 26, 66, 82, 29, 73, 88, 1, 26, 34, 67, - 158, 179, 70, 121, 134, 69, 111, 129, 1, 85, 54, 105 }, - { 24, 128, 8, 31, 128, 15, 16, 128, 1, 1, 1, 128, - 32, 39, 16, 18, 43, 5, 17, 13, 1, 1, 22, 1, - 37, 65, 26, 20, 28, 16, 15, 24, 128, 1, 1, 1, - 83, 107, 57, 56, 74, 34, 29, 73, 128, 1, 37, 47 } } }, - { { { 72, 128, 45, 113, 128, 38, 100, 128, 26, 63, 112, 128, - 134, 177, 65, 121, 148, 57, 111, 143, 27, 68, 116, 152, - 181, 198, 98, 148, 173, 84, 136, 168, 53, 89, 134, 170, - 218, 230, 173, 194, 216, 160, 188, 213, 199, 177, 183, 204 }, - { 54, 128, 34, 55, 128, 32, 53, 128, 66, 45, 54, 128, - 81, 128, 33, 59, 102, 26, 55, 80, 7, 23, 49, 91, - 116, 145, 36, 79, 107, 35, 73, 102, 12, 28, 57, 95, - 170, 201, 102, 133, 173, 105, 127, 173, 166, 132, 114, 149 }, - { 40, 128, 25, 30, 128, 21, 31, 128, 24, 17, 24, 128, - 51, 67, 19, 28, 40, 17, 25, 42, 15, 13, 19, 19, - 61, 77, 19, 30, 48, 13, 33, 50, 11, 15, 21, 30, - 103, 147, 37, 69, 111, 37, 66, 105, 18, 18, 36, 76 } }, - { { 74, 128, 42, 99, 128, 32, 57, 128, 9, 28, 76, 128, - 115, 187, 70, 118, 120, 52, 109, 128, 19, 60, 93, 100, - 178, 197, 119, 147, 179, 92, 137, 178, 37, 87, 110, 158, - 216, 227, 169, 186, 201, 128, 178, 204, 1, 96, 155, 217 }, - { 59, 128, 26, 34, 128, 11, 20, 128, 7, 8, 24, 128, - 73, 125, 38, 74, 96, 23, 61, 79, 15, 9, 23, 110, - 96, 151, 49, 79, 164, 22, 70, 65, 1, 1, 9, 69, - 156, 196, 73, 105, 181, 17, 126, 155, 128, 1, 90, 111 }, - { 42, 128, 10, 11, 128, 13, 1, 128, 1, 1, 1, 128, - 55, 63, 13, 17, 85, 1, 16, 64, 1, 1, 1, 1, - 62, 58, 32, 21, 53, 1, 37, 91, 128, 128, 1, 1, - 81, 133, 51, 48, 79, 1, 25, 81, 128, 128, 1, 54 } } }, - { { { 103, 128, 52, 163, 128, 46, 155, 128, 12, 45, 97, 128, - 162, 196, 69, 140, 170, 60, 130, 158, 21, 58, 109, 150, - 205, 214, 93, 149, 178, 79, 143, 179, 38, 71, 120, 159, - 231, 240, 150, 192, 218, 140, 188, 220, 84, 112, 159, 196 }, - { 93, 128, 42, 143, 128, 41, 132, 128, 6, 15, 40, 128, - 113, 172, 39, 99, 113, 33, 91, 94, 5, 15, 42, 83, - 148, 172, 
37, 91, 130, 28, 81, 121, 9, 20, 47, 87, - 201, 223, 75, 139, 183, 77, 132, 176, 23, 41, 82, 147 }, - { 92, 128, 45, 123, 128, 28, 88, 128, 1, 8, 20, 128, - 85, 94, 39, 95, 83, 33, 81, 61, 4, 5, 17, 25, - 84, 109, 17, 59, 76, 11, 46, 62, 1, 4, 13, 35, - 139, 184, 25, 86, 129, 25, 71, 123, 26, 13, 31, 84 } }, - { { 123, 128, 82, 169, 128, 62, 139, 128, 1, 28, 77, 128, - 139, 167, 92, 170, 146, 76, 149, 255, 19, 68, 160, 73, - 190, 209, 171, 165, 218, 57, 152, 209, 128, 61, 122, 164, - 237, 240, 146, 210, 227, 128, 224, 220, 128, 128, 196, 199 }, - { 130, 128, 52, 141, 128, 32, 101, 128, 128, 1, 85, 128, - 94, 155, 71, 121, 255, 30, 116, 85, 1, 8, 58, 255, - 105, 169, 110, 101, 132, 1, 77, 142, 128, 1, 54, 96, - 166, 214, 224, 154, 198, 255, 153, 230, 128, 85, 100, 146 }, - { 103, 128, 26, 83, 128, 20, 47, 128, 128, 128, 1, 128, - 91, 90, 19, 76, 128, 1, 42, 1, 128, 255, 64, 128, - 74, 77, 1, 72, 68, 128, 13, 77, 128, 128, 64, 1, - 71, 147, 37, 99, 171, 1, 104, 151, 128, 1, 1, 96 } } }, - { { { 113, 128, 79, 165, 128, 69, 149, 128, 14, 55, 116, 128, - 163, 202, 104, 169, 205, 82, 159, 180, 22, 64, 121, 165, - 207, 216, 113, 177, 215, 95, 166, 195, 35, 77, 132, 179, - 241, 244, 173, 207, 233, 128, 202, 227, 92, 121, 169, 209 }, - { 114, 128, 67, 136, 128, 54, 132, 128, 6, 26, 62, 128, - 85, 129, 85, 146, 173, 64, 129, 140, 7, 19, 65, 92, - 139, 169, 42, 147, 186, 40, 129, 170, 18, 18, 65, 117, - 213, 230, 74, 172, 213, 69, 165, 196, 1, 40, 103, 170 }, - { 101, 128, 61, 134, 128, 52, 97, 128, 1, 14, 26, 128, - 79, 72, 71, 135, 152, 56, 114, 117, 1, 10, 24, 58, - 64, 66, 60, 133, 148, 16, 126, 123, 1, 32, 26, 56, - 143, 197, 51, 141, 176, 59, 132, 162, 128, 17, 47, 106 } }, - { { 115, 128, 112, 135, 128, 89, 130, 128, 15, 49, 89, 128, - 143, 238, 154, 203, 255, 138, 172, 255, 1, 98, 196, 255, - 185, 203, 255, 211, 255, 192, 217, 235, 128, 128, 171, 255, - 233, 233, 255, 247, 255, 1, 239, 245, 1, 128, 255, 255 }, - { 75, 128, 76, 118, 128, 35, 74, 128, 1, 13, 23, 128, - 
63, 138, 114, 164, 140, 91, 128, 128, 128, 1, 138, 64, - 96, 128, 255, 175, 236, 85, 166, 209, 128, 1, 128, 146, - 196, 217, 1, 204, 206, 128, 212, 221, 128, 128, 128, 219 }, - { 49, 128, 36, 62, 128, 37, 56, 128, 128, 1, 1, 128, - 45, 37, 68, 102, 128, 90, 56, 1, 128, 128, 37, 1, - 26, 27, 128, 126, 128, 255, 63, 142, 128, 128, 1, 1, - 125, 159, 128, 173, 212, 128, 85, 189, 128, 128, 255, 171 } } } - }; -#endif // BR_NODE -#if CONFIG_CTX1D -static const aom_prob default_eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES] = { -#if CONFIG_CHROMA_2X2 - { { 128, 176, 157 }, { 128, 222, 198 } }, -#endif - { { 128, 176, 157 }, { 128, 222, 198 } }, - { { 128, 35, 56 }, { 128, 203, 225 } }, - { { 128, 55, 136 }, { 128, 230, 253 } }, - { { 128, 101, 188 }, { 128, 128, 128 } } -}; -static const aom_prob default_empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES] - [EMPTY_LINE_CONTEXTS] = { -#if CONFIG_CHROMA_2X2 - { { { 128, 128, 128, 128, 128 }, - { 142, 153, 211, 205, 128 }, - { 162, 142, 203, 197, 128 } }, - { { 128, 128, 128, 128, 128 }, - { 133, 116, 178, 123, 128 }, - { 139, 109, 159, 115, 128 } } }, -#endif - { { { 128, 128, 128, 128, 128 }, - { 142, 153, 211, 205, 128 }, - { 162, 142, 203, 197, 128 } }, - { { 128, 128, 128, 128, 128 }, - { 133, 116, 178, 123, 128 }, - { 139, 109, 159, 115, 128 } } }, - { { { 128, 128, 128, 128, 128 }, - { 185, 130, 183, 204, 227 }, - { 171, 81, 177, 200, 221 } }, - { { 128, 128, 128, 128, 128 }, - { 180, 127, 175, 189, 213 }, - { 120, 74, 129, 134, 156 } } }, - { { { 128, 128, 128, 128, 128 }, - { 202, 82, 183, 214, 248 }, - { 144, 41, 163, 185, 203 } }, - { { 128, 128, 128, 128, 128 }, - { 151, 93, 171, 224, 160 }, - { 128, 51, 171, 128, 1 } } }, - { { { 128, 128, 128, 128, 128 }, - { 154, 48, 174, 210, 233 }, - { 123, 16, 148, 189, 197 } }, - { { 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128 } } } - }; -static const aom_prob - default_hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS] = { -#if 
CONFIG_CHROMA_2X2 - { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 151, 173, 114, 128, 128, 128, 128, 128, 128, 162, 198, 128, - 128, 128, 128, 128, 182, 198, 109, 128, 128, 128, 128, 128 }, - { 152, 173, 119, 128, 128, 128, 128, 128, 128, 164, 193, 128, - 128, 128, 128, 128, 198, 209, 121, 128, 128, 128, 128, 128 } }, - { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 123, 143, 70, 128, 128, 128, 128, 128, 128, 127, 154, 128, - 128, 128, 128, 128, 176, 148, 36, 128, 128, 128, 128, 128 }, - { 132, 152, 73, 128, 128, 128, 128, 128, 128, 127, 159, 128, - 128, 128, 128, 128, 186, 181, 48, 128, 128, 128, 128, 128 } } }, -#endif - { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 151, 173, 114, 128, 128, 128, 128, 128, 128, 162, 198, 128, - 128, 128, 128, 128, 182, 198, 109, 128, 128, 128, 128, 128 }, - { 152, 173, 119, 128, 128, 128, 128, 128, 128, 164, 193, 128, - 128, 128, 128, 128, 198, 209, 121, 128, 128, 128, 128, 128 } }, - { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 123, 143, 70, 128, 128, 128, 128, 128, 128, 127, 154, 128, - 128, 128, 128, 128, 176, 148, 36, 128, 128, 128, 128, 128 }, - { 132, 152, 73, 128, 128, 128, 128, 128, 128, 127, 159, 128, - 128, 128, 128, 128, 186, 181, 48, 128, 128, 128, 128, 128 } } }, - { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 109, 105, 78, 44, 128, 128, 128, 128, 128, 146, 185, 221, - 128, 128, 128, 128, 199, 188, 134, 69, 128, 128, 128, 128 }, - { 124, 127, 115, 82, 128, 128, 128, 128, 128, 162, 198, 224, - 128, 128, 128, 128, 206, 214, 177, 135, 128, 128, 128, 128 } }, - { { 128, 128, 128, 128, 128, 128, 
128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 95, 102, 65, 14, 128, 128, 128, 128, 128, 132, 164, 199, - 128, 128, 128, 128, 162, 163, 66, 27, 128, 128, 128, 128 }, - { 83, 141, 97, 38, 128, 128, 128, 128, 128, 154, 132, 184, - 128, 128, 128, 128, 194, 218, 112, 63, 128, 128, 128, 128 } } }, - { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 117, 107, 86, 61, 51, 104, 128, 128, 128, 160, 198, 238, - 252, 251, 128, 128, 221, 223, 209, 186, 99, 81, 128, 128 }, - { 118, 122, 121, 100, 91, 97, 128, 128, 128, 168, 190, 214, - 233, 235, 128, 128, 197, 216, 177, 165, 147, 126, 128, 128 } }, - { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 109, 102, 63, 51, 255, 85, 128, 128, 128, 163, 131, 175, - 128, 128, 128, 128, 183, 102, 40, 1, 128, 128, 128, 128 }, - { 255, 255, 1, 1, 128, 1, 128, 128, 128, 1, 128, 128, - 128, 128, 128, 128, 255, 1, 128, 128, 128, 128, 128, 128 } } }, - { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 114, 108, 83, 61, 53, 28, 77, 177, 128, 161, 187, 218, - 240, 237, 228, 234, 200, 207, 167, 136, 98, 78, 183, 128 }, - { 117, 138, 116, 77, 75, 85, 26, 1, 128, 197, 162, 200, - 184, 212, 225, 236, 189, 225, 168, 124, 144, 171, 128, 128 } }, - { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } } - }; -#endif // CONFIG_CTX1D -#endif // CONFIG_LV_MAP - -#if CONFIG_EXT_PARTITION_TYPES -static const aom_prob - 
default_partition_probs[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1] = { - // 8x8 -> 4x4 - { 199, 122, 141, 128, 128, 128, 255, 128, 255 }, // a/l both not split - { 147, 63, 159, 128, 128, 128, 255, 128, 255 }, // a split, l not split - { 148, 133, 118, 128, 128, 128, 255, 128, 255 }, // l split, a not split - { 121, 104, 114, 128, 128, 128, 255, 128, 255 }, // a/l both split - // 16x16 -> 8x8 - { 174, 73, 87, 128, 128, 128, 255, 128, 255 }, // a/l both not split - { 92, 41, 83, 128, 128, 128, 255, 128, 255 }, // a split, l not split - { 82, 99, 50, 128, 128, 128, 255, 128, 255 }, // l split, a not split - { 53, 39, 39, 128, 128, 128, 255, 128, 255 }, // a/l both split - // 32x32 -> 16x16 - { 177, 58, 59, 128, 128, 85, 128, 85, 128 }, // a/l both not split - { 68, 26, 63, 128, 128, 85, 128, 85, 128 }, // a split, l not split - { 52, 79, 25, 128, 128, 85, 128, 85, 128 }, // l split, a not split - { 17, 14, 12, 128, 128, 85, 128, 85, 128 }, // a/l both split - // 64x64 -> 32x32 - { 222, 34, 30, 128, 128, 85, 128, 85, 128 }, // a/l both not split - { 72, 16, 44, 128, 128, 85, 128, 85, 128 }, // a split, l not split - { 58, 32, 12, 128, 128, 85, 128, 85, 128 }, // l split, a not split - { 10, 7, 6, 128, 128, 85, 128, 85, 128 }, // a/l both split -#if CONFIG_EXT_PARTITION - // 128x128 -> 64x64 - { 222, 34, 30, 128, 128, 128, 255, 128, 255 }, // a/l both not split - { 72, 16, 44, 128, 128, 128, 255, 128, 255 }, // a split, l not split - { 58, 32, 12, 128, 128, 128, 255, 128, 255 }, // l split, a not split - { 10, 7, 6, 128, 128, 128, 255, 128, 255 }, // a/l both split -#endif // CONFIG_EXT_PARTITION -#if CONFIG_UNPOISON_PARTITION_CTX - { 0, 0, 141, 0, 0, 0, 0, 0, 0 }, // 8x8 -> 4x4 - { 0, 0, 87, 0, 0, 0, 0, 0, 0 }, // 16x16 -> 8x8 - { 0, 0, 59, 0, 0, 0, 0, 0, 0 }, // 32x32 -> 16x16 - { 0, 0, 30, 0, 0, 0, 0, 0, 0 }, // 64x64 -> 32x32 -#if CONFIG_EXT_PARTITION - { 0, 0, 30, 0, 0, 0, 0, 0, 0 }, // 128x128 -> 64x64 -#endif // CONFIG_EXT_PARTITION - { 0, 122, 0, 0, 0, 0, 0, 
0, 0 }, // 8x8 -> 4x4 - { 0, 73, 0, 0, 0, 0, 0, 0, 0 }, // 16x16 -> 8x8 - { 0, 58, 0, 0, 0, 0, 0, 0, 0 }, // 32x32 -> 16x16 - { 0, 34, 0, 0, 0, 0, 0, 0, 0 }, // 64x64 -> 32x32 -#if CONFIG_EXT_PARTITION - { 0, 34, 0, 0, 0, 0, 0, 0, 0 }, // 128x128 -> 64x64 -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_UNPOISON_PARTITION_CTX - }; -#else -static const aom_prob - default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { - // 8x8 -> 4x4 - { 199, 122, 141 }, // a/l both not split - { 147, 63, 159 }, // a split, l not split - { 148, 133, 118 }, // l split, a not split - { 121, 104, 114 }, // a/l both split - // 16x16 -> 8x8 - { 174, 73, 87 }, // a/l both not split - { 92, 41, 83 }, // a split, l not split - { 82, 99, 50 }, // l split, a not split - { 53, 39, 39 }, // a/l both split - // 32x32 -> 16x16 - { 177, 58, 59 }, // a/l both not split - { 68, 26, 63 }, // a split, l not split - { 52, 79, 25 }, // l split, a not split - { 17, 14, 12 }, // a/l both split - // 64x64 -> 32x32 - { 222, 34, 30 }, // a/l both not split - { 72, 16, 44 }, // a split, l not split - { 58, 32, 12 }, // l split, a not split - { 10, 7, 6 }, // a/l both split -#if CONFIG_EXT_PARTITION - // 128x128 -> 64x64 - { 222, 34, 30 }, // a/l both not split - { 72, 16, 44 }, // a split, l not split - { 58, 32, 12 }, // l split, a not split - { 10, 7, 6 }, // a/l both split -#endif // CONFIG_EXT_PARTITION -#if CONFIG_UNPOISON_PARTITION_CTX - { 0, 0, 141 }, // 8x8 -> 4x4 - { 0, 0, 87 }, // 16x16 -> 8x8 - { 0, 0, 59 }, // 32x32 -> 16x16 - { 0, 0, 30 }, // 64x64 -> 32x32 -#if CONFIG_EXT_PARTITION - { 0, 0, 30 }, // 128x128 -> 64x64 -#endif // CONFIG_EXT_PARTITION - { 0, 122, 0 }, // 8x8 -> 4x4 - { 0, 73, 0 }, // 16x16 -> 8x8 - { 0, 58, 0 }, // 32x32 -> 16x16 - { 0, 34, 0 }, // 64x64 -> 32x32 -#if CONFIG_EXT_PARTITION - { 0, 34, 0 }, // 128x128 -> 64x64 -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_UNPOISON_PARTITION_CTX - }; -#endif // CONFIG_EXT_PARTITION_TYPES - -static const aom_prob 
default_newmv_prob[NEWMV_MODE_CONTEXTS] = { - 155, 116, 94, 32, 96, 56, 30, -}; - -static const aom_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS] = { - 45, 13, -}; - -static const aom_prob default_refmv_prob[REFMV_MODE_CONTEXTS] = { - 178, 212, 135, 244, 203, 122, 128, 128, 128, -}; - -static const aom_prob default_drl_prob[DRL_MODE_CONTEXTS] = { - 119, 128, 189, 134, 128, -}; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_ICDF(128 * 155), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 116), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 94), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 32), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 96), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 56), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 30), AOM_ICDF(32768), 0 } }; -static const aom_cdf_prob default_zeromv_cdf[ZEROMV_MODE_CONTEXTS][CDF_SIZE( - 2)] = { { AOM_ICDF(128 * 45), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 13), AOM_ICDF(32768), 0 } }; -static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_ICDF(128 * 178), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 212), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 135), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 244), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 203), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 122), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 } }; -static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { - { AOM_ICDF(128 * 119), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 189), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 134), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 } -}; -#endif - -static const aom_prob default_inter_compound_mode_probs - [INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1] = { - { 154, 167, 233, 165, 143, 170, 167 }, // 0 = both 
zero mv - { 75, 168, 237, 155, 135, 176, 172 }, // 1 = 1 zero + 1 predicted - { 7, 173, 227, 128, 153, 188, 189 }, // 2 = two predicted mvs - { 8, 120, 214, 113, 154, 178, 174 }, // 3 = 1 pred/zero, 1 new - { 4, 85, 194, 94, 155, 173, 167 }, // 4 = two new mvs - { 23, 89, 180, 73, 157, 151, 155 }, // 5 = one intra neighbour - { 27, 49, 152, 91, 134, 153, 142 }, // 6 = two intra neighbours - }; - -static const aom_cdf_prob - default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( - INTER_COMPOUND_MODES)] = { - { AOM_ICDF(19712), AOM_ICDF(28229), AOM_ICDF(30892), AOM_ICDF(31437), - AOM_ICDF(31712), AOM_ICDF(32135), AOM_ICDF(32360), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9600), AOM_ICDF(24804), AOM_ICDF(29268), AOM_ICDF(30323), - AOM_ICDF(30802), AOM_ICDF(31726), AOM_ICDF(32177), AOM_ICDF(32768), 0 }, - { AOM_ICDF(896), AOM_ICDF(22434), AOM_ICDF(27015), AOM_ICDF(29026), - AOM_ICDF(29753), AOM_ICDF(31114), AOM_ICDF(31597), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(15904), AOM_ICDF(22127), AOM_ICDF(25421), - AOM_ICDF(26864), AOM_ICDF(28996), AOM_ICDF(30001), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(11222), AOM_ICDF(17217), AOM_ICDF(21445), - AOM_ICDF(23473), AOM_ICDF(26133), AOM_ICDF(27550), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2944), AOM_ICDF(13313), AOM_ICDF(17214), AOM_ICDF(20751), - AOM_ICDF(23211), AOM_ICDF(25500), AOM_ICDF(26992), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3456), AOM_ICDF(9067), AOM_ICDF(14069), AOM_ICDF(16907), - AOM_ICDF(18817), AOM_ICDF(21214), AOM_ICDF(23139), AOM_ICDF(32768), 0 } - }; - -#if CONFIG_COMPOUND_SINGLEREF -// TODO(zoeliu): Default values to be further adjusted based on the collected -// stats. 
-/* -static const aom_prob default_inter_singleref_comp_mode_probs - [INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1] = { - { 2, 173, 68, 180 }, // 0 = both zero mv - { 7, 145, 160, 180 }, // 1 = 1 zero + 1 predicted - { 7, 166, 126, 180 }, // 2 = two predicted mvs - { 7, 94, 132, 180 }, // 3 = 1 pred/zero, 1 new - { 8, 64, 64, 180 }, // 4 = two new mvs - { 17, 81, 52, 180 }, // 5 = one intra neighbour - { 25, 29, 50, 180 }, // 6 = two intra neighbours - };*/ -static const aom_prob default_inter_singleref_comp_mode_probs - [INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1] = { - { 2, 173, 68 }, // 0 = both zero mv - { 7, 145, 160 }, // 1 = 1 zero + 1 predicted - { 7, 166, 126 }, // 2 = two predicted mvs - { 7, 94, 132 }, // 3 = 1 pred/zero, 1 new - { 8, 64, 64 }, // 4 = two new mvs - { 17, 81, 52 }, // 5 = one intra neighbour - { 25, 29, 50 }, // 6 = two intra neighbours - }; - -static const aom_cdf_prob - default_inter_singleref_comp_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( - INTER_SINGLEREF_COMP_MODES)] = { - { AOM_ICDF(21971), AOM_ICDF(24771), AOM_ICDF(25027), AOM_ICDF(32768), 0 }, - { AOM_ICDF(18053), AOM_ICDF(26690), AOM_ICDF(27586), AOM_ICDF(32768), 0 }, - { AOM_ICDF(20667), AOM_ICDF(26182), AOM_ICDF(27078), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11703), AOM_ICDF(22103), AOM_ICDF(22999), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7936), AOM_ICDF(13888), AOM_ICDF(14912), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9679), AOM_ICDF(13927), AOM_ICDF(16103), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3349), AOM_ICDF(8470), AOM_ICDF(11670), AOM_ICDF(32768), 0 } - }; -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -static const aom_prob - default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 128, 128 }, { 128, 128 }, { 128, 128 }, -#endif - { 128, 128 }, { 255, 128 }, { 255, 128 }, { 66, 51 }, { 72, 35 }, - { 79, 29 }, { 71, 18 }, { 81, 29 }, { 81, 26 }, { 69, 19 }, - { 104, 1 }, { 99, 1 
}, { 75, 1 }, -#if CONFIG_EXT_PARTITION - { 255, 1 }, { 255, 1 }, { 255, 1 }, -#endif // CONFIG_EXT_PARTITION - { 208, 128 }, { 208, 128 }, { 208, 128 }, { 208, 128 }, { 208, 1 }, - { 208, 1 }, -#if CONFIG_EXT_PARTITION - { 208, 1 }, { 208, 1 } -#endif // CONFIG_EXT_PARTITION - }; -#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -static const aom_prob - default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 255 }, { 255 }, { 255 }, -#endif - { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 }, - { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 }, -#if CONFIG_EXT_PARTITION - { 255 }, { 255 }, { 255 }, -#endif // CONFIG_EXT_PARTITION - { 208 }, { 208 }, { 208 }, { 208 }, { 255 }, { 255 }, -#if CONFIG_EXT_PARTITION - { 255 }, { 255 } -#endif // CONFIG_EXT_PARTITION - }; -#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE -static const aom_prob - default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 255 }, { 255 }, { 255 }, -#endif - { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 }, - { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 }, -#if CONFIG_EXT_PARTITION - { 255 }, { 255 }, { 255 }, -#endif // CONFIG_EXT_PARTITION - { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, -#if CONFIG_EXT_PARTITION - { 208 }, { 208 } -#endif // CONFIG_EXT_PARTITION - }; -#else -static const aom_prob default_compound_type_probs[BLOCK_SIZES_ALL] - [COMPOUND_TYPES - 1]; -#endif // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE - -#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(16384), AOM_ICDF(24576), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32704), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32704), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8448), AOM_ICDF(13293), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(12436), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10112), AOM_ICDF(12679), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9088), AOM_ICDF(10753), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10368), AOM_ICDF(12906), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10368), AOM_ICDF(12643), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8832), AOM_ICDF(10609), AOM_ICDF(32768), 0 }, - { AOM_ICDF(13312), AOM_ICDF(13388), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12672), AOM_ICDF(12751), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9600), AOM_ICDF(9691), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 }, // 255, 1 - { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 }, // 208, 1 - { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 }, -#endif - }; -#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, // 255 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, // 208 - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, // 216 - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28672), AOM_ICDF(32768), 0 }, // 224 - { AOM_ICDF(28672), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30720), AOM_ICDF(32768), 0 }, // 240 - { AOM_ICDF(30720), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, // 255 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - }; -#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, // 255 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, // 208 - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, // 216 - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(27648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28672), AOM_ICDF(32768), 0 }, // 224 - { AOM_ICDF(28672), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30720), AOM_ICDF(32768), 0 }, // 240 - { AOM_ICDF(30720), AOM_ICDF(32768), 0 }, -#if 
CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, // 255 - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, // 208 - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26624), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - }; -#endif // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE - -#if CONFIG_INTERINTRA -static const aom_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = { - 128, 226, 244, 254, -}; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( - 2)] = { { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(254 * 128), AOM_ICDF(32768), 0 } }; -#endif - -static const aom_prob - default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = { - { 128, 128, 128 }, // block_size < 8x8 - { 24, 34, 119 }, // block_size < 16x16 - { 38, 33, 95 }, // block_size < 32x32 - { 51, 21, 110 }, // block_size >= 32x32 - }; -static const aom_cdf_prob - default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( - INTERINTRA_MODES)] = { - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3072), AOM_ICDF(7016), AOM_ICDF(18987), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4864), AOM_ICDF(8461), AOM_ICDF(17481), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6528), AOM_ICDF(8681), AOM_ICDF(19031), AOM_ICDF(32768), 0 } - }; - -static const aom_prob default_wedge_interintra_prob[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 128, 128, 128, -#endif - 128, 128, 128, 194, 213, 217, 222, 224, 226, 220, 128, 128, 128, -#if 
CONFIG_EXT_PARTITION - 255, 255, 255, -#endif // CONFIG_EXT_PARTITION - 208, 208, 208, 208, 255, 255, -#if CONFIG_EXT_PARTITION - 255, 255 -#endif // CONFIG_EXT_PARTITION -}; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(194 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(213 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(217 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(222 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(224 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(220 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - }; -#endif // CONFIG_NEW_MULTISYMBOL - -#endif // CONFIG_INTERINTRA - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#ifdef TWO_MODE -const aom_tree_index av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)] = { - -NCOBMC_MODE_0, -NCOBMC_MODE_1 -}; -#else -const aom_tree_index 
av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)] = { - -NCOBMC_MODE_0, 2, - -NCOBMC_MODE_1, 4, - -NCOBMC_MODE_2, 6, - -NCOBMC_MODE_3, 8, - -NCOBMC_MODE_4, 10, - -NCOBMC_MODE_5, 12, - -NCOBMC_MODE_6, -NCOBMC_MODE_7 -}; -#endif // TWO_MODE - -// TODO(weitinglin): find default prob -// right now setting the first mode with probability 1/255, -// the last eight modes with equal probabilities -static const aom_prob - default_ncobmc_mode_prob[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES - 1] = { -#ifdef TWO_MODE - { 127 }, { 127 }, { 127 }, { 127 } -#else - { 32, 36, 43, 51, 64, 85, 128 }, // 8x8 - { 32, 36, 43, 51, 64, 85, 128 }, // 16X16 - { 32, 36, 43, 51, 64, 85, 128 }, // 32X32 - { 32, 36, 43, 51, 64, 85, 128 } // 64X64 -#endif // TWO_MODE - }; -static const aom_cdf_prob - default_ncobmc_mode_cdf[ADAPT_OVERLAP_BLOCKS][CDF_SIZE(MAX_NCOBMC_MODES)] = -#ifdef TWO_MODE - { { AOM_ICDF(16256), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16256), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16256), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16256), AOM_ICDF(32768), 0 } }; -#else - { { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384), - AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384), - AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384), - AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384), - AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), - 0 } }; -#endif // TWO_MODEE -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -// Change this section appropriately once warped motion is supported -#if CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT -const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = { - -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -NCOBMC_ADAPT_WEIGHT, -}; 
-static const aom_prob - default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 255, 255 }, - { 255, 255 }, - { 255, 255 }, -#endif - { 255, 255 }, - { 255, 255 }, - { 255, 255 }, - /** Only these nine block sizes allow ncobmc_adapt_weight **/ - { 45, 207 }, - { 42, 211 }, - { 34, 207 }, - { 181, 123 }, - { 129, 141 }, - { 15, 209 }, - { 231, 122 }, - { 195, 190 }, - { 168, 190 }, - /** ----------------------------------------------------- **/ - { 244, 255 }, -#if CONFIG_EXT_PARTITION - { 252, 255 }, - { 252, 255 }, - { 252, 255 }, -#endif // CONFIG_EXT_PARTITION - { 255, 200 }, - { 255, 200 }, - { 255, 200 }, - { 255, 200 }, -#if CONFIG_EXT_PARTITION - { 252, 255 }, - { 252, 200 }, - { 252, 200 }, -#endif // CONFIG_EXT_PARTITION - }; -static const aom_cdf_prob - default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, -#endif - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 }, - /** Only these seven block sizes allow ncobmc_adapt_weight **/ - { AOM_ICDF(5702), AOM_ICDF(27555), AOM_ICDF(32768), 0 }, - { AOM_ICDF(5408), AOM_ICDF(27964), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4330), AOM_ICDF(27298), AOM_ICDF(32768), 0 }, - { AOM_ICDF(23107), AOM_ICDF(27760), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16490), AOM_ICDF(25461), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1959), AOM_ICDF(27153), AOM_ICDF(32768), 0 }, - { AOM_ICDF(29530), AOM_ICDF(31073), AOM_ICDF(32768), 0 }, - { AOM_ICDF(25057), AOM_ICDF(30840), AOM_ICDF(32768), 0 }, - { AOM_ICDF(21588), AOM_ICDF(29940), AOM_ICDF(32768), 0 }, - /** ----------------------------------------------------- **/ - { AOM_ICDF(244 * 128), AOM_ICDF(32768), AOM_ICDF(32768), 0 }, -#if 
CONFIG_EXT_PARTITION - { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 } -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, -#endif - }; -#else // CONFIG_NCOBMC_ADAPT_WEIGHT -const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = { - -SIMPLE_TRANSLATION, -OBMC_CAUSAL -}; - -static const aom_prob - default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 255 }, { 255 }, { 255 }, -#endif - { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 }, - { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 }, -#if CONFIG_EXT_PARTITION - { 252 }, { 252 }, { 252 }, -#endif // CONFIG_EXT_PARTITION - { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, -#if CONFIG_EXT_PARTITION - { 208 }, { 208 } -#endif // CONFIG_EXT_PARTITION - }; - -static const aom_cdf_prob - default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(151 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(153 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(178 * 128), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(165 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(207 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(195 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(168 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - }; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#elif !CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - -const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = { - -SIMPLE_TRANSLATION, -WARPED_CAUSAL -}; - -static const aom_prob - default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 255 }, { 255 }, { 255 }, -#endif - { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 }, - { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 }, -#if CONFIG_EXT_PARTITION - { 252 }, { 252 }, { 252 }, -#endif // CONFIG_EXT_PARTITION - { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, -#if CONFIG_EXT_PARTITION - { 252 }, { 252 } -#endif // CONFIG_EXT_PARTITION - }; - -static const aom_cdf_prob - default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(255 * 128), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(151 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(153 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(178 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(165 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(207 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(195 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(168 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - }; - -#elif CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT -const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = { - -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, 4, -NCOBMC_ADAPT_WEIGHT, -WARPED_CAUSAL -}; - -static const aom_prob default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 128, 128, 255 }, { 128, 128, 128 }, { 128, 128, 128 }, -#endif - { 128, 128, 128 }, { 128, 128, 128 }, { 128, 128, 128 }, { 62, 115, 128 }, - { 39, 131, 128 }, { 39, 132, 128 }, { 118, 94, 128 }, { 77, 125, 128 }, - { 100, 121, 128 }, { 190, 66, 128 }, { 207, 102, 128 }, { 197, 100, 128 }, - { 239, 76, 128 }, -#if CONFIG_EXT_PARTITION - { 252, 200, 128 }, { 252, 200, 128 }, { 252, 200, 128 
}, -#endif // CONFIG_EXT_PARTITION - { 208, 200, 128 }, { 208, 200, 128 }, { 208, 200, 128 }, { 208, 200, 128 } -}; -static const aom_cdf_prob - default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - /** Only these nine block sizes allow ncobmc_adapt_weight **/ - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - /***********************************************************/ - { AOM_ICDF(30592), AOM_ICDF(31238), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), 
AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 } - }; - -const aom_tree_index av1_ncobmc_tree[TREE_SIZE(OBMC_FAMILY_MODES)] = { - -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -NCOBMC_ADAPT_WEIGHT -}; - -static const aom_prob - default_ncobmc_prob[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 128, 255 }, { 128, 255 }, { 128, 255 }, -#endif - { 128, 255 }, { 128, 255 }, { 128, 255 }, { 45, 255 }, { 79, 255 }, - { 75, 255 }, { 130, 255 }, { 141, 255 }, { 144, 255 }, { 208, 255 }, - { 201, 255 }, { 186, 255 }, { 231, 255 }, -#if CONFIG_EXT_PARTITION - { 252, 255 }, { 252, 255 }, { 252, 255 }, -#endif // CONFIG_EXT_PARTITION - { 208, 255 }, { 208, 255 }, { 208, 255 }, { 208, 255 } - }; - -static const aom_cdf_prob - default_ncobmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(OBMC_FAMILY_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - /** Only these nine block sizes allow ncobmc_adapt_weight **/ - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 }, - /***********************************************************/ - { AOM_ICDF(231 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 } - }; -#else -const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = { - -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -WARPED_CAUSAL, -}; - -static const aom_prob - default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 128, 128 }, { 128, 128 }, { 128, 128 }, -#endif - { 128, 128 }, { 128, 128 }, { 128, 128 }, { 62, 115 }, { 39, 131 }, - { 39, 132 }, { 118, 94 }, { 77, 125 }, { 100, 121 }, { 190, 66 }, - { 207, 102 }, { 197, 100 }, { 239, 76 }, -#if CONFIG_EXT_PARTITION - { 252, 200 }, { 252, 200 }, { 252, 200 }, -#endif // CONFIG_EXT_PARTITION - { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 }, - { 208, 200 }, -#if CONFIG_EXT_PARTITION - { 252, 200 }, { 252, 200 } -#endif // CONFIG_EXT_PARTITION - }; -static const aom_cdf_prob - default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, 
- { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7936), AOM_ICDF(19091), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4991), AOM_ICDF(19205), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4992), AOM_ICDF(19314), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15104), AOM_ICDF(21590), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9855), AOM_ICDF(21043), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12800), AOM_ICDF(22238), AOM_ICDF(32768), 0 }, - { AOM_ICDF(24320), AOM_ICDF(26498), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26496), AOM_ICDF(28995), AOM_ICDF(32768), 0 }, - { AOM_ICDF(25216), AOM_ICDF(28166), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30592), AOM_ICDF(31238), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 }, -#endif - }; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -// Probability for the case that only 1 additional motion mode is allowed -static const aom_prob default_obmc_prob[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 128, 128, 128, -#endif - 128, 128, 128, 45, 79, 75, 130, 141, 144, 208, 201, 186, 231, -#if CONFIG_EXT_PARTITION - 252, 252, 252, -#endif // CONFIG_EXT_PARTITION - 208, 208, 208, 208, 208, 208, -#if 
CONFIG_EXT_PARTITION - 252, 252 -#endif // CONFIG_EXT_PARTITION -}; - -#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT -static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(45 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(79 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(75 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(130 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(141 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(201 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(186 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(231 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_EXT_PARTITION -}; -#endif // CONFIG_NEW_MULTISYMBOL -#endif - -static const aom_prob default_delta_q_probs[DELTA_Q_PROBS] = { 220, 220, 220 }; -static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { - AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 -}; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL -static const 
aom_prob - default_delta_lf_multi_probs[FRAME_LF_COUNT][DELTA_LF_PROBS] = { - { 220, 220, 220 }, { 220, 220, 220 }, { 220, 220, 220 }, { 220, 220, 220 } - }; -static const aom_cdf_prob - default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)] = { - { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 } - }; -#endif // CONFIG_LOOPFILTER_LEVEL -static const aom_prob default_delta_lf_probs[DELTA_LF_PROBS] = { 220, 220, - 220 }; -static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { - AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 -}; -#endif - -/* clang-format off */ -#if CONFIG_INTERINTRA -const aom_tree_index av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = { - -II_DC_PRED, 2, /* 0 = II_DC_NODE */ - -II_SMOOTH_PRED, 4, /* 1 = II_SMOOTH_PRED */ - -II_V_PRED, -II_H_PRED /* 2 = II_V_NODE */ -}; -#endif // CONFIG_INTERINTRA - -const aom_tree_index av1_inter_compound_mode_tree - [TREE_SIZE(INTER_COMPOUND_MODES)] = { - -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2, - -INTER_COMPOUND_OFFSET(NEAREST_NEARESTMV), 4, - 6, -INTER_COMPOUND_OFFSET(NEW_NEWMV), - -INTER_COMPOUND_OFFSET(NEAR_NEARMV), 8, - 10, 12, - -INTER_COMPOUND_OFFSET(NEAREST_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARESTMV), - -INTER_COMPOUND_OFFSET(NEAR_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARMV) -}; - -#if CONFIG_COMPOUND_SINGLEREF -// TODO(zoeliu): To redesign the tree structure once the number of mode changes. 
-/* -const aom_tree_index av1_inter_singleref_comp_mode_tree - [TREE_SIZE(INTER_SINGLEREF_COMP_MODES)] = { - -INTER_SINGLEREF_COMP_OFFSET(SR_ZERO_NEWMV), 2, - -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEARMV), 4, - 6, -INTER_SINGLEREF_COMP_OFFSET(SR_NEW_NEWMV), - -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEWMV), - -INTER_SINGLEREF_COMP_OFFSET(SR_NEAR_NEWMV) -};*/ - -const aom_tree_index av1_inter_singleref_comp_mode_tree - [TREE_SIZE(INTER_SINGLEREF_COMP_MODES)] = { - -INTER_SINGLEREF_COMP_OFFSET(SR_ZERO_NEWMV), 2, - -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEARMV), 4, - -INTER_SINGLEREF_COMP_OFFSET(SR_NEAR_NEWMV), - -INTER_SINGLEREF_COMP_OFFSET(SR_NEW_NEWMV) -}; -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = { - -COMPOUND_AVERAGE, 2, -COMPOUND_WEDGE, -COMPOUND_SEG -}; -#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = { - -COMPOUND_AVERAGE, -COMPOUND_WEDGE -}; -#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE -const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = { - -COMPOUND_AVERAGE, -COMPOUND_SEG -}; -#else -const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {}; -#endif // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE -/* clang-format on */ - -const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { - -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT -}; - -#if CONFIG_EXT_PARTITION_TYPES -/* clang-format off */ -const aom_tree_index av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = { - -PARTITION_NONE, 2, - 6, 4, - 8, -PARTITION_SPLIT, - -PARTITION_HORZ, 10, - -PARTITION_VERT, 14, - - -PARTITION_HORZ_A, 12, - -PARTITION_HORZ_B, -PARTITION_HORZ_4, - - -PARTITION_VERT_A, 16, - -PARTITION_VERT_B, -PARTITION_VERT_4 -}; -/* clang-format on */ -#endif // CONFIG_EXT_PARTITION_TYPES - -static const aom_prob 
default_intra_inter_p[INTRA_INTER_CONTEXTS] = { - 6, 97, 151, 205, -}; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)] = { - { AOM_ICDF(768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12416), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19328), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26240), AOM_ICDF(32768), 0 } - }; -#endif - -static const aom_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { - 190, 156, 91, 77, 22 -}; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( - 2)] = { { AOM_ICDF(24290), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19956), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11641), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9804), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2842), AOM_ICDF(32768), 0 } }; -#endif // CONFIG_NEW_MULTISYMBOL - -#if CONFIG_EXT_COMP_REFS -static const aom_prob default_comp_ref_type_p[COMP_REF_TYPE_CONTEXTS] = { - 8, 20, 78, 91, 194 -}; -static const aom_prob - default_uni_comp_ref_p[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] = { - { 88, 30, 28 }, { 218, 97, 105 }, { 254, 180, 196 } - }; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)] = { - { AOM_ICDF(8 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(20 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(78 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(91 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(194 * 128), AOM_ICDF(32768), 0 } - }; -static const aom_cdf_prob - default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] - [CDF_SIZE(2)] = { - { { AOM_ICDF(88 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(218 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(97 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(105 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(254 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(180 * 128), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(196 * 128), AOM_ICDF(32768), 0 } } - }; -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_EXT_COMP_REFS - -#if CONFIG_EXT_REFS -static const aom_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = { - { 28, 10, 8 }, - { 77, 27, 26 }, - { 127, 62, 56 }, - { 186, 126, 160 }, - { 236, 143, 172 } -}; - -static const aom_prob default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1] = { - { 22, 13 }, { 140, 124 }, { 241, 239 }, { 128, 128 }, { 128, 128 } -}; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { - { { AOM_ICDF(3556), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1217), AOM_ICDF(32768), 0 }, - { AOM_ICDF(988), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(9857), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3394), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3303), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(16237), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7946), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7195), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(23826), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16124), AOM_ICDF(32768), 0 }, - { AOM_ICDF(20536), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(30195), AOM_ICDF(32768), 0 }, - { AOM_ICDF(18344), AOM_ICDF(32768), 0 }, - { AOM_ICDF(21980), AOM_ICDF(32768), 0 } } - }; - -static const aom_cdf_prob - default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { - { { AOM_ICDF(2762), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1614), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(17976), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15912), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(30894), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30639), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32768), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32768), AOM_ICDF(32768), 0 } } - }; -#endif // CONFIG_NEW_MULTISYMBOL - -#else // !CONFIG_EXT_REFS - -static const aom_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = { - { 43 }, { 100 }, { 137 }, { 212 }, { 229 }, -}; -#if 
CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_comp_ref_cdf[REF_CONTEXTS][COMP_REFS - 1][CDF_SIZE(2)] = { - { { AOM_ICDF(43 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(100 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(137 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(212 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(229 * 128), AOM_ICDF(32768), 0 } } - }; -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_EXT_REFS - -static const aom_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = { -#if CONFIG_EXT_REFS - { 36, 16, 32, 57, 11, 14 }, - { 68, 128, 73, 128, 49, 124 }, - { 136, 236, 127, 170, 81, 238 }, - { 128, 128, 191, 211, 115, 128 }, - { 224, 128, 230, 242, 208, 128 } -#else // !CONFIG_EXT_REFS - { 31, 25 }, { 72, 80 }, { 147, 148 }, { 197, 191 }, { 235, 247 }, -#endif // CONFIG_EXT_REFS -}; - -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)] = { -#if CONFIG_EXT_REFS - { { AOM_ICDF(4623), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2110), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4132), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7309), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1392), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1781), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(8659), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16372), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9371), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16322), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6216), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15834), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(17353), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30182), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16300), AOM_ICDF(32768), 0 }, - { AOM_ICDF(21702), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10365), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30486), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(24426), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26972), AOM_ICDF(32768), 0 }, - { AOM_ICDF(14760), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(32768), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(28634), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32768), AOM_ICDF(32768), 0 }, - { AOM_ICDF(29425), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30969), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26676), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32768), AOM_ICDF(32768), 0 } } -#else // !CONFIG_EXT_REFS - { { AOM_ICDF(31 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(25 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(72 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(80 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(147 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(148 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(197 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(191 * 128), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(235 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(247 * 128), AOM_ICDF(32768), 0 } } -#endif // CONFIG_EXT_REFS - }; -#endif // CONFIG_NEW_MULTISYMBOL - -#if CONFIG_COMPOUND_SINGLEREF -// TODO(zoeliu): Default values to be further adjusted based on the collected -// stats. 
-static const aom_prob default_comp_inter_mode_p[COMP_INTER_MODE_CONTEXTS] = { - 40, 110, 160, 220 -}; -#endif // CONFIG_COMPOUND_SINGLEREF - -// TODO(huisu): tune these cdfs -const aom_cdf_prob - default_palette_y_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE(PALETTE_SIZES)] = { - { AOM_ICDF(12288), AOM_ICDF(19408), AOM_ICDF(24627), AOM_ICDF(26662), - AOM_ICDF(28499), AOM_ICDF(30667), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2815), AOM_ICDF(4570), AOM_ICDF(9416), AOM_ICDF(10875), - AOM_ICDF(13782), AOM_ICDF(19863), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3839), AOM_ICDF(5986), AOM_ICDF(11949), AOM_ICDF(13413), - AOM_ICDF(16286), AOM_ICDF(21823), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12032), AOM_ICDF(14948), AOM_ICDF(22187), AOM_ICDF(23138), - AOM_ICDF(24756), AOM_ICDF(27635), AOM_ICDF(32768), 0 }, - { AOM_ICDF(14847), AOM_ICDF(20167), AOM_ICDF(25433), AOM_ICDF(26751), - AOM_ICDF(28278), AOM_ICDF(30119), AOM_ICDF(32768), 0 }, - { AOM_ICDF(14336), AOM_ICDF(20240), AOM_ICDF(24840), AOM_ICDF(26079), - AOM_ICDF(27908), AOM_ICDF(30034), AOM_ICDF(32768), 0 }, - { AOM_ICDF(18816), AOM_ICDF(25574), AOM_ICDF(29030), AOM_ICDF(29877), - AOM_ICDF(30656), AOM_ICDF(31506), AOM_ICDF(32768), 0 }, - { AOM_ICDF(23039), AOM_ICDF(27333), AOM_ICDF(30220), AOM_ICDF(30708), - AOM_ICDF(31070), AOM_ICDF(31826), AOM_ICDF(32768), 0 }, - { AOM_ICDF(13696), AOM_ICDF(18911), AOM_ICDF(23620), AOM_ICDF(25371), - AOM_ICDF(29821), AOM_ICDF(31617), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762), - AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762), - AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762), - AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762), - AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 }, -#endif - }; - 
-const aom_cdf_prob default_palette_uv_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE( - PALETTE_SIZES)] = { - { AOM_ICDF(20480), AOM_ICDF(29888), AOM_ICDF(32453), AOM_ICDF(32715), - AOM_ICDF(32751), AOM_ICDF(32766), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11135), AOM_ICDF(23641), AOM_ICDF(31056), AOM_ICDF(31998), - AOM_ICDF(32496), AOM_ICDF(32668), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(23108), AOM_ICDF(30806), AOM_ICDF(31871), - AOM_ICDF(32414), AOM_ICDF(32637), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9984), AOM_ICDF(21999), AOM_ICDF(29192), AOM_ICDF(30645), - AOM_ICDF(31640), AOM_ICDF(32402), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7552), AOM_ICDF(16614), AOM_ICDF(24880), AOM_ICDF(27283), - AOM_ICDF(29254), AOM_ICDF(31203), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9600), AOM_ICDF(20279), AOM_ICDF(27548), AOM_ICDF(29261), - AOM_ICDF(30494), AOM_ICDF(31631), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11391), AOM_ICDF(18656), AOM_ICDF(23727), AOM_ICDF(26058), - AOM_ICDF(27788), AOM_ICDF(30278), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8576), AOM_ICDF(13585), AOM_ICDF(17632), AOM_ICDF(20884), - AOM_ICDF(23948), AOM_ICDF(27152), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15360), AOM_ICDF(24200), AOM_ICDF(26978), AOM_ICDF(30846), - AOM_ICDF(31409), AOM_ICDF(32545), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689), - AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689), - AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689), - AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689), - AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 }, -#endif -}; - -// When palette mode is enabled, following probability tables indicate the -// probabilities to code the "is_palette" bit (i.e. 
the bit that indicates -// if this block uses palette mode or DC_PRED mode). -const aom_prob av1_default_palette_y_mode_prob - [PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] = { - { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 }, - { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 }, - { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 }, - { 240, 180, 100 }, -#if CONFIG_EXT_PARTITION - { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 }, -#endif // CONFIG_EXT_PARTITION - }; - -const aom_prob av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS] = { - 253, 229 -}; - -#if CONFIG_NEW_MULTISYMBOL -const aom_cdf_prob - default_palette_y_mode_cdf[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] - [CDF_SIZE(2)] = { - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, -#if CONFIG_EXT_PARTITION - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } }, -#endif // CONFIG_EXT_PARTITION - }; - -const aom_cdf_prob - default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { - { AOM_ICDF(128 * 253), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 229), AOM_ICDF(32768), 0 } - }; - -#endif - -const aom_cdf_prob default_palette_y_color_index_cdf - [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { - { - { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - }, - { - { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768), - 0, 0, 0, 
0, 0 }, - { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(9728), AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660), - AOM_ICDF(32768), 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600), - AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440), - AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698), - AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566), - AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977), - AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 }, - }, - { - { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274), - AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986), - AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515), - AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159), - AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199), - AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 }, - 
}, - { - { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153), - AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(6655), AOM_ICDF(17569), AOM_ICDF(19587), AOM_ICDF(23345), - AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(3584), AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158), - AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893), - AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337), - AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768), - 0 }, - }, - }; - -const aom_cdf_prob default_palette_uv_color_index_cdf - [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { - { - { AOM_ICDF(29824), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(30720), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(27648), AOM_ICDF(30208), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(14080), AOM_ICDF(26563), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(5120), AOM_ICDF(30932), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(24448), AOM_ICDF(27828), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(31616), AOM_ICDF(32219), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - }, - { - { AOM_ICDF(25856), AOM_ICDF(28259), AOM_ICDF(30584), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(11520), AOM_ICDF(22476), AOM_ICDF(27944), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(8064), AOM_ICDF(26882), AOM_ICDF(30308), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(19455), AOM_ICDF(23823), AOM_ICDF(29134), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(30848), 
AOM_ICDF(31501), AOM_ICDF(32174), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(26751), AOM_ICDF(28020), AOM_ICDF(29541), AOM_ICDF(31230), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(12032), AOM_ICDF(26045), AOM_ICDF(30772), AOM_ICDF(31497), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(1280), AOM_ICDF(32153), AOM_ICDF(32458), AOM_ICDF(32560), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(23424), AOM_ICDF(24154), AOM_ICDF(29201), AOM_ICDF(29856), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(32256), AOM_ICDF(32402), AOM_ICDF(32561), AOM_ICDF(32682), - AOM_ICDF(32768), 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(24576), AOM_ICDF(26720), AOM_ICDF(28114), AOM_ICDF(28950), - AOM_ICDF(31694), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(7551), AOM_ICDF(16613), AOM_ICDF(20462), AOM_ICDF(25269), - AOM_ICDF(29077), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(6272), AOM_ICDF(23039), AOM_ICDF(25623), AOM_ICDF(28163), - AOM_ICDF(30861), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(17024), AOM_ICDF(18808), AOM_ICDF(20771), AOM_ICDF(27941), - AOM_ICDF(29845), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(31616), AOM_ICDF(31936), AOM_ICDF(32079), AOM_ICDF(32321), - AOM_ICDF(32546), AOM_ICDF(32768), 0, 0, 0 }, - }, - { - { AOM_ICDF(23296), AOM_ICDF(25590), AOM_ICDF(27833), AOM_ICDF(29337), - AOM_ICDF(29954), AOM_ICDF(31229), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(7552), AOM_ICDF(13659), AOM_ICDF(16570), AOM_ICDF(21695), - AOM_ICDF(24506), AOM_ICDF(27701), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(6911), AOM_ICDF(24788), AOM_ICDF(26284), AOM_ICDF(27753), - AOM_ICDF(29575), AOM_ICDF(30872), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(17535), AOM_ICDF(22236), AOM_ICDF(24457), AOM_ICDF(26242), - AOM_ICDF(27363), AOM_ICDF(30191), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(30592), AOM_ICDF(31289), AOM_ICDF(31745), AOM_ICDF(31921), - AOM_ICDF(32149), AOM_ICDF(32321), AOM_ICDF(32768), 0, 0 }, - }, - { - { AOM_ICDF(22016), AOM_ICDF(24242), AOM_ICDF(25141), AOM_ICDF(27137), - AOM_ICDF(27797), AOM_ICDF(29331), 
AOM_ICDF(30848), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(8063), AOM_ICDF(13564), AOM_ICDF(16940), AOM_ICDF(21948), - AOM_ICDF(24568), AOM_ICDF(25689), AOM_ICDF(26989), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(6528), AOM_ICDF(27028), AOM_ICDF(27835), AOM_ICDF(28741), - AOM_ICDF(30031), AOM_ICDF(31795), AOM_ICDF(32285), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(18047), AOM_ICDF(23797), AOM_ICDF(25444), AOM_ICDF(26274), - AOM_ICDF(27111), AOM_ICDF(27929), AOM_ICDF(30367), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(30208), AOM_ICDF(30628), AOM_ICDF(31046), AOM_ICDF(31658), - AOM_ICDF(31762), AOM_ICDF(32367), AOM_ICDF(32469), AOM_ICDF(32768), - 0 }, - } - }; -#if CONFIG_MRC_TX -// TODO(sarahparker) Tune these cdfs -const aom_cdf_prob default_mrc_mask_intra_cdf - [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { - { - { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - }, - { - { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - }, - { - 
{ AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(9728), AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660), - AOM_ICDF(32768), 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600), - AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440), - AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698), - AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566), - AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977), - AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 }, - }, - { - { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274), - AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986), - AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515), - AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159), - AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199), - AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 }, - }, - { - { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153), - AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(6655), AOM_ICDF(17569), 
AOM_ICDF(19587), AOM_ICDF(23345), - AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(3584), AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158), - AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893), - AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337), - AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768), - 0 }, - }, - }; - -const aom_cdf_prob default_mrc_mask_inter_cdf - [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { - { - { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0, - 0 }, - }, - { - { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768), - 0, 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(9728), 
AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932), - AOM_ICDF(32768), 0, 0, 0, 0 }, - { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660), - AOM_ICDF(32768), 0, 0, 0, 0 }, - }, - { - { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600), - AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440), - AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698), - AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566), - AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 }, - { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977), - AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 }, - }, - { - { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274), - AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986), - AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515), - AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159), - AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 }, - { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199), - AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 }, - }, - { - { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153), - AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(6655), AOM_ICDF(17569), AOM_ICDF(19587), AOM_ICDF(23345), - AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(3584), 
AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158), - AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893), - AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337), - AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768), - 0 }, - }, - }; -#endif // CONFIG_MRC_TX - -#if CONFIG_INTRABC -static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { - AOM_ICDF(192 * 128), AOM_ICDF(32768), 0, -}; -#endif // CONFIG_INTRABC - -#define MAX_COLOR_CONTEXT_HASH 8 -// Negative values are invalid -static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + - 1] = { -1, -1, 0, -1, -1, - 4, 3, 2, 1 }; - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) -static const aom_prob default_quarter_tx_size_prob = 192; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob default_quarter_tx_size_cdf[CDF_SIZE(2)] = { - AOM_ICDF(192 * 128), AOM_ICDF(32768), 0 -}; -#endif -#endif - -#if CONFIG_LOOP_RESTORATION -const aom_tree_index - av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)] = { - -RESTORE_NONE, 2, -RESTORE_WIENER, -RESTORE_SGRPROJ, - }; - -static const aom_prob - default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1] = { - 32, 128, - }; -#endif // CONFIG_LOOP_RESTORATION - -#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. -int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, - int r, int c, int palette_size, - uint8_t *color_order, int *color_idx) { - int i; - // The +10 below should not be needed. 
But we get a warning "array subscript - // is above array bounds [-Werror=array-bounds]" without it, possibly due to - // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 - int scores[PALETTE_MAX_SIZE + 10]; - const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; - const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; - int color_index_ctx_hash; - int color_index_ctx; - int color_neighbors[NUM_PALETTE_NEIGHBORS]; - int inverse_color_order[PALETTE_MAX_SIZE]; - assert(palette_size <= PALETTE_MAX_SIZE); - assert(r > 0 || c > 0); - - // Get color indices of neighbors. - color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; - color_neighbors[1] = - (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; - color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; - - for (i = 0; i < PALETTE_MAX_SIZE; ++i) { - color_order[i] = i; - inverse_color_order[i] = i; - } - memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0])); - for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { - if (color_neighbors[i] >= 0) { - scores[color_neighbors[i]] += weights[i]; - } - } - - // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). - for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { - int max = scores[i]; - int max_idx = i; - int j; - for (j = i + 1; j < palette_size; ++j) { - if (scores[j] > max) { - max = scores[j]; - max_idx = j; - } - } - if (max_idx != i) { - // Move the score at index 'max_idx' to index 'i', and shift the scores - // from 'i' to 'max_idx - 1' by 1. - const int max_score = scores[max_idx]; - const uint8_t max_color_order = color_order[max_idx]; - int k; - for (k = max_idx; k > i; --k) { - scores[k] = scores[k - 1]; - color_order[k] = color_order[k - 1]; - inverse_color_order[color_order[k]] = k; - } - scores[i] = max_score; - color_order[i] = max_color_order; - inverse_color_order[color_order[i]] = i; - } - } - - // Get hash value of context. 
- color_index_ctx_hash = 0; - for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { - color_index_ctx_hash += scores[i] * hash_multipliers[i]; - } - assert(color_index_ctx_hash > 0); - assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); - - // Lookup context from hash. - color_index_ctx = palette_color_index_context_lookup[color_index_ctx_hash]; - assert(color_index_ctx >= 0); - assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); - - if (color_idx != NULL) { - *color_idx = inverse_color_order[color_map[r * stride + c]]; - } - return color_index_ctx; -} -#undef NUM_PALETTE_NEIGHBORS -#undef MAX_COLOR_CONTEXT_HASH - -#if CONFIG_VAR_TX -static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = { -#if CONFIG_TX64X64 - 249, 240, 223, 249, 229, 177, 250, 243, 208, 226, 187, - 145, 236, 204, 150, 183, 149, 125, 181, 146, 113, 128 -#else - 250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128 -#endif // CONFIG_TX64X64 -}; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { -#if CONFIG_TX64X64 - { AOM_ICDF(249 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(240 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(223 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(249 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(229 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(177 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(250 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(243 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(187 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(145 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(236 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(204 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(150 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(183 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(149 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(125 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(181 * 128), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(146 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(113 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 } -#else - { AOM_ICDF(250 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(231 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(212 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(241 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(166 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(66 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(241 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(230 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(135 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(243 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(154 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(64 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(248 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(161 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(63 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, -#endif // CONFIG_TX64X64 - }; -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_VAR_TX - -static const aom_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 }; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { - { AOM_ICDF(24576), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8192), AOM_ICDF(32768), 0 } -}; -#endif - -#if CONFIG_LGT_FROM_PRED -static const aom_prob default_intra_lgt_prob[LGT_SIZES][INTRA_MODES] = { - { 255, 208, 208, 180, 230, 208, 194, 214, 220, 255, -#if CONFIG_SMOOTH_HV - 220, 220, -#endif - 230 }, - { 255, 192, 216, 180, 180, 180, 180, 200, 200, 255, -#if CONFIG_SMOOTH_HV - 220, 220, -#endif - 222 }, -}; - -static const aom_prob default_inter_lgt_prob[LGT_SIZES] = { 230, 230 }; -#endif // CONFIG_LGT_FROM_PRED - -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -static const aom_prob - default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1] = { - { 98, 63, 60 }, { 98, 82, 80 }, { 94, 65, 103 }, - { 49, 25, 24 }, { 72, 38, 50 }, - }; -const 
aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)] = { - -INTRA_FILTER_LINEAR, 2, -INTRA_FILTER_8TAP, 4, -INTRA_FILTER_8TAP_SHARP, - -INTRA_FILTER_8TAP_SMOOTH, -}; -int av1_intra_filter_ind[INTRA_FILTERS]; -int av1_intra_filter_inv[INTRA_FILTERS]; -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - -#if CONFIG_FILTER_INTRA -static const aom_prob default_filter_intra_probs[2] = { 230, 230 }; -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_SUPERTX -static const aom_prob - default_supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES] = { -#if CONFIG_CHROMA_2X2 -#if CONFIG_TX64X64 - { 1, 1, 160, 160, 170, 180 }, { 1, 1, 200, 200, 210, 220 }, -#else - { 1, 1, 160, 160, 170 }, { 1, 1, 200, 200, 210 }, -#endif // CONFIG_TX64X64 -#else -#if CONFIG_TX64X64 - { 1, 160, 160, 170, 180 }, { 1, 200, 200, 210, 220 }, -#else - { 1, 160, 160, 170 }, { 1, 200, 200, 210 }, -#endif // CONFIG_TX64X64 -#endif // CONFIG_CHROMA_2X2 - }; -#endif // CONFIG_SUPERTX - -// FIXME(someone) need real defaults here -static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = { - 128, 128, 128, 128, 128, 128, 128 -}; -// clang-format off -static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = { - 128, 128, 128 -}; -#if CONFIG_NEW_MULTISYMBOL -static const aom_cdf_prob - default_segment_pred_cdf[PREDICTION_PROBS][CDF_SIZE(2)] = { - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0}, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0}, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0} -}; -#endif -// clang-format on -#if CONFIG_DUAL_FILTER -#if USE_EXTRA_FILTER -static const aom_cdf_prob - default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( - SWITCHABLE_FILTERS)] = { - { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 }, - }; -#else // USE_EXTRA_FILTER -static const aom_cdf_prob - default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( - SWITCHABLE_FILTERS)] = { - { AOM_ICDF(32256), AOM_ICDF(32654), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2816), AOM_ICDF(32651), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(764), AOM_ICDF(32768), 0 }, - { AOM_ICDF(30464), AOM_ICDF(31778), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32384), AOM_ICDF(32483), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3072), AOM_ICDF(32652), AOM_ICDF(32768), 0 }, - { AOM_ICDF(256), AOM_ICDF(383), AOM_ICDF(32768), 0 }, - { AOM_ICDF(25344), AOM_ICDF(26533), AOM_ICDF(32768), 0 }, - { AOM_ICDF(32000), AOM_ICDF(32531), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(32648), AOM_ICDF(32768), 0 }, - { AOM_ICDF(384), AOM_ICDF(890), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(28928), AOM_ICDF(31358), AOM_ICDF(32768), 0 }, - { AOM_ICDF(31616), AOM_ICDF(31787), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4224), AOM_ICDF(32433), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128), AOM_ICDF(256), AOM_ICDF(32768), 0 }, - { AOM_ICDF(17408), AOM_ICDF(18248), AOM_ICDF(32768), 0 } - }; -#endif // USE_EXTRA_FILTER -#else // CONFIG_DUAL_FILTER -static const aom_cdf_prob - default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( - SWITCHABLE_FILTERS)] = { - { AOM_ICDF(30080), AOM_ICDF(31781), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(32658), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(4685), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19072), AOM_ICDF(26776), AOM_ICDF(32768), 0 }, - }; -#endif // CONFIG_DUAL_FILTER - -static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = { - AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), - AOM_ICDF(16384), AOM_ICDF(20480), AOM_ICDF(24576), - AOM_ICDF(28672), AOM_ICDF(32768), 0 -}; - -static const aom_cdf_prob - default_tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + - 1)] = { - { { AOM_ICDF(12800), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8448), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(2560), AOM_ICDF(20496), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1920), AOM_ICDF(14091), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(384), AOM_ICDF(17588), AOM_ICDF(19782), AOM_ICDF(32768), 0 }, - { AOM_ICDF(640), AOM_ICDF(7166), AOM_ICDF(8466), AOM_ICDF(32768), 0 } }, -#if CONFIG_TX64X64 - { { AOM_ICDF(128), AOM_ICDF(8288), AOM_ICDF(21293), AOM_ICDF(26986), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(128), AOM_ICDF(4208), AOM_ICDF(10009), AOM_ICDF(15965), - AOM_ICDF(32768), 0 } }, -#endif - }; - -#if CONFIG_SMOOTH_HV -static const aom_cdf_prob - default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = { - { - AOM_ICDF(7168), AOM_ICDF(10680), AOM_ICDF(13913), AOM_ICDF(16928), - AOM_ICDF(20294), AOM_ICDF(22790), AOM_ICDF(24706), AOM_ICDF(26275), - AOM_ICDF(28139), 
AOM_ICDF(29751), AOM_ICDF(30563), AOM_ICDF(31468), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11776), AOM_ICDF(13823), AOM_ICDF(15307), AOM_ICDF(15725), - AOM_ICDF(16638), AOM_ICDF(17406), AOM_ICDF(17994), AOM_ICDF(18814), - AOM_ICDF(19634), AOM_ICDF(21513), AOM_ICDF(22198), AOM_ICDF(22928), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(14720), AOM_ICDF(16459), AOM_ICDF(18091), AOM_ICDF(18299), - AOM_ICDF(18757), AOM_ICDF(19125), AOM_ICDF(19423), AOM_ICDF(19924), - AOM_ICDF(20504), AOM_ICDF(22922), AOM_ICDF(24063), AOM_ICDF(25577), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(18944), AOM_ICDF(19925), AOM_ICDF(20908), AOM_ICDF(20998), - AOM_ICDF(21017), AOM_ICDF(21072), AOM_ICDF(21084), AOM_ICDF(21121), - AOM_ICDF(21159), AOM_ICDF(22064), AOM_ICDF(22820), AOM_ICDF(24290), - AOM_ICDF(32768), 0, - }, - }; - -#if CONFIG_CFL -static const aom_cdf_prob - default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = { - { AOM_ICDF(18377), AOM_ICDF(18815), AOM_ICDF(19743), AOM_ICDF(20178), - AOM_ICDF(20560), AOM_ICDF(20889), AOM_ICDF(21359), AOM_ICDF(22098), - AOM_ICDF(22481), AOM_ICDF(24563), AOM_ICDF(25781), AOM_ICDF(26662), - AOM_ICDF(28396), AOM_ICDF(32768), 0 }, - { AOM_ICDF(5350), AOM_ICDF(16837), AOM_ICDF(17066), AOM_ICDF(17360), - AOM_ICDF(17692), AOM_ICDF(18778), AOM_ICDF(18969), AOM_ICDF(19206), - AOM_ICDF(20291), AOM_ICDF(22367), AOM_ICDF(23212), AOM_ICDF(24670), - AOM_ICDF(27912), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6671), AOM_ICDF(6759), AOM_ICDF(17812), AOM_ICDF(17998), - AOM_ICDF(18260), AOM_ICDF(18384), AOM_ICDF(19408), AOM_ICDF(20667), - AOM_ICDF(20806), AOM_ICDF(22760), AOM_ICDF(24142), AOM_ICDF(24875), - AOM_ICDF(28072), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7461), AOM_ICDF(8082), AOM_ICDF(8515), AOM_ICDF(15013), - AOM_ICDF(15583), AOM_ICDF(16098), AOM_ICDF(16522), AOM_ICDF(18519), - AOM_ICDF(20348), AOM_ICDF(22954), AOM_ICDF(24130), AOM_ICDF(25342), - AOM_ICDF(26548), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3694), AOM_ICDF(4403), AOM_ICDF(5370), AOM_ICDF(5854), - 
AOM_ICDF(17841), AOM_ICDF(19639), AOM_ICDF(21625), AOM_ICDF(22224), - AOM_ICDF(22651), AOM_ICDF(24613), AOM_ICDF(25399), AOM_ICDF(26143), - AOM_ICDF(26599), AOM_ICDF(32768), 0 }, - { AOM_ICDF(3700), AOM_ICDF(5651), AOM_ICDF(6112), AOM_ICDF(6541), - AOM_ICDF(8929), AOM_ICDF(20623), AOM_ICDF(21213), AOM_ICDF(21640), - AOM_ICDF(22214), AOM_ICDF(24306), AOM_ICDF(25412), AOM_ICDF(26406), - AOM_ICDF(27249), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4649), AOM_ICDF(4947), AOM_ICDF(7128), AOM_ICDF(7432), - AOM_ICDF(9439), AOM_ICDF(9903), AOM_ICDF(21163), AOM_ICDF(21774), - AOM_ICDF(22056), AOM_ICDF(24426), AOM_ICDF(25403), AOM_ICDF(26324), - AOM_ICDF(27128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7208), AOM_ICDF(7375), AOM_ICDF(8779), AOM_ICDF(9683), - AOM_ICDF(10072), AOM_ICDF(10284), AOM_ICDF(10796), AOM_ICDF(19786), - AOM_ICDF(20152), AOM_ICDF(22955), AOM_ICDF(24246), AOM_ICDF(25165), - AOM_ICDF(26589), AOM_ICDF(32768), 0 }, - { AOM_ICDF(5897), AOM_ICDF(7283), AOM_ICDF(7555), AOM_ICDF(8910), - AOM_ICDF(9391), AOM_ICDF(9937), AOM_ICDF(10276), AOM_ICDF(11044), - AOM_ICDF(19841), AOM_ICDF(22620), AOM_ICDF(23784), AOM_ICDF(25060), - AOM_ICDF(26418), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12171), AOM_ICDF(12718), AOM_ICDF(13885), AOM_ICDF(14348), - AOM_ICDF(14925), AOM_ICDF(15394), AOM_ICDF(16108), AOM_ICDF(17075), - AOM_ICDF(17583), AOM_ICDF(21996), AOM_ICDF(23614), AOM_ICDF(25048), - AOM_ICDF(27011), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10192), AOM_ICDF(11222), AOM_ICDF(12318), AOM_ICDF(12877), - AOM_ICDF(13533), AOM_ICDF(14184), AOM_ICDF(14866), AOM_ICDF(15879), - AOM_ICDF(16650), AOM_ICDF(20419), AOM_ICDF(23265), AOM_ICDF(24295), - AOM_ICDF(26596), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10776), AOM_ICDF(11387), AOM_ICDF(12899), AOM_ICDF(13471), - AOM_ICDF(14088), AOM_ICDF(14575), AOM_ICDF(15366), AOM_ICDF(16456), - AOM_ICDF(17040), AOM_ICDF(20815), AOM_ICDF(22009), AOM_ICDF(24448), - AOM_ICDF(26492), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4015), AOM_ICDF(6473), AOM_ICDF(9853), AOM_ICDF(10285), - 
AOM_ICDF(10655), AOM_ICDF(11032), AOM_ICDF(11431), AOM_ICDF(12199), - AOM_ICDF(12738), AOM_ICDF(14760), AOM_ICDF(16121), AOM_ICDF(17263), - AOM_ICDF(28612), AOM_ICDF(32768), 0 }, - }; -#else -static const aom_cdf_prob - default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = { - { - AOM_ICDF(23552), AOM_ICDF(25936), AOM_ICDF(28623), AOM_ICDF(29033), - AOM_ICDF(29395), AOM_ICDF(29892), AOM_ICDF(30252), AOM_ICDF(30905), - AOM_ICDF(31370), AOM_ICDF(31980), AOM_ICDF(32293), AOM_ICDF(32660), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2944), AOM_ICDF(26431), AOM_ICDF(27553), AOM_ICDF(27746), - AOM_ICDF(28022), AOM_ICDF(29080), AOM_ICDF(29204), AOM_ICDF(29377), - AOM_ICDF(30264), AOM_ICDF(31206), AOM_ICDF(31613), AOM_ICDF(32418), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4352), AOM_ICDF(5120), AOM_ICDF(27952), AOM_ICDF(28117), - AOM_ICDF(28473), AOM_ICDF(28759), AOM_ICDF(29563), AOM_ICDF(30864), - AOM_ICDF(31051), AOM_ICDF(31694), AOM_ICDF(32073), AOM_ICDF(32435), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(17664), AOM_ICDF(20288), AOM_ICDF(21839), AOM_ICDF(26072), - AOM_ICDF(26420), AOM_ICDF(26972), AOM_ICDF(27240), AOM_ICDF(28565), - AOM_ICDF(30914), AOM_ICDF(31694), AOM_ICDF(32083), AOM_ICDF(32591), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(16640), AOM_ICDF(18390), AOM_ICDF(20233), AOM_ICDF(20557), - AOM_ICDF(25162), AOM_ICDF(27789), AOM_ICDF(29397), AOM_ICDF(29895), - AOM_ICDF(30369), AOM_ICDF(31497), AOM_ICDF(32025), AOM_ICDF(32642), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(13952), AOM_ICDF(17947), AOM_ICDF(18918), AOM_ICDF(19206), - AOM_ICDF(21131), AOM_ICDF(30668), AOM_ICDF(31061), AOM_ICDF(31317), - AOM_ICDF(31838), AOM_ICDF(32137), AOM_ICDF(32342), AOM_ICDF(32547), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(15872), AOM_ICDF(16990), AOM_ICDF(21479), AOM_ICDF(21732), - AOM_ICDF(24134), AOM_ICDF(24854), AOM_ICDF(30296), AOM_ICDF(30887), - AOM_ICDF(31163), AOM_ICDF(31902), AOM_ICDF(32218), AOM_ICDF(32702), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(16256), 
AOM_ICDF(17280), AOM_ICDF(23081), AOM_ICDF(24039), - AOM_ICDF(24457), AOM_ICDF(24838), AOM_ICDF(25346), AOM_ICDF(30329), - AOM_ICDF(30908), AOM_ICDF(31746), AOM_ICDF(32206), AOM_ICDF(32639), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(14720), AOM_ICDF(19249), AOM_ICDF(20501), AOM_ICDF(22079), - AOM_ICDF(22439), AOM_ICDF(23218), AOM_ICDF(23463), AOM_ICDF(24107), - AOM_ICDF(30308), AOM_ICDF(31379), AOM_ICDF(31866), AOM_ICDF(32556), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(16768), AOM_ICDF(19967), AOM_ICDF(22374), AOM_ICDF(22976), - AOM_ICDF(23836), AOM_ICDF(24050), AOM_ICDF(24642), AOM_ICDF(25760), - AOM_ICDF(26653), AOM_ICDF(29585), AOM_ICDF(30937), AOM_ICDF(32518), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(16768), AOM_ICDF(20751), AOM_ICDF(23026), AOM_ICDF(23591), - AOM_ICDF(24299), AOM_ICDF(24516), AOM_ICDF(24981), AOM_ICDF(25876), - AOM_ICDF(26806), AOM_ICDF(29520), AOM_ICDF(31286), AOM_ICDF(32455), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(17536), AOM_ICDF(20055), AOM_ICDF(22965), AOM_ICDF(23507), - AOM_ICDF(24210), AOM_ICDF(24398), AOM_ICDF(25098), AOM_ICDF(26366), - AOM_ICDF(27033), AOM_ICDF(29674), AOM_ICDF(30689), AOM_ICDF(32530), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(17536), AOM_ICDF(22753), AOM_ICDF(27126), AOM_ICDF(27353), - AOM_ICDF(27571), AOM_ICDF(28139), AOM_ICDF(28505), AOM_ICDF(29198), - AOM_ICDF(29886), AOM_ICDF(30801), AOM_ICDF(31335), AOM_ICDF(32054), - AOM_ICDF(32768), 0, - }, - }; -#endif // CONFIG_CFL -#else // !CONFIG_SMOOTH_HV -static const aom_cdf_prob - default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = { - { - AOM_ICDF(11264), AOM_ICDF(14965), AOM_ICDF(19742), AOM_ICDF(21904), - AOM_ICDF(24115), AOM_ICDF(25678), AOM_ICDF(27210), AOM_ICDF(28705), - AOM_ICDF(29782), AOM_ICDF(31424), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9600), AOM_ICDF(13747), AOM_ICDF(18569), AOM_ICDF(20091), - AOM_ICDF(21925), AOM_ICDF(23082), AOM_ICDF(24404), AOM_ICDF(26285), - AOM_ICDF(27574), AOM_ICDF(30415), AOM_ICDF(32768), 0, - }, - { - 
AOM_ICDF(9344), AOM_ICDF(14319), AOM_ICDF(19567), AOM_ICDF(20476), - AOM_ICDF(21791), AOM_ICDF(22529), AOM_ICDF(23393), AOM_ICDF(24881), - AOM_ICDF(26012), AOM_ICDF(30572), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12288), AOM_ICDF(15866), AOM_ICDF(21186), AOM_ICDF(21722), - AOM_ICDF(22209), AOM_ICDF(22564), AOM_ICDF(22966), AOM_ICDF(24025), - AOM_ICDF(24716), AOM_ICDF(30608), AOM_ICDF(32768), 0, - }, - }; - -static const aom_cdf_prob - default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = { - { - AOM_ICDF(25472), AOM_ICDF(27697), AOM_ICDF(30693), AOM_ICDF(30916), - AOM_ICDF(31125), AOM_ICDF(31406), AOM_ICDF(31679), AOM_ICDF(32085), - AOM_ICDF(32334), AOM_ICDF(32682), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2176), AOM_ICDF(28142), AOM_ICDF(29335), AOM_ICDF(29504), - AOM_ICDF(29762), AOM_ICDF(30642), AOM_ICDF(30741), AOM_ICDF(30902), - AOM_ICDF(31683), AOM_ICDF(32529), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3328), AOM_ICDF(3901), AOM_ICDF(30984), AOM_ICDF(31068), - AOM_ICDF(31241), AOM_ICDF(31389), AOM_ICDF(31697), AOM_ICDF(32379), - AOM_ICDF(32483), AOM_ICDF(32653), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(23424), AOM_ICDF(25805), AOM_ICDF(27721), AOM_ICDF(29432), - AOM_ICDF(29659), AOM_ICDF(30032), AOM_ICDF(30282), AOM_ICDF(31192), - AOM_ICDF(32259), AOM_ICDF(32658), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(22784), AOM_ICDF(24177), AOM_ICDF(26209), AOM_ICDF(26476), - AOM_ICDF(28679), AOM_ICDF(29698), AOM_ICDF(30786), AOM_ICDF(31257), - AOM_ICDF(31596), AOM_ICDF(32690), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(19712), AOM_ICDF(22988), AOM_ICDF(24275), AOM_ICDF(24520), - AOM_ICDF(25823), AOM_ICDF(31469), AOM_ICDF(31880), AOM_ICDF(32189), - AOM_ICDF(32614), AOM_ICDF(32615), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(21376), AOM_ICDF(22085), AOM_ICDF(27643), AOM_ICDF(27799), - AOM_ICDF(28596), AOM_ICDF(28890), AOM_ICDF(31767), AOM_ICDF(32255), - AOM_ICDF(32405), AOM_ICDF(32723), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(19712), AOM_ICDF(20623), AOM_ICDF(28408), 
AOM_ICDF(28766), - AOM_ICDF(29070), AOM_ICDF(29355), AOM_ICDF(29759), AOM_ICDF(32034), - AOM_ICDF(32306), AOM_ICDF(32666), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(19584), AOM_ICDF(23437), AOM_ICDF(25295), AOM_ICDF(26200), - AOM_ICDF(26612), AOM_ICDF(27372), AOM_ICDF(27704), AOM_ICDF(28319), - AOM_ICDF(31664), AOM_ICDF(32562), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(20864), AOM_ICDF(23989), AOM_ICDF(26168), AOM_ICDF(26591), - AOM_ICDF(27345), AOM_ICDF(27348), AOM_ICDF(27809), AOM_ICDF(28575), - AOM_ICDF(29132), AOM_ICDF(32628), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(23680), AOM_ICDF(27582), AOM_ICDF(30197), AOM_ICDF(30312), - AOM_ICDF(30464), AOM_ICDF(30995), AOM_ICDF(31208), AOM_ICDF(31574), - AOM_ICDF(31985), AOM_ICDF(32519), AOM_ICDF(32768), 0, - }, - }; -#endif // CONFIG_SMOOTH_HV - -#if CONFIG_EXT_PARTITION_TYPES -static const aom_cdf_prob - default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)] = { - // 8x8 -> 4x4 only supports the four legacy partition types - { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0, - 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0, - 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0, - 0, 0, 0, 0, 0, 0 }, - { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0, - 0, 0, 0, 0, 0, 0 }, - // 16x16 -> 8x8 - { AOM_ICDF(22272), AOM_ICDF(23768), AOM_ICDF(25043), AOM_ICDF(29996), - AOM_ICDF(30495), AOM_ICDF(30994), AOM_ICDF(31419), AOM_ICDF(31844), - AOM_ICDF(32343), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11776), AOM_ICDF(13457), AOM_ICDF(16315), AOM_ICDF(28229), - AOM_ICDF(28789), AOM_ICDF(29349), AOM_ICDF(30302), AOM_ICDF(31255), - AOM_ICDF(31816), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10496), AOM_ICDF(14802), AOM_ICDF(16136), AOM_ICDF(27127), - AOM_ICDF(28563), AOM_ICDF(29999), AOM_ICDF(30444), AOM_ICDF(30889), - AOM_ICDF(32324), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6784), AOM_ICDF(8763), 
AOM_ICDF(10440), AOM_ICDF(29110), - AOM_ICDF(29770), AOM_ICDF(30430), AOM_ICDF(30989), AOM_ICDF(31548), - AOM_ICDF(32208), AOM_ICDF(32768), 0 }, - // 32x32 -> 16x16 - { AOM_ICDF(22656), AOM_ICDF(23801), AOM_ICDF(24702), AOM_ICDF(30721), - AOM_ICDF(31103), AOM_ICDF(31485), AOM_ICDF(31785), AOM_ICDF(32085), - AOM_ICDF(32467), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8704), AOM_ICDF(9926), AOM_ICDF(12586), AOM_ICDF(28885), - AOM_ICDF(29292), AOM_ICDF(29699), AOM_ICDF(30586), AOM_ICDF(31473), - AOM_ICDF(31881), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6656), AOM_ICDF(10685), AOM_ICDF(11566), AOM_ICDF(27857), - AOM_ICDF(29200), AOM_ICDF(30543), AOM_ICDF(30837), AOM_ICDF(31131), - AOM_ICDF(32474), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2176), AOM_ICDF(3012), AOM_ICDF(3690), AOM_ICDF(31253), - AOM_ICDF(31532), AOM_ICDF(31811), AOM_ICDF(32037), AOM_ICDF(32263), - AOM_ICDF(32542), AOM_ICDF(32768), 0 }, - // 64x64 -> 32x32 - { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258), - AOM_ICDF(32354), AOM_ICDF(32450), AOM_ICDF(32523), AOM_ICDF(32596), - AOM_ICDF(32693), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134), - AOM_ICDF(30379), AOM_ICDF(30624), AOM_ICDF(31256), AOM_ICDF(31888), - AOM_ICDF(32134), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664), - AOM_ICDF(31192), AOM_ICDF(31720), AOM_ICDF(31893), AOM_ICDF(32066), - AOM_ICDF(32594), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978), - AOM_ICDF(32121), AOM_ICDF(32264), AOM_ICDF(32383), AOM_ICDF(32502), - AOM_ICDF(32647), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - // 128x128 -> 64x64 - { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258), - AOM_ICDF(32354), AOM_ICDF(32450), AOM_ICDF(32523), AOM_ICDF(32596), - AOM_ICDF(32693), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134), - AOM_ICDF(30379), AOM_ICDF(30624), AOM_ICDF(31256), 
AOM_ICDF(31888), - AOM_ICDF(32134), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664), - AOM_ICDF(31192), AOM_ICDF(31720), AOM_ICDF(31893), AOM_ICDF(32066), - AOM_ICDF(32594), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978), - AOM_ICDF(32121), AOM_ICDF(32264), AOM_ICDF(32383), AOM_ICDF(32502), - AOM_ICDF(32647), AOM_ICDF(32768), 0 }, -#endif - }; -#else -static const aom_cdf_prob - default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)] = { - { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0 }, - { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0 }, - { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0 }, - { AOM_ICDF(22272), AOM_ICDF(25265), AOM_ICDF(27815), AOM_ICDF(32768), 0 }, - { AOM_ICDF(11776), AOM_ICDF(15138), AOM_ICDF(20854), AOM_ICDF(32768), 0 }, - { AOM_ICDF(10496), AOM_ICDF(19109), AOM_ICDF(21777), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6784), AOM_ICDF(10743), AOM_ICDF(14098), AOM_ICDF(32768), 0 }, - { AOM_ICDF(22656), AOM_ICDF(24947), AOM_ICDF(26749), AOM_ICDF(32768), 0 }, - { AOM_ICDF(8704), AOM_ICDF(11148), AOM_ICDF(16469), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6656), AOM_ICDF(14714), AOM_ICDF(16477), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2176), AOM_ICDF(3849), AOM_ICDF(5205), AOM_ICDF(32768), 0 }, - { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 }, -#if CONFIG_EXT_PARTITION - { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 }, - { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 }, -#endif - }; -#endif - -#if CONFIG_EXT_TX -static const aom_cdf_prob default_intra_ext_tx_cdf - [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { - { -// FIXME: unused zero positions, from uncoded trivial transform set -#if CONFIG_CHROMA_2X2 - { - { 0 }, - }, -#endif - { - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, -#if CONFIG_SMOOTH_HV - { 0 }, - { 0 }, -#endif // CONFIG_SMOOTH_HV - { 0 }, - }, - { - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, -#if CONFIG_SMOOTH_HV - { 0 }, - { 0 }, -#endif // CONFIG_SMOOTH_HV - { 0 }, - }, - { - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, -#if CONFIG_SMOOTH_HV - { 0 }, - { 0 }, -#endif // CONFIG_SMOOTH_HV - { 0 }, - }, - { - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, - { 0 }, -#if CONFIG_SMOOTH_HV - { 0 }, - { 0 }, -#endif // CONFIG_SMOOTH_HV - { 0 }, - }, - }, - { - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), - AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), - AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860), - AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), - AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), - AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860), - AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), 
AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), - AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), - AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860), - AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), 
AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), - AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), - AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860), - AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691), - AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660), - AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }, - }, - }, - { - { - { AOM_ICDF(1024), AOM_ICDF(28800), 
AOM_ICDF(29792), - AOM_ICDF(31280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), - AOM_ICDF(30174), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), - AOM_ICDF(30846), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), - AOM_ICDF(31280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), - AOM_ICDF(30174), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), - AOM_ICDF(30846), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), - AOM_ICDF(31280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), - AOM_ICDF(30174), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), - AOM_ICDF(30846), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), - AOM_ICDF(31280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { 
AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), - AOM_ICDF(30174), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), - AOM_ICDF(30846), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), - AOM_ICDF(26611), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396), - AOM_ICDF(32768), 0 }, - }, - }, -#if CONFIG_MRC_TX - { - { - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), 
AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - }, - { - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1152), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#if CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, -#endif // CONFIG_SMOOTH_HV - { AOM_ICDF(1280), AOM_ICDF(32768), 0 }, - }, - } -#endif // CONFIG_MRC_TX - }; -static const aom_cdf_prob - default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( - TX_TYPES)] = { - { -#if CONFIG_CHROMA_2X2 - { 0 }, -#endif - { 0 }, - { 0 }, - { 0 }, - { 0 } }, - { -#if CONFIG_CHROMA_2X2 - { 0 }, -#endif - { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277), - AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717), - AOM_ICDF(19225), AOM_ICDF(21733), 
AOM_ICDF(24241), AOM_ICDF(26749), - AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277), - AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717), - AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749), - AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277), - AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717), - AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749), - AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277), - AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717), - AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749), - AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768), - 0 } }, - { -#if CONFIG_CHROMA_2X2 - { 0 }, -#endif - { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132), - AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904), - AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132), - AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904), - AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132), - AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904), - AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132), - AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904), - AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768), - 0 } }, - { -#if CONFIG_CHROMA_2X2 - { 0 }, -#endif - { AOM_ICDF(1536), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1536), AOM_ICDF(32768), 0 }, - 
{ AOM_ICDF(1536), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1536), AOM_ICDF(32768), 0 } }, -#if CONFIG_MRC_TX - { -#if CONFIG_CHROMA_2X2 - { 0 }, -#endif - { AOM_ICDF(30080), AOM_ICDF(31781), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4608), AOM_ICDF(32658), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4352), AOM_ICDF(4685), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19072), AOM_ICDF(26776), AOM_ICDF(32768), 0 } }, -#endif // CONFIG_MRC_TX - }; -#else -#if CONFIG_MRC_TX -static const aom_cdf_prob - default_intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)] = { -#if CONFIG_CHROMA_2X2 - { { AOM_ICDF(30720), AOM_ICDF(31104), AOM_ICDF(31400), AOM_ICDF(32084), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(31764), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(1280), AOM_ICDF(31760), AOM_ICDF(32264), - AOM_ICDF(32768), 0 } }, -#endif - { { AOM_ICDF(30720), AOM_ICDF(31104), AOM_ICDF(31400), AOM_ICDF(32084), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(31764), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(1280), AOM_ICDF(31760), AOM_ICDF(32264), - AOM_ICDF(32768), 0 } }, - - { { AOM_ICDF(31232), AOM_ICDF(31488), AOM_ICDF(31742), AOM_ICDF(32255), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(1152), AOM_ICDF(1272), AOM_ICDF(31784), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(1152), AOM_ICDF(1272), AOM_ICDF(2256), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(1052), AOM_ICDF(1272), AOM_ICDF(2256), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(1792), AOM_ICDF(31776), AOM_ICDF(32272), - AOM_ICDF(32768), 0 } }, - - { { AOM_ICDF(31744), 
AOM_ICDF(31940), AOM_ICDF(32084), AOM_ICDF(32426), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(31823), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(3473), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(3473), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(28160), AOM_ICDF(31808), AOM_ICDF(32288), - AOM_ICDF(32768), 0 } }, - }; - -static const aom_cdf_prob - default_inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)] = { -#if CONFIG_CHROMA_2X2 - { AOM_ICDF(20480), AOM_ICDF(23040), AOM_ICDF(24560), AOM_ICDF(28664), - AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(20480), AOM_ICDF(23040), AOM_ICDF(24560), AOM_ICDF(28664), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(22528), AOM_ICDF(24320), AOM_ICDF(25928), AOM_ICDF(29348), - AOM_ICDF(32768), 0 }, - { AOM_ICDF(24576), AOM_ICDF(25600), AOM_ICDF(27296), AOM_ICDF(30032), - AOM_ICDF(32768), 0 }, - }; -#else // CONFIG_MRC_TX -static const aom_cdf_prob - default_intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)] = { -#if CONFIG_CHROMA_2X2 - { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768), - 0 } }, -#endif - { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 }, - { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768), - 0 } }, - - { { AOM_ICDF(31232), AOM_ICDF(31742), AOM_ICDF(32255), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(31784), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(2256), AOM_ICDF(32768), 0 }, - { AOM_ICDF(1024), 
AOM_ICDF(31776), AOM_ICDF(32272), AOM_ICDF(32768), - 0 } }, - { { AOM_ICDF(31744), AOM_ICDF(32084), AOM_ICDF(32426), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(31823), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(3473), AOM_ICDF(32768), 0 }, - { AOM_ICDF(2048), AOM_ICDF(31808), AOM_ICDF(32288), AOM_ICDF(32768), - 0 } }, - }; - -static const aom_cdf_prob - default_inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)] = { -#if CONFIG_CHROMA_2X2 - { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 }, -#endif - { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 }, - { AOM_ICDF(22528), AOM_ICDF(25928), AOM_ICDF(29348), AOM_ICDF(32768), 0 }, - { AOM_ICDF(24576), AOM_ICDF(27296), AOM_ICDF(30032), AOM_ICDF(32768), 0 }, - }; -#endif // CONFIG_MRC_TX -#endif // !CONFIG_EXT_TX - -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -static const aom_cdf_prob - default_intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)] = { - { AOM_ICDF(12544), AOM_ICDF(17521), AOM_ICDF(21095), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12544), AOM_ICDF(19022), AOM_ICDF(23318), AOM_ICDF(32768), 0 }, - { AOM_ICDF(12032), AOM_ICDF(17297), AOM_ICDF(23522), AOM_ICDF(32768), 0 }, - { AOM_ICDF(6272), AOM_ICDF(8860), AOM_ICDF(11101), AOM_ICDF(32768), 0 }, - { AOM_ICDF(9216), AOM_ICDF(12712), AOM_ICDF(16629), AOM_ICDF(32768), 0 }, - }; -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - -#if CONFIG_CFL -static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { - AOM_ICDF(1892), AOM_ICDF(2229), AOM_ICDF(11464), - AOM_ICDF(14116), AOM_ICDF(25661), AOM_ICDF(26409), - AOM_ICDF(32508), AOM_ICDF(32768), 0 -}; - -static const aom_cdf_prob - default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { - { AOM_ICDF(16215), AOM_ICDF(27740), AOM_ICDF(31726), AOM_ICDF(32606), - AOM_ICDF(32736), AOM_ICDF(32751), AOM_ICDF(32757), AOM_ICDF(32759), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), 
AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15213), AOM_ICDF(24615), AOM_ICDF(29704), AOM_ICDF(31974), - AOM_ICDF(32545), AOM_ICDF(32673), AOM_ICDF(32713), AOM_ICDF(32746), - AOM_ICDF(32753), AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32761), - AOM_ICDF(32763), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768), 0 }, - { AOM_ICDF(13250), AOM_ICDF(24677), AOM_ICDF(29113), AOM_ICDF(31666), - AOM_ICDF(32408), AOM_ICDF(32578), AOM_ICDF(32628), AOM_ICDF(32711), - AOM_ICDF(32730), AOM_ICDF(32738), AOM_ICDF(32744), AOM_ICDF(32749), - AOM_ICDF(32752), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32768), 0 }, - { AOM_ICDF(24593), AOM_ICDF(30787), AOM_ICDF(32062), AOM_ICDF(32495), - AOM_ICDF(32656), AOM_ICDF(32707), AOM_ICDF(32735), AOM_ICDF(32747), - AOM_ICDF(32752), AOM_ICDF(32757), AOM_ICDF(32760), AOM_ICDF(32763), - AOM_ICDF(32764), AOM_ICDF(32765), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, - { AOM_ICDF(19883), AOM_ICDF(27419), AOM_ICDF(30100), AOM_ICDF(31392), - AOM_ICDF(31896), AOM_ICDF(32184), AOM_ICDF(32299), AOM_ICDF(32511), - AOM_ICDF(32568), AOM_ICDF(32602), AOM_ICDF(32628), AOM_ICDF(32664), - AOM_ICDF(32680), AOM_ICDF(32691), AOM_ICDF(32708), AOM_ICDF(32768), 0 }, - { AOM_ICDF(15939), AOM_ICDF(24151), AOM_ICDF(27754), AOM_ICDF(29680), - AOM_ICDF(30651), AOM_ICDF(31267), AOM_ICDF(31527), AOM_ICDF(31868), - AOM_ICDF(32001), AOM_ICDF(32090), AOM_ICDF(32181), AOM_ICDF(32284), - AOM_ICDF(32314), AOM_ICDF(32366), AOM_ICDF(32486), AOM_ICDF(32768), 0 } - }; -#endif - -#if CONFIG_KF_CTX -// TODO(jingning): This initial models are copied directly from the entries -// from the original table. The copied indexes are (0, 0), (0, 1), .. (4, 4). -// It is possible to re-train this model and bring back the 0.14% loss in CIF -// set key frame coding. This reduction in context model does not change the -// key frame coding stats for mid and high resolution sets. 
-const aom_cdf_prob - default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( - INTRA_MODES)] = { - { - { - AOM_ICDF(14208), AOM_ICDF(17049), AOM_ICDF(20482), - AOM_ICDF(21400), AOM_ICDF(22520), AOM_ICDF(23261), - AOM_ICDF(23963), AOM_ICDF(25010), AOM_ICDF(25828), - AOM_ICDF(28398), AOM_ICDF(29394), AOM_ICDF(30738), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(18295), AOM_ICDF(19872), - AOM_ICDF(20945), AOM_ICDF(21933), AOM_ICDF(22818), - AOM_ICDF(23334), AOM_ICDF(24033), AOM_ICDF(24996), - AOM_ICDF(27652), AOM_ICDF(29060), AOM_ICDF(30071), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(6461), AOM_ICDF(19840), AOM_ICDF(20310), - AOM_ICDF(21151), AOM_ICDF(21506), AOM_ICDF(22535), - AOM_ICDF(23900), AOM_ICDF(24281), AOM_ICDF(26958), - AOM_ICDF(27680), AOM_ICDF(29636), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12544), AOM_ICDF(15177), AOM_ICDF(17666), - AOM_ICDF(19855), AOM_ICDF(21147), AOM_ICDF(22017), - AOM_ICDF(22797), AOM_ICDF(24514), AOM_ICDF(25779), - AOM_ICDF(28716), AOM_ICDF(29772), AOM_ICDF(31267), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7552), AOM_ICDF(9909), AOM_ICDF(11908), AOM_ICDF(13141), - AOM_ICDF(18765), AOM_ICDF(22029), AOM_ICDF(23872), - AOM_ICDF(24920), AOM_ICDF(25674), AOM_ICDF(29031), - AOM_ICDF(30244), AOM_ICDF(31684), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(3968), AOM_ICDF(17613), AOM_ICDF(19125), AOM_ICDF(19550), - AOM_ICDF(20305), AOM_ICDF(21908), AOM_ICDF(22274), - AOM_ICDF(22719), AOM_ICDF(23959), AOM_ICDF(26970), - AOM_ICDF(29013), AOM_ICDF(29843), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3072), AOM_ICDF(21231), AOM_ICDF(21863), AOM_ICDF(22306), - AOM_ICDF(22674), AOM_ICDF(23414), AOM_ICDF(23517), - AOM_ICDF(23798), AOM_ICDF(24770), AOM_ICDF(27032), - AOM_ICDF(29016), AOM_ICDF(29636), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2560), AOM_ICDF(9825), AOM_ICDF(15681), AOM_ICDF(16370), - AOM_ICDF(17054), AOM_ICDF(17687), AOM_ICDF(18236), - AOM_ICDF(19273), AOM_ICDF(20311), AOM_ICDF(24863), - 
AOM_ICDF(26825), AOM_ICDF(28756), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(15140), AOM_ICDF(16485), AOM_ICDF(18364), - AOM_ICDF(19181), AOM_ICDF(20394), AOM_ICDF(20663), - AOM_ICDF(22098), AOM_ICDF(23936), AOM_ICDF(27555), - AOM_ICDF(29704), AOM_ICDF(30849), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2944), AOM_ICDF(13101), AOM_ICDF(14006), AOM_ICDF(14974), - AOM_ICDF(17818), AOM_ICDF(21093), AOM_ICDF(21930), - AOM_ICDF(22566), AOM_ICDF(24137), AOM_ICDF(27732), - AOM_ICDF(29814), AOM_ICDF(30904), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(11392), AOM_ICDF(12961), AOM_ICDF(20901), - AOM_ICDF(21544), AOM_ICDF(22490), AOM_ICDF(22928), - AOM_ICDF(23888), AOM_ICDF(25214), AOM_ICDF(25777), - AOM_ICDF(28256), AOM_ICDF(29102), AOM_ICDF(30513), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8064), AOM_ICDF(13595), AOM_ICDF(18888), AOM_ICDF(19616), - AOM_ICDF(20765), AOM_ICDF(21454), AOM_ICDF(21990), - AOM_ICDF(23103), AOM_ICDF(23980), AOM_ICDF(26772), - AOM_ICDF(28070), AOM_ICDF(29197), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4352), AOM_ICDF(5059), AOM_ICDF(21705), AOM_ICDF(22099), - AOM_ICDF(22703), AOM_ICDF(22846), AOM_ICDF(23679), - AOM_ICDF(25469), AOM_ICDF(25728), AOM_ICDF(27919), - AOM_ICDF(28484), AOM_ICDF(30215), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10752), AOM_ICDF(12277), AOM_ICDF(16471), - AOM_ICDF(18276), AOM_ICDF(19443), AOM_ICDF(19917), - AOM_ICDF(21158), AOM_ICDF(23881), AOM_ICDF(24892), - AOM_ICDF(27709), AOM_ICDF(28771), AOM_ICDF(30274), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8320), AOM_ICDF(10000), AOM_ICDF(14147), AOM_ICDF(15330), - AOM_ICDF(19197), AOM_ICDF(20923), AOM_ICDF(22954), - AOM_ICDF(24541), AOM_ICDF(25285), AOM_ICDF(28407), - AOM_ICDF(29431), AOM_ICDF(30953), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(10240), AOM_ICDF(12819), AOM_ICDF(15545), - AOM_ICDF(18248), AOM_ICDF(19779), AOM_ICDF(20932), - AOM_ICDF(21899), AOM_ICDF(23377), AOM_ICDF(25448), - AOM_ICDF(28730), AOM_ICDF(29936), AOM_ICDF(31536), - AOM_ICDF(32768), 0, 
- }, - { - AOM_ICDF(7552), AOM_ICDF(15309), AOM_ICDF(16645), AOM_ICDF(19760), - AOM_ICDF(20653), AOM_ICDF(21650), AOM_ICDF(22221), - AOM_ICDF(23273), AOM_ICDF(25509), AOM_ICDF(28683), - AOM_ICDF(30153), AOM_ICDF(31192), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5248), AOM_ICDF(6840), AOM_ICDF(16129), AOM_ICDF(17940), - AOM_ICDF(19069), AOM_ICDF(19660), AOM_ICDF(20588), - AOM_ICDF(22760), AOM_ICDF(23927), AOM_ICDF(27538), - AOM_ICDF(28397), AOM_ICDF(30725), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11008), AOM_ICDF(11903), AOM_ICDF(13794), - AOM_ICDF(21320), AOM_ICDF(21931), AOM_ICDF(22310), - AOM_ICDF(22546), AOM_ICDF(25375), AOM_ICDF(27347), - AOM_ICDF(29800), AOM_ICDF(30761), AOM_ICDF(31833), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(8678), AOM_ICDF(10313), AOM_ICDF(13073), - AOM_ICDF(16823), AOM_ICDF(19980), AOM_ICDF(21520), - AOM_ICDF(23242), AOM_ICDF(25344), AOM_ICDF(28797), - AOM_ICDF(30405), AOM_ICDF(31940), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(7296), AOM_ICDF(9304), AOM_ICDF(11772), AOM_ICDF(12529), - AOM_ICDF(18014), AOM_ICDF(20418), AOM_ICDF(23076), - AOM_ICDF(24662), AOM_ICDF(25549), AOM_ICDF(29074), - AOM_ICDF(30392), AOM_ICDF(31773), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(11687), AOM_ICDF(13541), AOM_ICDF(14431), - AOM_ICDF(18214), AOM_ICDF(20761), AOM_ICDF(22269), - AOM_ICDF(23320), AOM_ICDF(24633), AOM_ICDF(28339), - AOM_ICDF(30193), AOM_ICDF(31268), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3584), AOM_ICDF(4428), AOM_ICDF(13496), AOM_ICDF(14189), - AOM_ICDF(17372), AOM_ICDF(18617), AOM_ICDF(20609), - AOM_ICDF(22615), AOM_ICDF(23270), AOM_ICDF(27280), - AOM_ICDF(28305), AOM_ICDF(30602), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(8834), AOM_ICDF(10499), AOM_ICDF(14357), - AOM_ICDF(17671), AOM_ICDF(19150), AOM_ICDF(20460), - AOM_ICDF(23235), AOM_ICDF(24391), AOM_ICDF(28351), - AOM_ICDF(29843), AOM_ICDF(31481), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(5888), AOM_ICDF(7093), 
AOM_ICDF(7902), - AOM_ICDF(18290), AOM_ICDF(22123), AOM_ICDF(24511), - AOM_ICDF(25532), AOM_ICDF(26360), AOM_ICDF(29653), - AOM_ICDF(30954), AOM_ICDF(32215), AOM_ICDF(32768), 0, - }, - }, - }; -#else -const aom_cdf_prob default_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE( - INTRA_MODES)] = { -#if CONFIG_SMOOTH_HV - { - { - AOM_ICDF(14208), AOM_ICDF(17049), AOM_ICDF(20482), AOM_ICDF(21400), - AOM_ICDF(22520), AOM_ICDF(23261), AOM_ICDF(23963), AOM_ICDF(25010), - AOM_ICDF(25828), AOM_ICDF(28398), AOM_ICDF(29394), AOM_ICDF(30738), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(18295), AOM_ICDF(19872), AOM_ICDF(20945), - AOM_ICDF(21933), AOM_ICDF(22818), AOM_ICDF(23334), AOM_ICDF(24033), - AOM_ICDF(24996), AOM_ICDF(27652), AOM_ICDF(29060), AOM_ICDF(30071), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(6461), AOM_ICDF(19840), AOM_ICDF(20310), - AOM_ICDF(21151), AOM_ICDF(21506), AOM_ICDF(22535), AOM_ICDF(23900), - AOM_ICDF(24281), AOM_ICDF(26958), AOM_ICDF(27680), AOM_ICDF(29636), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12544), AOM_ICDF(15177), AOM_ICDF(17666), AOM_ICDF(19855), - AOM_ICDF(21147), AOM_ICDF(22017), AOM_ICDF(22797), AOM_ICDF(24514), - AOM_ICDF(25779), AOM_ICDF(28716), AOM_ICDF(29772), AOM_ICDF(31267), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7552), AOM_ICDF(9909), AOM_ICDF(11908), AOM_ICDF(13141), - AOM_ICDF(18765), AOM_ICDF(22029), AOM_ICDF(23872), AOM_ICDF(24920), - AOM_ICDF(25674), AOM_ICDF(29031), AOM_ICDF(30244), AOM_ICDF(31684), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11008), AOM_ICDF(15004), AOM_ICDF(16534), AOM_ICDF(18158), - AOM_ICDF(21515), AOM_ICDF(26668), AOM_ICDF(27834), AOM_ICDF(28735), - AOM_ICDF(30471), AOM_ICDF(30839), AOM_ICDF(30969), AOM_ICDF(31068), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(7963), AOM_ICDF(11944), AOM_ICDF(12780), - AOM_ICDF(17944), AOM_ICDF(19198), AOM_ICDF(24071), AOM_ICDF(25295), - AOM_ICDF(25834), AOM_ICDF(29014), AOM_ICDF(29949), AOM_ICDF(31733), - AOM_ICDF(32768), 0, 
- }, - { - AOM_ICDF(8192), AOM_ICDF(10189), AOM_ICDF(14596), AOM_ICDF(15680), - AOM_ICDF(17143), AOM_ICDF(17909), AOM_ICDF(19201), AOM_ICDF(23711), - AOM_ICDF(24503), AOM_ICDF(28207), AOM_ICDF(29338), AOM_ICDF(31424), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10752), AOM_ICDF(13199), AOM_ICDF(15048), AOM_ICDF(17151), - AOM_ICDF(18445), AOM_ICDF(19604), AOM_ICDF(20363), AOM_ICDF(21782), - AOM_ICDF(24311), AOM_ICDF(28026), AOM_ICDF(29517), AOM_ICDF(30962), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(10301), AOM_ICDF(13245), AOM_ICDF(14307), - AOM_ICDF(16021), AOM_ICDF(16257), AOM_ICDF(17265), AOM_ICDF(18739), - AOM_ICDF(20080), AOM_ICDF(26066), AOM_ICDF(28325), AOM_ICDF(31184), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6528), AOM_ICDF(10893), AOM_ICDF(13773), AOM_ICDF(14824), - AOM_ICDF(16540), AOM_ICDF(16926), AOM_ICDF(17748), AOM_ICDF(18876), - AOM_ICDF(20396), AOM_ICDF(25974), AOM_ICDF(28795), AOM_ICDF(30820), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8704), AOM_ICDF(11005), AOM_ICDF(14320), AOM_ICDF(15349), - AOM_ICDF(16746), AOM_ICDF(16884), AOM_ICDF(17887), AOM_ICDF(19304), - AOM_ICDF(20265), AOM_ICDF(26115), AOM_ICDF(27672), AOM_ICDF(31358), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(9504), AOM_ICDF(15437), AOM_ICDF(16399), - AOM_ICDF(17355), AOM_ICDF(17948), AOM_ICDF(18814), AOM_ICDF(20270), - AOM_ICDF(21134), AOM_ICDF(23690), AOM_ICDF(24759), AOM_ICDF(26454), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(3968), AOM_ICDF(17613), AOM_ICDF(19125), AOM_ICDF(19550), - AOM_ICDF(20305), AOM_ICDF(21908), AOM_ICDF(22274), AOM_ICDF(22719), - AOM_ICDF(23959), AOM_ICDF(26970), AOM_ICDF(29013), AOM_ICDF(29843), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3072), AOM_ICDF(21231), AOM_ICDF(21863), AOM_ICDF(22306), - AOM_ICDF(22674), AOM_ICDF(23414), AOM_ICDF(23517), AOM_ICDF(23798), - AOM_ICDF(24770), AOM_ICDF(27032), AOM_ICDF(29016), AOM_ICDF(29636), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2560), AOM_ICDF(9825), AOM_ICDF(15681), 
AOM_ICDF(16370), - AOM_ICDF(17054), AOM_ICDF(17687), AOM_ICDF(18236), AOM_ICDF(19273), - AOM_ICDF(20311), AOM_ICDF(24863), AOM_ICDF(26825), AOM_ICDF(28756), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(15140), AOM_ICDF(16485), AOM_ICDF(18364), - AOM_ICDF(19181), AOM_ICDF(20394), AOM_ICDF(20663), AOM_ICDF(22098), - AOM_ICDF(23936), AOM_ICDF(27555), AOM_ICDF(29704), AOM_ICDF(30849), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2944), AOM_ICDF(13101), AOM_ICDF(14006), AOM_ICDF(14974), - AOM_ICDF(17818), AOM_ICDF(21093), AOM_ICDF(21930), AOM_ICDF(22566), - AOM_ICDF(24137), AOM_ICDF(27732), AOM_ICDF(29814), AOM_ICDF(30904), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4352), AOM_ICDF(17824), AOM_ICDF(18715), AOM_ICDF(19632), - AOM_ICDF(21519), AOM_ICDF(26341), AOM_ICDF(26922), AOM_ICDF(27575), - AOM_ICDF(29863), AOM_ICDF(30432), AOM_ICDF(30769), AOM_ICDF(30881), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2944), AOM_ICDF(11971), AOM_ICDF(13509), AOM_ICDF(14295), - AOM_ICDF(17202), AOM_ICDF(19005), AOM_ICDF(21605), AOM_ICDF(22458), - AOM_ICDF(23839), AOM_ICDF(27774), AOM_ICDF(29492), AOM_ICDF(30787), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4224), AOM_ICDF(13072), AOM_ICDF(15288), AOM_ICDF(16406), - AOM_ICDF(17285), AOM_ICDF(18362), AOM_ICDF(19003), AOM_ICDF(21378), - AOM_ICDF(22942), AOM_ICDF(27093), AOM_ICDF(29381), AOM_ICDF(30872), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(15375), AOM_ICDF(16188), AOM_ICDF(17415), - AOM_ICDF(18183), AOM_ICDF(19756), AOM_ICDF(20030), AOM_ICDF(20883), - AOM_ICDF(23935), AOM_ICDF(27428), AOM_ICDF(29627), AOM_ICDF(30608), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2816), AOM_ICDF(14999), AOM_ICDF(16352), AOM_ICDF(16969), - AOM_ICDF(17836), AOM_ICDF(18125), AOM_ICDF(18514), AOM_ICDF(19181), - AOM_ICDF(20650), AOM_ICDF(25773), AOM_ICDF(29172), AOM_ICDF(30662), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2560), AOM_ICDF(16158), AOM_ICDF(17320), AOM_ICDF(17839), - AOM_ICDF(18545), AOM_ICDF(18848), AOM_ICDF(19130), 
AOM_ICDF(19599), - AOM_ICDF(20863), AOM_ICDF(25449), AOM_ICDF(29304), AOM_ICDF(30408), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3328), AOM_ICDF(15146), AOM_ICDF(16880), AOM_ICDF(17523), - AOM_ICDF(18340), AOM_ICDF(18563), AOM_ICDF(18896), AOM_ICDF(19582), - AOM_ICDF(20944), AOM_ICDF(25914), AOM_ICDF(28759), AOM_ICDF(30583), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2560), AOM_ICDF(16618), AOM_ICDF(18460), AOM_ICDF(19207), - AOM_ICDF(19654), AOM_ICDF(20276), AOM_ICDF(20529), AOM_ICDF(21179), - AOM_ICDF(22355), AOM_ICDF(25423), AOM_ICDF(27696), AOM_ICDF(28638), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(11392), AOM_ICDF(12961), AOM_ICDF(20901), AOM_ICDF(21544), - AOM_ICDF(22490), AOM_ICDF(22928), AOM_ICDF(23888), AOM_ICDF(25214), - AOM_ICDF(25777), AOM_ICDF(28256), AOM_ICDF(29102), AOM_ICDF(30513), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8064), AOM_ICDF(13595), AOM_ICDF(18888), AOM_ICDF(19616), - AOM_ICDF(20765), AOM_ICDF(21454), AOM_ICDF(21990), AOM_ICDF(23103), - AOM_ICDF(23980), AOM_ICDF(26772), AOM_ICDF(28070), AOM_ICDF(29197), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4352), AOM_ICDF(5059), AOM_ICDF(21705), AOM_ICDF(22099), - AOM_ICDF(22703), AOM_ICDF(22846), AOM_ICDF(23679), AOM_ICDF(25469), - AOM_ICDF(25728), AOM_ICDF(27919), AOM_ICDF(28484), AOM_ICDF(30215), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10752), AOM_ICDF(12277), AOM_ICDF(16471), AOM_ICDF(18276), - AOM_ICDF(19443), AOM_ICDF(19917), AOM_ICDF(21158), AOM_ICDF(23881), - AOM_ICDF(24892), AOM_ICDF(27709), AOM_ICDF(28771), AOM_ICDF(30274), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8320), AOM_ICDF(10000), AOM_ICDF(14147), AOM_ICDF(15330), - AOM_ICDF(19197), AOM_ICDF(20923), AOM_ICDF(22954), AOM_ICDF(24541), - AOM_ICDF(25285), AOM_ICDF(28407), AOM_ICDF(29431), AOM_ICDF(30953), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11264), AOM_ICDF(14751), AOM_ICDF(18517), AOM_ICDF(20285), - AOM_ICDF(23172), AOM_ICDF(25970), AOM_ICDF(27312), AOM_ICDF(28684), - AOM_ICDF(29803), AOM_ICDF(30242), 
AOM_ICDF(30412), AOM_ICDF(30668), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6528), AOM_ICDF(7509), AOM_ICDF(14190), AOM_ICDF(14953), - AOM_ICDF(17905), AOM_ICDF(18452), AOM_ICDF(23074), AOM_ICDF(24910), - AOM_ICDF(25374), AOM_ICDF(28605), AOM_ICDF(29542), AOM_ICDF(31640), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(7644), AOM_ICDF(15953), AOM_ICDF(17055), - AOM_ICDF(17945), AOM_ICDF(18242), AOM_ICDF(19351), AOM_ICDF(24705), - AOM_ICDF(25365), AOM_ICDF(28466), AOM_ICDF(29334), AOM_ICDF(31245), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8192), AOM_ICDF(9802), AOM_ICDF(14519), AOM_ICDF(15740), - AOM_ICDF(17351), AOM_ICDF(18084), AOM_ICDF(18962), AOM_ICDF(20908), - AOM_ICDF(22937), AOM_ICDF(26847), AOM_ICDF(28284), AOM_ICDF(29888), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5888), AOM_ICDF(7534), AOM_ICDF(14635), AOM_ICDF(15436), - AOM_ICDF(16710), AOM_ICDF(16830), AOM_ICDF(18000), AOM_ICDF(19760), - AOM_ICDF(20571), AOM_ICDF(25777), AOM_ICDF(27649), AOM_ICDF(30668), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5248), AOM_ICDF(7364), AOM_ICDF(14858), AOM_ICDF(15545), - AOM_ICDF(16861), AOM_ICDF(17016), AOM_ICDF(17859), AOM_ICDF(19384), - AOM_ICDF(20237), AOM_ICDF(25239), AOM_ICDF(27715), AOM_ICDF(29865), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6656), AOM_ICDF(7989), AOM_ICDF(15472), AOM_ICDF(16265), - AOM_ICDF(17271), AOM_ICDF(17334), AOM_ICDF(18563), AOM_ICDF(20327), - AOM_ICDF(20916), AOM_ICDF(26173), AOM_ICDF(27350), AOM_ICDF(31034), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(6411), AOM_ICDF(17828), AOM_ICDF(18611), - AOM_ICDF(19399), AOM_ICDF(19684), AOM_ICDF(20504), AOM_ICDF(21782), - AOM_ICDF(22335), AOM_ICDF(25286), AOM_ICDF(26352), AOM_ICDF(28016), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(10240), AOM_ICDF(12819), AOM_ICDF(15545), AOM_ICDF(18248), - AOM_ICDF(19779), AOM_ICDF(20932), AOM_ICDF(21899), AOM_ICDF(23377), - AOM_ICDF(25448), AOM_ICDF(28730), AOM_ICDF(29936), AOM_ICDF(31536), - AOM_ICDF(32768), 0, - }, - { - 
AOM_ICDF(7552), AOM_ICDF(15309), AOM_ICDF(16645), AOM_ICDF(19760), - AOM_ICDF(20653), AOM_ICDF(21650), AOM_ICDF(22221), AOM_ICDF(23273), - AOM_ICDF(25509), AOM_ICDF(28683), AOM_ICDF(30153), AOM_ICDF(31192), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5248), AOM_ICDF(6840), AOM_ICDF(16129), AOM_ICDF(17940), - AOM_ICDF(19069), AOM_ICDF(19660), AOM_ICDF(20588), AOM_ICDF(22760), - AOM_ICDF(23927), AOM_ICDF(27538), AOM_ICDF(28397), AOM_ICDF(30725), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11008), AOM_ICDF(11903), AOM_ICDF(13794), AOM_ICDF(21320), - AOM_ICDF(21931), AOM_ICDF(22310), AOM_ICDF(22546), AOM_ICDF(25375), - AOM_ICDF(27347), AOM_ICDF(29800), AOM_ICDF(30761), AOM_ICDF(31833), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(8678), AOM_ICDF(10313), AOM_ICDF(13073), - AOM_ICDF(16823), AOM_ICDF(19980), AOM_ICDF(21520), AOM_ICDF(23242), - AOM_ICDF(25344), AOM_ICDF(28797), AOM_ICDF(30405), AOM_ICDF(31940), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(10835), AOM_ICDF(12653), AOM_ICDF(16345), - AOM_ICDF(19574), AOM_ICDF(24868), AOM_ICDF(25937), AOM_ICDF(27299), - AOM_ICDF(31104), AOM_ICDF(31448), AOM_ICDF(31580), AOM_ICDF(31679), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4992), AOM_ICDF(6458), AOM_ICDF(9945), AOM_ICDF(11961), - AOM_ICDF(16136), AOM_ICDF(17677), AOM_ICDF(20946), AOM_ICDF(23042), - AOM_ICDF(24475), AOM_ICDF(28304), AOM_ICDF(29748), AOM_ICDF(31791), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9600), AOM_ICDF(11879), AOM_ICDF(14703), AOM_ICDF(17653), - AOM_ICDF(19176), AOM_ICDF(20185), AOM_ICDF(20880), AOM_ICDF(25194), - AOM_ICDF(26584), AOM_ICDF(29655), AOM_ICDF(30430), AOM_ICDF(32044), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9856), AOM_ICDF(11385), AOM_ICDF(13457), AOM_ICDF(18705), - AOM_ICDF(19577), AOM_ICDF(20266), AOM_ICDF(20746), AOM_ICDF(22207), - AOM_ICDF(26724), AOM_ICDF(29431), AOM_ICDF(30645), AOM_ICDF(31604), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(9318), AOM_ICDF(11569), AOM_ICDF(14812), - 
AOM_ICDF(16351), AOM_ICDF(16619), AOM_ICDF(17537), AOM_ICDF(19596), - AOM_ICDF(22025), AOM_ICDF(27384), AOM_ICDF(29277), AOM_ICDF(31422), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5888), AOM_ICDF(9348), AOM_ICDF(11416), AOM_ICDF(14690), - AOM_ICDF(16254), AOM_ICDF(16633), AOM_ICDF(17457), AOM_ICDF(19031), - AOM_ICDF(21875), AOM_ICDF(27080), AOM_ICDF(29442), AOM_ICDF(31193), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(9329), AOM_ICDF(12218), AOM_ICDF(15177), - AOM_ICDF(16806), AOM_ICDF(16998), AOM_ICDF(17991), AOM_ICDF(20005), - AOM_ICDF(21952), AOM_ICDF(27108), AOM_ICDF(28867), AOM_ICDF(31657), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(9098), AOM_ICDF(13132), AOM_ICDF(17701), - AOM_ICDF(18739), AOM_ICDF(19534), AOM_ICDF(20415), AOM_ICDF(22136), - AOM_ICDF(24213), AOM_ICDF(27199), AOM_ICDF(28504), AOM_ICDF(29960), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(7296), AOM_ICDF(9304), AOM_ICDF(11772), AOM_ICDF(12529), - AOM_ICDF(18014), AOM_ICDF(20418), AOM_ICDF(23076), AOM_ICDF(24662), - AOM_ICDF(25549), AOM_ICDF(29074), AOM_ICDF(30392), AOM_ICDF(31773), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(11687), AOM_ICDF(13541), AOM_ICDF(14431), - AOM_ICDF(18214), AOM_ICDF(20761), AOM_ICDF(22269), AOM_ICDF(23320), - AOM_ICDF(24633), AOM_ICDF(28339), AOM_ICDF(30193), AOM_ICDF(31268), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3584), AOM_ICDF(4428), AOM_ICDF(13496), AOM_ICDF(14189), - AOM_ICDF(17372), AOM_ICDF(18617), AOM_ICDF(20609), AOM_ICDF(22615), - AOM_ICDF(23270), AOM_ICDF(27280), AOM_ICDF(28305), AOM_ICDF(30602), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(8834), AOM_ICDF(10499), AOM_ICDF(14357), - AOM_ICDF(17671), AOM_ICDF(19150), AOM_ICDF(20460), AOM_ICDF(23235), - AOM_ICDF(24391), AOM_ICDF(28351), AOM_ICDF(29843), AOM_ICDF(31481), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(5888), AOM_ICDF(7093), AOM_ICDF(7902), - AOM_ICDF(18290), AOM_ICDF(22123), AOM_ICDF(24511), AOM_ICDF(25532), - 
AOM_ICDF(26360), AOM_ICDF(29653), AOM_ICDF(30954), AOM_ICDF(32215), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7296), AOM_ICDF(10176), AOM_ICDF(11780), AOM_ICDF(12824), - AOM_ICDF(19608), AOM_ICDF(25882), AOM_ICDF(28520), AOM_ICDF(29445), - AOM_ICDF(31106), AOM_ICDF(31573), AOM_ICDF(31775), AOM_ICDF(31872), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(4833), AOM_ICDF(7551), AOM_ICDF(8449), - AOM_ICDF(16668), AOM_ICDF(18614), AOM_ICDF(23952), AOM_ICDF(25668), - AOM_ICDF(26721), AOM_ICDF(29888), AOM_ICDF(30697), AOM_ICDF(32090), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6528), AOM_ICDF(8011), AOM_ICDF(11083), AOM_ICDF(12427), - AOM_ICDF(16188), AOM_ICDF(17548), AOM_ICDF(19625), AOM_ICDF(23787), - AOM_ICDF(24792), AOM_ICDF(28649), AOM_ICDF(29872), AOM_ICDF(31845), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(9170), AOM_ICDF(10655), AOM_ICDF(12439), - AOM_ICDF(15550), AOM_ICDF(18128), AOM_ICDF(19565), AOM_ICDF(21412), - AOM_ICDF(23355), AOM_ICDF(28007), AOM_ICDF(30080), AOM_ICDF(31568), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5504), AOM_ICDF(7636), AOM_ICDF(10174), AOM_ICDF(11056), - AOM_ICDF(15562), AOM_ICDF(16252), AOM_ICDF(17931), AOM_ICDF(19598), - AOM_ICDF(20967), AOM_ICDF(26845), AOM_ICDF(29149), AOM_ICDF(31490), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5248), AOM_ICDF(7810), AOM_ICDF(10004), AOM_ICDF(11015), - AOM_ICDF(15359), AOM_ICDF(16310), AOM_ICDF(17834), AOM_ICDF(19185), - AOM_ICDF(20903), AOM_ICDF(26728), AOM_ICDF(29585), AOM_ICDF(31478), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(7322), AOM_ICDF(10592), AOM_ICDF(11694), - AOM_ICDF(15586), AOM_ICDF(16103), AOM_ICDF(17999), AOM_ICDF(19740), - AOM_ICDF(20950), AOM_ICDF(26894), AOM_ICDF(28912), AOM_ICDF(31591), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4608), AOM_ICDF(7276), AOM_ICDF(12153), AOM_ICDF(13388), - AOM_ICDF(16091), AOM_ICDF(17970), AOM_ICDF(19548), AOM_ICDF(21175), - AOM_ICDF(22481), AOM_ICDF(26543), AOM_ICDF(28212), AOM_ICDF(29908), - AOM_ICDF(32768), 
0, - }, - }, - { - { - AOM_ICDF(6656), AOM_ICDF(12225), AOM_ICDF(14441), AOM_ICDF(15158), - AOM_ICDF(19600), AOM_ICDF(27127), AOM_ICDF(28221), AOM_ICDF(29186), - AOM_ICDF(30439), AOM_ICDF(30913), AOM_ICDF(31135), AOM_ICDF(31238), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6400), AOM_ICDF(14608), AOM_ICDF(15920), AOM_ICDF(16643), - AOM_ICDF(20149), AOM_ICDF(27328), AOM_ICDF(27896), AOM_ICDF(28672), - AOM_ICDF(30227), AOM_ICDF(30778), AOM_ICDF(31053), AOM_ICDF(31120), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(6925), AOM_ICDF(14671), AOM_ICDF(15709), - AOM_ICDF(19830), AOM_ICDF(24216), AOM_ICDF(25507), AOM_ICDF(27459), - AOM_ICDF(28552), AOM_ICDF(29569), AOM_ICDF(29808), AOM_ICDF(30169), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9600), AOM_ICDF(13604), AOM_ICDF(15202), AOM_ICDF(17530), - AOM_ICDF(20878), AOM_ICDF(24279), AOM_ICDF(25278), AOM_ICDF(28255), - AOM_ICDF(30651), AOM_ICDF(31170), AOM_ICDF(31343), AOM_ICDF(31410), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4608), AOM_ICDF(8535), AOM_ICDF(9588), AOM_ICDF(10740), - AOM_ICDF(18673), AOM_ICDF(27664), AOM_ICDF(28826), AOM_ICDF(29828), - AOM_ICDF(31081), AOM_ICDF(31503), AOM_ICDF(31680), AOM_ICDF(31778), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4864), AOM_ICDF(10543), AOM_ICDF(11313), AOM_ICDF(12197), - AOM_ICDF(16785), AOM_ICDF(27858), AOM_ICDF(28556), AOM_ICDF(29480), - AOM_ICDF(30892), AOM_ICDF(31486), AOM_ICDF(31722), AOM_ICDF(31787), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3968), AOM_ICDF(7492), AOM_ICDF(10283), AOM_ICDF(11318), - AOM_ICDF(18486), AOM_ICDF(24061), AOM_ICDF(26761), AOM_ICDF(28456), - AOM_ICDF(30126), AOM_ICDF(30872), AOM_ICDF(31088), AOM_ICDF(31305), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(10246), AOM_ICDF(12999), AOM_ICDF(15083), - AOM_ICDF(18769), AOM_ICDF(22398), AOM_ICDF(23584), AOM_ICDF(27098), - AOM_ICDF(29574), AOM_ICDF(30609), AOM_ICDF(30898), AOM_ICDF(31200), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7808), AOM_ICDF(13404), AOM_ICDF(14723), 
AOM_ICDF(16413), - AOM_ICDF(20186), AOM_ICDF(24739), AOM_ICDF(25407), AOM_ICDF(27106), - AOM_ICDF(29929), AOM_ICDF(30507), AOM_ICDF(30827), AOM_ICDF(30915), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2816), AOM_ICDF(6530), AOM_ICDF(8123), AOM_ICDF(9240), - AOM_ICDF(12536), AOM_ICDF(17593), AOM_ICDF(18754), AOM_ICDF(20319), - AOM_ICDF(22070), AOM_ICDF(27037), AOM_ICDF(29332), AOM_ICDF(30779), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2432), AOM_ICDF(6577), AOM_ICDF(8010), AOM_ICDF(9215), - AOM_ICDF(12657), AOM_ICDF(18898), AOM_ICDF(19588), AOM_ICDF(20953), - AOM_ICDF(22766), AOM_ICDF(27231), AOM_ICDF(29927), AOM_ICDF(31109), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3200), AOM_ICDF(6974), AOM_ICDF(9162), AOM_ICDF(10450), - AOM_ICDF(13818), AOM_ICDF(17757), AOM_ICDF(19119), AOM_ICDF(20842), - AOM_ICDF(22269), AOM_ICDF(27170), AOM_ICDF(29271), AOM_ICDF(30804), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(10689), AOM_ICDF(15307), AOM_ICDF(16589), - AOM_ICDF(19738), AOM_ICDF(24416), AOM_ICDF(25332), AOM_ICDF(26541), - AOM_ICDF(28634), AOM_ICDF(29711), AOM_ICDF(29913), AOM_ICDF(30116), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(9600), AOM_ICDF(11066), AOM_ICDF(15832), AOM_ICDF(16515), - AOM_ICDF(18844), AOM_ICDF(19883), AOM_ICDF(24302), AOM_ICDF(25759), - AOM_ICDF(26358), AOM_ICDF(29290), AOM_ICDF(30262), AOM_ICDF(31682), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8832), AOM_ICDF(12814), AOM_ICDF(16171), AOM_ICDF(17041), - AOM_ICDF(19066), AOM_ICDF(20145), AOM_ICDF(22933), AOM_ICDF(24074), - AOM_ICDF(25006), AOM_ICDF(28115), AOM_ICDF(29722), AOM_ICDF(30991), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(4486), AOM_ICDF(15821), AOM_ICDF(16330), - AOM_ICDF(18461), AOM_ICDF(18879), AOM_ICDF(22436), AOM_ICDF(25051), - AOM_ICDF(25443), AOM_ICDF(28637), AOM_ICDF(29396), AOM_ICDF(31412), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9856), AOM_ICDF(10863), AOM_ICDF(14050), AOM_ICDF(15920), - AOM_ICDF(18783), AOM_ICDF(19531), AOM_ICDF(22502), 
AOM_ICDF(24577), - AOM_ICDF(25361), AOM_ICDF(28559), AOM_ICDF(29600), AOM_ICDF(31336), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6528), AOM_ICDF(7620), AOM_ICDF(10182), AOM_ICDF(11199), - AOM_ICDF(17281), AOM_ICDF(19946), AOM_ICDF(23885), AOM_ICDF(25333), - AOM_ICDF(26130), AOM_ICDF(29425), AOM_ICDF(30332), AOM_ICDF(31948), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9728), AOM_ICDF(11821), AOM_ICDF(13954), AOM_ICDF(15233), - AOM_ICDF(19855), AOM_ICDF(24478), AOM_ICDF(28675), AOM_ICDF(29878), - AOM_ICDF(31238), AOM_ICDF(31741), AOM_ICDF(31874), AOM_ICDF(32048), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(5753), AOM_ICDF(9673), AOM_ICDF(10149), - AOM_ICDF(14343), AOM_ICDF(15190), AOM_ICDF(24967), AOM_ICDF(26378), - AOM_ICDF(26841), AOM_ICDF(29749), AOM_ICDF(30527), AOM_ICDF(32120), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5888), AOM_ICDF(6606), AOM_ICDF(11498), AOM_ICDF(12538), - AOM_ICDF(14737), AOM_ICDF(15425), AOM_ICDF(19549), AOM_ICDF(24047), - AOM_ICDF(24765), AOM_ICDF(28711), AOM_ICDF(29822), AOM_ICDF(32138), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10368), AOM_ICDF(11757), AOM_ICDF(14126), AOM_ICDF(15474), - AOM_ICDF(18311), AOM_ICDF(19358), AOM_ICDF(21539), AOM_ICDF(23451), - AOM_ICDF(25034), AOM_ICDF(28791), AOM_ICDF(30035), AOM_ICDF(31280), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(7623), AOM_ICDF(11378), AOM_ICDF(12248), - AOM_ICDF(15171), AOM_ICDF(15459), AOM_ICDF(18958), AOM_ICDF(20875), - AOM_ICDF(21955), AOM_ICDF(27411), AOM_ICDF(29196), AOM_ICDF(31723), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(7469), AOM_ICDF(11399), AOM_ICDF(12323), - AOM_ICDF(15165), AOM_ICDF(15528), AOM_ICDF(18804), AOM_ICDF(20769), - AOM_ICDF(21767), AOM_ICDF(27129), AOM_ICDF(29435), AOM_ICDF(31502), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7040), AOM_ICDF(8295), AOM_ICDF(12298), AOM_ICDF(13035), - AOM_ICDF(15194), AOM_ICDF(15357), AOM_ICDF(18976), AOM_ICDF(21100), - AOM_ICDF(21805), AOM_ICDF(26978), AOM_ICDF(28342), 
AOM_ICDF(31763), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5632), AOM_ICDF(7465), AOM_ICDF(14220), AOM_ICDF(15035), - AOM_ICDF(17014), AOM_ICDF(18105), AOM_ICDF(21111), AOM_ICDF(23027), - AOM_ICDF(23934), AOM_ICDF(27207), AOM_ICDF(28293), AOM_ICDF(30330), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(11008), AOM_ICDF(13089), AOM_ICDF(17144), AOM_ICDF(18425), - AOM_ICDF(19954), AOM_ICDF(20624), AOM_ICDF(21658), AOM_ICDF(24229), - AOM_ICDF(25290), AOM_ICDF(28803), AOM_ICDF(29938), AOM_ICDF(31493), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9088), AOM_ICDF(14218), AOM_ICDF(16378), AOM_ICDF(17699), - AOM_ICDF(18935), AOM_ICDF(19928), AOM_ICDF(20524), AOM_ICDF(22781), - AOM_ICDF(24155), AOM_ICDF(27523), AOM_ICDF(29068), AOM_ICDF(30270), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6144), AOM_ICDF(7194), AOM_ICDF(17912), AOM_ICDF(18991), - AOM_ICDF(19879), AOM_ICDF(20151), AOM_ICDF(21170), AOM_ICDF(23938), - AOM_ICDF(24712), AOM_ICDF(27763), AOM_ICDF(28556), AOM_ICDF(30584), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(11614), AOM_ICDF(13652), AOM_ICDF(16928), - AOM_ICDF(18425), AOM_ICDF(18967), AOM_ICDF(19724), AOM_ICDF(23817), - AOM_ICDF(25594), AOM_ICDF(28685), AOM_ICDF(29734), AOM_ICDF(30941), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7296), AOM_ICDF(8915), AOM_ICDF(11163), AOM_ICDF(13821), - AOM_ICDF(16951), AOM_ICDF(18507), AOM_ICDF(20180), AOM_ICDF(22423), - AOM_ICDF(24017), AOM_ICDF(28294), AOM_ICDF(29614), AOM_ICDF(31673), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9728), AOM_ICDF(13441), AOM_ICDF(15858), AOM_ICDF(18860), - AOM_ICDF(21713), AOM_ICDF(24478), AOM_ICDF(25995), AOM_ICDF(28233), - AOM_ICDF(30347), AOM_ICDF(30853), AOM_ICDF(31081), AOM_ICDF(31328), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6400), AOM_ICDF(7480), AOM_ICDF(11482), AOM_ICDF(13206), - AOM_ICDF(16199), AOM_ICDF(16908), AOM_ICDF(20436), AOM_ICDF(23507), - AOM_ICDF(24650), AOM_ICDF(28360), AOM_ICDF(29438), AOM_ICDF(31532), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9856), 
AOM_ICDF(10979), AOM_ICDF(13430), AOM_ICDF(15195), - AOM_ICDF(15957), AOM_ICDF(16350), AOM_ICDF(16871), AOM_ICDF(26198), - AOM_ICDF(26991), AOM_ICDF(29612), AOM_ICDF(30438), AOM_ICDF(31962), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(10529), AOM_ICDF(12640), AOM_ICDF(15350), - AOM_ICDF(16987), AOM_ICDF(17859), AOM_ICDF(18590), AOM_ICDF(21400), - AOM_ICDF(23812), AOM_ICDF(28188), AOM_ICDF(29589), AOM_ICDF(31280), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(8877), AOM_ICDF(12393), AOM_ICDF(14015), - AOM_ICDF(15655), AOM_ICDF(15794), AOM_ICDF(16814), AOM_ICDF(19923), - AOM_ICDF(21086), AOM_ICDF(26723), AOM_ICDF(28669), AOM_ICDF(31468), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6400), AOM_ICDF(8900), AOM_ICDF(12241), AOM_ICDF(13828), - AOM_ICDF(15513), AOM_ICDF(15671), AOM_ICDF(16500), AOM_ICDF(19257), - AOM_ICDF(20456), AOM_ICDF(25984), AOM_ICDF(28658), AOM_ICDF(31017), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7296), AOM_ICDF(8820), AOM_ICDF(12885), AOM_ICDF(14441), - AOM_ICDF(15813), AOM_ICDF(15911), AOM_ICDF(16954), AOM_ICDF(20026), - AOM_ICDF(20950), AOM_ICDF(26563), AOM_ICDF(28140), AOM_ICDF(31673), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(8455), AOM_ICDF(13328), AOM_ICDF(15907), - AOM_ICDF(17026), AOM_ICDF(17464), AOM_ICDF(18267), AOM_ICDF(21436), - AOM_ICDF(22712), AOM_ICDF(26403), AOM_ICDF(27660), AOM_ICDF(29559), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(6784), AOM_ICDF(11216), AOM_ICDF(13269), AOM_ICDF(15677), - AOM_ICDF(16931), AOM_ICDF(18445), AOM_ICDF(19097), AOM_ICDF(20082), - AOM_ICDF(24298), AOM_ICDF(28236), AOM_ICDF(30118), AOM_ICDF(31448), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(13240), AOM_ICDF(14110), AOM_ICDF(16966), - AOM_ICDF(17743), AOM_ICDF(18916), AOM_ICDF(19281), AOM_ICDF(19848), - AOM_ICDF(25552), AOM_ICDF(28646), AOM_ICDF(30444), AOM_ICDF(31291), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4352), AOM_ICDF(6870), AOM_ICDF(14660), AOM_ICDF(16597), - AOM_ICDF(17361), 
AOM_ICDF(18126), AOM_ICDF(18852), AOM_ICDF(20765), - AOM_ICDF(23526), AOM_ICDF(27670), AOM_ICDF(29096), AOM_ICDF(31214), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9472), AOM_ICDF(11736), AOM_ICDF(13172), AOM_ICDF(18192), - AOM_ICDF(19070), AOM_ICDF(19651), AOM_ICDF(19991), AOM_ICDF(21793), - AOM_ICDF(26005), AOM_ICDF(29291), AOM_ICDF(30500), AOM_ICDF(31767), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(7252), AOM_ICDF(8651), AOM_ICDF(12379), - AOM_ICDF(14936), AOM_ICDF(17493), AOM_ICDF(18326), AOM_ICDF(19527), - AOM_ICDF(23655), AOM_ICDF(28031), AOM_ICDF(29960), AOM_ICDF(31773), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(11561), AOM_ICDF(12864), AOM_ICDF(15793), - AOM_ICDF(18765), AOM_ICDF(23040), AOM_ICDF(23640), AOM_ICDF(24415), - AOM_ICDF(31040), AOM_ICDF(31473), AOM_ICDF(31740), AOM_ICDF(31827), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(6825), AOM_ICDF(8810), AOM_ICDF(11269), - AOM_ICDF(14257), AOM_ICDF(15716), AOM_ICDF(18397), AOM_ICDF(20006), - AOM_ICDF(24020), AOM_ICDF(28230), AOM_ICDF(29780), AOM_ICDF(31773), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(9466), AOM_ICDF(11717), AOM_ICDF(15159), - AOM_ICDF(16237), AOM_ICDF(17145), AOM_ICDF(17814), AOM_ICDF(21258), - AOM_ICDF(24754), AOM_ICDF(28864), AOM_ICDF(30313), AOM_ICDF(32061), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7680), AOM_ICDF(10517), AOM_ICDF(11381), AOM_ICDF(16202), - AOM_ICDF(16809), AOM_ICDF(17425), AOM_ICDF(17774), AOM_ICDF(18764), - AOM_ICDF(26842), AOM_ICDF(29600), AOM_ICDF(31073), AOM_ICDF(31886), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4992), AOM_ICDF(8626), AOM_ICDF(10531), AOM_ICDF(13103), - AOM_ICDF(14495), AOM_ICDF(14784), AOM_ICDF(15365), AOM_ICDF(16657), - AOM_ICDF(21051), AOM_ICDF(27011), AOM_ICDF(29685), AOM_ICDF(31574), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4736), AOM_ICDF(9433), AOM_ICDF(10981), AOM_ICDF(13494), - AOM_ICDF(14644), AOM_ICDF(15043), AOM_ICDF(15396), AOM_ICDF(16378), - AOM_ICDF(21506), AOM_ICDF(26869), 
AOM_ICDF(29824), AOM_ICDF(31454), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(9526), AOM_ICDF(11905), AOM_ICDF(14476), - AOM_ICDF(15722), AOM_ICDF(16103), AOM_ICDF(16768), AOM_ICDF(18070), - AOM_ICDF(21630), AOM_ICDF(27401), AOM_ICDF(29592), AOM_ICDF(31818), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(9947), AOM_ICDF(12386), AOM_ICDF(15909), - AOM_ICDF(16496), AOM_ICDF(17397), AOM_ICDF(17866), AOM_ICDF(18927), - AOM_ICDF(24408), AOM_ICDF(27750), AOM_ICDF(29614), AOM_ICDF(30889), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(7424), AOM_ICDF(10538), AOM_ICDF(14098), AOM_ICDF(14891), - AOM_ICDF(16486), AOM_ICDF(16756), AOM_ICDF(17607), AOM_ICDF(18952), - AOM_ICDF(20168), AOM_ICDF(26275), AOM_ICDF(28617), AOM_ICDF(31580), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(13070), AOM_ICDF(14969), AOM_ICDF(15848), - AOM_ICDF(17197), AOM_ICDF(17447), AOM_ICDF(17954), AOM_ICDF(18747), - AOM_ICDF(20137), AOM_ICDF(25628), AOM_ICDF(28753), AOM_ICDF(30628), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3584), AOM_ICDF(5287), AOM_ICDF(16141), AOM_ICDF(16840), - AOM_ICDF(17670), AOM_ICDF(17760), AOM_ICDF(18532), AOM_ICDF(20387), - AOM_ICDF(21102), AOM_ICDF(26118), AOM_ICDF(27535), AOM_ICDF(30830), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(9594), AOM_ICDF(11770), AOM_ICDF(14505), - AOM_ICDF(16234), AOM_ICDF(16365), AOM_ICDF(17201), AOM_ICDF(20286), - AOM_ICDF(22128), AOM_ICDF(27371), AOM_ICDF(29426), AOM_ICDF(31580), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5632), AOM_ICDF(8393), AOM_ICDF(10566), AOM_ICDF(11917), - AOM_ICDF(16025), AOM_ICDF(16697), AOM_ICDF(18123), AOM_ICDF(19541), - AOM_ICDF(21135), AOM_ICDF(27059), AOM_ICDF(29325), AOM_ICDF(31814), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(7916), AOM_ICDF(9526), AOM_ICDF(11010), - AOM_ICDF(14114), AOM_ICDF(18169), AOM_ICDF(19510), AOM_ICDF(21031), - AOM_ICDF(23083), AOM_ICDF(27769), AOM_ICDF(29782), AOM_ICDF(31299), - AOM_ICDF(32768), 0, - }, - { - 
AOM_ICDF(5376), AOM_ICDF(7338), AOM_ICDF(10657), AOM_ICDF(11699), - AOM_ICDF(14780), AOM_ICDF(15070), AOM_ICDF(18291), AOM_ICDF(20170), - AOM_ICDF(21347), AOM_ICDF(26985), AOM_ICDF(28811), AOM_ICDF(31805), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5632), AOM_ICDF(7669), AOM_ICDF(11558), AOM_ICDF(12653), - AOM_ICDF(13962), AOM_ICDF(14116), AOM_ICDF(15074), AOM_ICDF(19886), - AOM_ICDF(21123), AOM_ICDF(26953), AOM_ICDF(28755), AOM_ICDF(31708), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6528), AOM_ICDF(9739), AOM_ICDF(11612), AOM_ICDF(13211), - AOM_ICDF(14992), AOM_ICDF(15237), AOM_ICDF(16016), AOM_ICDF(17677), - AOM_ICDF(20588), AOM_ICDF(26647), AOM_ICDF(29116), AOM_ICDF(31435), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(8346), AOM_ICDF(11022), AOM_ICDF(11976), - AOM_ICDF(13541), AOM_ICDF(13749), AOM_ICDF(14520), AOM_ICDF(16173), - AOM_ICDF(17567), AOM_ICDF(25182), AOM_ICDF(28111), AOM_ICDF(31591), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4736), AOM_ICDF(8894), AOM_ICDF(11294), AOM_ICDF(12220), - AOM_ICDF(13753), AOM_ICDF(14029), AOM_ICDF(14645), AOM_ICDF(16065), - AOM_ICDF(17621), AOM_ICDF(24911), AOM_ICDF(28655), AOM_ICDF(31344), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(8193), AOM_ICDF(11667), AOM_ICDF(12461), - AOM_ICDF(13880), AOM_ICDF(14040), AOM_ICDF(14946), AOM_ICDF(16537), - AOM_ICDF(17642), AOM_ICDF(25117), AOM_ICDF(27333), AOM_ICDF(31713), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4096), AOM_ICDF(8479), AOM_ICDF(13751), AOM_ICDF(14813), - AOM_ICDF(15994), AOM_ICDF(16157), AOM_ICDF(16905), AOM_ICDF(18314), - AOM_ICDF(19575), AOM_ICDF(25132), AOM_ICDF(27445), AOM_ICDF(30192), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(7936), AOM_ICDF(12263), AOM_ICDF(15558), AOM_ICDF(16331), - AOM_ICDF(17779), AOM_ICDF(18148), AOM_ICDF(18810), AOM_ICDF(19794), - AOM_ICDF(21046), AOM_ICDF(26644), AOM_ICDF(29417), AOM_ICDF(31507), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(15025), AOM_ICDF(16457), AOM_ICDF(17074), - 
AOM_ICDF(18079), AOM_ICDF(18299), AOM_ICDF(18648), AOM_ICDF(19240), - AOM_ICDF(20612), AOM_ICDF(25687), AOM_ICDF(29392), AOM_ICDF(30842), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(6037), AOM_ICDF(17465), AOM_ICDF(18089), - AOM_ICDF(18869), AOM_ICDF(18953), AOM_ICDF(19688), AOM_ICDF(21223), - AOM_ICDF(21816), AOM_ICDF(26562), AOM_ICDF(28195), AOM_ICDF(30621), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(11255), AOM_ICDF(13307), AOM_ICDF(15676), - AOM_ICDF(17392), AOM_ICDF(17603), AOM_ICDF(18268), AOM_ICDF(20783), - AOM_ICDF(22646), AOM_ICDF(27628), AOM_ICDF(29737), AOM_ICDF(31628), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(9119), AOM_ICDF(11015), AOM_ICDF(12269), - AOM_ICDF(16280), AOM_ICDF(17023), AOM_ICDF(18282), AOM_ICDF(19418), - AOM_ICDF(21325), AOM_ICDF(27309), AOM_ICDF(30004), AOM_ICDF(31818), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3968), AOM_ICDF(9094), AOM_ICDF(10606), AOM_ICDF(12007), - AOM_ICDF(14218), AOM_ICDF(18911), AOM_ICDF(20089), AOM_ICDF(20924), - AOM_ICDF(23587), AOM_ICDF(27808), AOM_ICDF(30253), AOM_ICDF(31305), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(8627), AOM_ICDF(11201), AOM_ICDF(12200), - AOM_ICDF(15305), AOM_ICDF(15671), AOM_ICDF(18639), AOM_ICDF(20185), - AOM_ICDF(21627), AOM_ICDF(26990), AOM_ICDF(29449), AOM_ICDF(31723), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6272), AOM_ICDF(8768), AOM_ICDF(12320), AOM_ICDF(13296), - AOM_ICDF(14643), AOM_ICDF(14970), AOM_ICDF(15760), AOM_ICDF(20545), - AOM_ICDF(21863), AOM_ICDF(27473), AOM_ICDF(29535), AOM_ICDF(31836), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(10905), AOM_ICDF(12656), AOM_ICDF(14084), - AOM_ICDF(15705), AOM_ICDF(16069), AOM_ICDF(16674), AOM_ICDF(17779), - AOM_ICDF(21041), AOM_ICDF(26586), AOM_ICDF(29539), AOM_ICDF(31253), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5248), AOM_ICDF(9672), AOM_ICDF(12113), AOM_ICDF(12871), - AOM_ICDF(14423), AOM_ICDF(14710), AOM_ICDF(15376), AOM_ICDF(16708), - 
AOM_ICDF(18092), AOM_ICDF(25260), AOM_ICDF(28991), AOM_ICDF(31585), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4736), AOM_ICDF(10789), AOM_ICDF(13029), AOM_ICDF(13750), - AOM_ICDF(15040), AOM_ICDF(15385), AOM_ICDF(15840), AOM_ICDF(16887), - AOM_ICDF(18393), AOM_ICDF(25230), AOM_ICDF(29558), AOM_ICDF(31454), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(9916), AOM_ICDF(12938), AOM_ICDF(13741), - AOM_ICDF(15030), AOM_ICDF(15297), AOM_ICDF(16116), AOM_ICDF(17333), - AOM_ICDF(18672), AOM_ICDF(25954), AOM_ICDF(28498), AOM_ICDF(31618), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4608), AOM_ICDF(10266), AOM_ICDF(15450), AOM_ICDF(16299), - AOM_ICDF(17114), AOM_ICDF(17288), AOM_ICDF(17775), AOM_ICDF(18835), - AOM_ICDF(20227), AOM_ICDF(25199), AOM_ICDF(28098), AOM_ICDF(30018), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(7296), AOM_ICDF(9951), AOM_ICDF(14124), AOM_ICDF(14806), - AOM_ICDF(16181), AOM_ICDF(16377), AOM_ICDF(17485), AOM_ICDF(19069), - AOM_ICDF(20078), AOM_ICDF(26051), AOM_ICDF(27777), AOM_ICDF(31574), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(13823), AOM_ICDF(15889), AOM_ICDF(16620), - AOM_ICDF(17709), AOM_ICDF(17881), AOM_ICDF(18327), AOM_ICDF(19140), - AOM_ICDF(20374), AOM_ICDF(25685), AOM_ICDF(28160), AOM_ICDF(30521), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3200), AOM_ICDF(4602), AOM_ICDF(16404), AOM_ICDF(17042), - AOM_ICDF(17780), AOM_ICDF(17829), AOM_ICDF(18706), AOM_ICDF(20608), - AOM_ICDF(21115), AOM_ICDF(25884), AOM_ICDF(26960), AOM_ICDF(30804), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7040), AOM_ICDF(9444), AOM_ICDF(11770), AOM_ICDF(14321), - AOM_ICDF(15951), AOM_ICDF(16074), AOM_ICDF(17033), AOM_ICDF(20352), - AOM_ICDF(22301), AOM_ICDF(27567), AOM_ICDF(29151), AOM_ICDF(31662), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6016), AOM_ICDF(8316), AOM_ICDF(10849), AOM_ICDF(12136), - AOM_ICDF(15860), AOM_ICDF(16430), AOM_ICDF(17935), AOM_ICDF(19659), - AOM_ICDF(21083), AOM_ICDF(26968), AOM_ICDF(28839), AOM_ICDF(31618), - 
AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(7472), AOM_ICDF(9436), AOM_ICDF(11038), - AOM_ICDF(13625), AOM_ICDF(17596), AOM_ICDF(18959), AOM_ICDF(20543), - AOM_ICDF(22879), AOM_ICDF(27487), AOM_ICDF(29351), AOM_ICDF(31186), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(7117), AOM_ICDF(11424), AOM_ICDF(12381), - AOM_ICDF(14823), AOM_ICDF(15053), AOM_ICDF(18656), AOM_ICDF(20818), - AOM_ICDF(21722), AOM_ICDF(27042), AOM_ICDF(28233), AOM_ICDF(31591), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(7281), AOM_ICDF(11910), AOM_ICDF(12912), - AOM_ICDF(14229), AOM_ICDF(14391), AOM_ICDF(15474), AOM_ICDF(20113), - AOM_ICDF(21128), AOM_ICDF(26627), AOM_ICDF(28077), AOM_ICDF(31713), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6656), AOM_ICDF(9452), AOM_ICDF(11526), AOM_ICDF(13288), - AOM_ICDF(14861), AOM_ICDF(15062), AOM_ICDF(15909), AOM_ICDF(17695), - AOM_ICDF(20429), AOM_ICDF(26225), AOM_ICDF(28603), AOM_ICDF(31340), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5376), AOM_ICDF(7722), AOM_ICDF(10921), AOM_ICDF(11813), - AOM_ICDF(13222), AOM_ICDF(13348), AOM_ICDF(14211), AOM_ICDF(15976), - AOM_ICDF(17110), AOM_ICDF(24634), AOM_ICDF(27176), AOM_ICDF(31484), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4736), AOM_ICDF(8226), AOM_ICDF(11137), AOM_ICDF(11988), - AOM_ICDF(13518), AOM_ICDF(13706), AOM_ICDF(14332), AOM_ICDF(16016), - AOM_ICDF(17301), AOM_ICDF(24641), AOM_ICDF(27704), AOM_ICDF(31016), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(7592), AOM_ICDF(11880), AOM_ICDF(12612), - AOM_ICDF(13738), AOM_ICDF(13813), AOM_ICDF(14681), AOM_ICDF(16392), - AOM_ICDF(17306), AOM_ICDF(24619), AOM_ICDF(26334), AOM_ICDF(31818), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4096), AOM_ICDF(8524), AOM_ICDF(14316), AOM_ICDF(15392), - AOM_ICDF(16295), AOM_ICDF(16433), AOM_ICDF(17197), AOM_ICDF(18718), - AOM_ICDF(19924), AOM_ICDF(25123), AOM_ICDF(26953), AOM_ICDF(29856), - AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(6528), AOM_ICDF(13383), 
AOM_ICDF(17642), AOM_ICDF(18342), - AOM_ICDF(19224), AOM_ICDF(20209), AOM_ICDF(20899), AOM_ICDF(21944), - AOM_ICDF(23137), AOM_ICDF(25966), AOM_ICDF(27429), AOM_ICDF(28463), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4480), AOM_ICDF(16901), AOM_ICDF(18876), AOM_ICDF(19560), - AOM_ICDF(20257), AOM_ICDF(20912), AOM_ICDF(21169), AOM_ICDF(21959), - AOM_ICDF(23036), AOM_ICDF(25781), AOM_ICDF(27676), AOM_ICDF(28569), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2688), AOM_ICDF(5337), AOM_ICDF(18178), AOM_ICDF(18829), - AOM_ICDF(19344), AOM_ICDF(19628), AOM_ICDF(20267), AOM_ICDF(22135), - AOM_ICDF(22671), AOM_ICDF(25817), AOM_ICDF(26914), AOM_ICDF(28773), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8192), AOM_ICDF(11378), AOM_ICDF(14742), AOM_ICDF(17269), - AOM_ICDF(18230), AOM_ICDF(19001), AOM_ICDF(19655), AOM_ICDF(22949), - AOM_ICDF(24337), AOM_ICDF(28025), AOM_ICDF(29503), AOM_ICDF(30848), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(10133), AOM_ICDF(13144), AOM_ICDF(14374), - AOM_ICDF(17020), AOM_ICDF(18920), AOM_ICDF(20235), AOM_ICDF(21677), - AOM_ICDF(23142), AOM_ICDF(27131), AOM_ICDF(28671), AOM_ICDF(30284), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(15588), AOM_ICDF(18431), AOM_ICDF(19723), - AOM_ICDF(21455), AOM_ICDF(24705), AOM_ICDF(25461), AOM_ICDF(26753), - AOM_ICDF(28923), AOM_ICDF(29475), AOM_ICDF(29729), AOM_ICDF(29897), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4224), AOM_ICDF(8689), AOM_ICDF(13024), AOM_ICDF(13658), - AOM_ICDF(16637), AOM_ICDF(17307), AOM_ICDF(20836), AOM_ICDF(22665), - AOM_ICDF(23673), AOM_ICDF(27015), AOM_ICDF(28310), AOM_ICDF(30203), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5120), AOM_ICDF(7896), AOM_ICDF(13618), AOM_ICDF(14900), - AOM_ICDF(15708), AOM_ICDF(16153), AOM_ICDF(16997), AOM_ICDF(23625), - AOM_ICDF(24466), AOM_ICDF(27719), AOM_ICDF(28892), AOM_ICDF(30500), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(5760), AOM_ICDF(11305), AOM_ICDF(13669), AOM_ICDF(15462), - AOM_ICDF(16564), AOM_ICDF(17683), 
AOM_ICDF(18252), AOM_ICDF(20073), - AOM_ICDF(22917), AOM_ICDF(27005), AOM_ICDF(28923), AOM_ICDF(30236), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4224), AOM_ICDF(9510), AOM_ICDF(13787), AOM_ICDF(14587), - AOM_ICDF(15753), AOM_ICDF(15925), AOM_ICDF(16513), AOM_ICDF(18193), - AOM_ICDF(19490), AOM_ICDF(24944), AOM_ICDF(27482), AOM_ICDF(29757), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3840), AOM_ICDF(10052), AOM_ICDF(14106), AOM_ICDF(14887), - AOM_ICDF(15827), AOM_ICDF(15996), AOM_ICDF(16522), AOM_ICDF(17939), - AOM_ICDF(19204), AOM_ICDF(24508), AOM_ICDF(27661), AOM_ICDF(29491), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(4736), AOM_ICDF(9676), AOM_ICDF(14492), AOM_ICDF(15163), - AOM_ICDF(16179), AOM_ICDF(16390), AOM_ICDF(17133), AOM_ICDF(18905), - AOM_ICDF(19864), AOM_ICDF(25185), AOM_ICDF(27191), AOM_ICDF(30030), - AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(3584), AOM_ICDF(9370), AOM_ICDF(14746), AOM_ICDF(15820), - AOM_ICDF(16708), AOM_ICDF(17224), AOM_ICDF(17718), AOM_ICDF(19329), - AOM_ICDF(20405), AOM_ICDF(23541), AOM_ICDF(25258), AOM_ICDF(26726), - AOM_ICDF(32768), 0, - }, - }, -#else - { - { - AOM_ICDF(15488), AOM_ICDF(18706), AOM_ICDF(22561), AOM_ICDF(23619), - AOM_ICDF(24954), AOM_ICDF(25782), AOM_ICDF(26710), AOM_ICDF(27861), - AOM_ICDF(28656), AOM_ICDF(30743), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11648), AOM_ICDF(18744), AOM_ICDF(20846), AOM_ICDF(22100), - AOM_ICDF(23332), AOM_ICDF(24337), AOM_ICDF(25093), AOM_ICDF(26104), - AOM_ICDF(27097), AOM_ICDF(29633), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(10732), AOM_ICDF(22507), AOM_ICDF(23254), - AOM_ICDF(24382), AOM_ICDF(24876), AOM_ICDF(25827), AOM_ICDF(27488), - AOM_ICDF(28040), AOM_ICDF(30108), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(13568), AOM_ICDF(16981), AOM_ICDF(19885), AOM_ICDF(22014), - AOM_ICDF(23543), AOM_ICDF(24658), AOM_ICDF(25641), AOM_ICDF(27378), - AOM_ICDF(28625), AOM_ICDF(31043), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9600), AOM_ICDF(12225), AOM_ICDF(14408), AOM_ICDF(16033), - 
AOM_ICDF(19544), AOM_ICDF(22318), AOM_ICDF(23960), AOM_ICDF(25617), - AOM_ICDF(26522), AOM_ICDF(30596), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12160), AOM_ICDF(15078), AOM_ICDF(16990), AOM_ICDF(18964), - AOM_ICDF(22725), AOM_ICDF(25793), AOM_ICDF(27133), AOM_ICDF(28447), - AOM_ICDF(30831), AOM_ICDF(30836), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9088), AOM_ICDF(11274), AOM_ICDF(15818), AOM_ICDF(16940), - AOM_ICDF(21178), AOM_ICDF(22338), AOM_ICDF(26171), AOM_ICDF(27754), - AOM_ICDF(28503), AOM_ICDF(31473), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10880), AOM_ICDF(13846), AOM_ICDF(18649), AOM_ICDF(20252), - AOM_ICDF(22157), AOM_ICDF(22992), AOM_ICDF(24396), AOM_ICDF(27581), - AOM_ICDF(28501), AOM_ICDF(31400), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11008), AOM_ICDF(13462), AOM_ICDF(15747), AOM_ICDF(18378), - AOM_ICDF(20085), AOM_ICDF(21663), AOM_ICDF(22766), AOM_ICDF(24635), - AOM_ICDF(27476), AOM_ICDF(30643), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10112), AOM_ICDF(13147), AOM_ICDF(16135), AOM_ICDF(17577), - AOM_ICDF(19681), AOM_ICDF(19689), AOM_ICDF(20856), AOM_ICDF(22374), - AOM_ICDF(24454), AOM_ICDF(30555), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8704), AOM_ICDF(12176), AOM_ICDF(17582), AOM_ICDF(18905), - AOM_ICDF(19994), AOM_ICDF(20669), AOM_ICDF(21635), AOM_ICDF(23564), - AOM_ICDF(24741), AOM_ICDF(27222), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(8448), AOM_ICDF(18738), AOM_ICDF(21694), AOM_ICDF(22413), - AOM_ICDF(23358), AOM_ICDF(24675), AOM_ICDF(25193), AOM_ICDF(26119), - AOM_ICDF(27310), AOM_ICDF(30773), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6656), AOM_ICDF(22027), AOM_ICDF(23242), AOM_ICDF(23986), - AOM_ICDF(24529), AOM_ICDF(25363), AOM_ICDF(25646), AOM_ICDF(26087), - AOM_ICDF(27130), AOM_ICDF(30218), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(13862), AOM_ICDF(21137), AOM_ICDF(22124), - AOM_ICDF(23036), AOM_ICDF(23803), AOM_ICDF(24458), AOM_ICDF(26390), - AOM_ICDF(27342), AOM_ICDF(30968), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9600), 
AOM_ICDF(17409), AOM_ICDF(19830), AOM_ICDF(21521), - AOM_ICDF(22580), AOM_ICDF(23726), AOM_ICDF(24377), AOM_ICDF(25679), - AOM_ICDF(27269), AOM_ICDF(30867), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6912), AOM_ICDF(15832), AOM_ICDF(17559), AOM_ICDF(18777), - AOM_ICDF(20425), AOM_ICDF(22719), AOM_ICDF(23447), AOM_ICDF(24952), - AOM_ICDF(26527), AOM_ICDF(30950), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7808), AOM_ICDF(18730), AOM_ICDF(20143), AOM_ICDF(21445), - AOM_ICDF(23347), AOM_ICDF(26267), AOM_ICDF(27229), AOM_ICDF(28315), - AOM_ICDF(30911), AOM_ICDF(30915), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(14299), AOM_ICDF(17264), AOM_ICDF(18505), - AOM_ICDF(20765), AOM_ICDF(22440), AOM_ICDF(24331), AOM_ICDF(26038), - AOM_ICDF(27481), AOM_ICDF(31448), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8832), AOM_ICDF(15726), AOM_ICDF(19455), AOM_ICDF(20668), - AOM_ICDF(21607), AOM_ICDF(22655), AOM_ICDF(23384), AOM_ICDF(26356), - AOM_ICDF(27697), AOM_ICDF(31459), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8192), AOM_ICDF(17385), AOM_ICDF(18866), AOM_ICDF(20120), - AOM_ICDF(21273), AOM_ICDF(22853), AOM_ICDF(23470), AOM_ICDF(24881), - AOM_ICDF(27216), AOM_ICDF(31040), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6656), AOM_ICDF(16341), AOM_ICDF(18497), AOM_ICDF(19439), - AOM_ICDF(20706), AOM_ICDF(20711), AOM_ICDF(21234), AOM_ICDF(22307), - AOM_ICDF(23950), AOM_ICDF(30728), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6400), AOM_ICDF(17625), AOM_ICDF(20326), AOM_ICDF(21821), - AOM_ICDF(22568), AOM_ICDF(23415), AOM_ICDF(23854), AOM_ICDF(24896), - AOM_ICDF(26171), AOM_ICDF(29575), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(12032), AOM_ICDF(14259), AOM_ICDF(22597), AOM_ICDF(23443), - AOM_ICDF(24581), AOM_ICDF(25079), AOM_ICDF(26399), AOM_ICDF(27862), - AOM_ICDF(28509), AOM_ICDF(30419), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9216), AOM_ICDF(14883), AOM_ICDF(20941), AOM_ICDF(21958), - AOM_ICDF(23597), AOM_ICDF(24328), AOM_ICDF(25208), AOM_ICDF(26590), - AOM_ICDF(27377), 
AOM_ICDF(29364), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(8088), AOM_ICDF(24407), AOM_ICDF(25006), - AOM_ICDF(25777), AOM_ICDF(25950), AOM_ICDF(26882), AOM_ICDF(28811), - AOM_ICDF(29159), AOM_ICDF(30636), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11904), AOM_ICDF(14425), AOM_ICDF(18729), AOM_ICDF(20730), - AOM_ICDF(21998), AOM_ICDF(22686), AOM_ICDF(23856), AOM_ICDF(26580), - AOM_ICDF(27613), AOM_ICDF(29834), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10752), AOM_ICDF(12784), AOM_ICDF(16305), AOM_ICDF(17624), - AOM_ICDF(20320), AOM_ICDF(22450), AOM_ICDF(24380), AOM_ICDF(26773), - AOM_ICDF(27837), AOM_ICDF(30016), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(14090), AOM_ICDF(18314), AOM_ICDF(20621), - AOM_ICDF(23539), AOM_ICDF(25261), AOM_ICDF(26953), AOM_ICDF(28692), - AOM_ICDF(30064), AOM_ICDF(30071), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(10229), AOM_ICDF(16542), AOM_ICDF(17725), - AOM_ICDF(21504), AOM_ICDF(22332), AOM_ICDF(26006), AOM_ICDF(27895), - AOM_ICDF(28487), AOM_ICDF(31248), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9728), AOM_ICDF(11162), AOM_ICDF(19379), AOM_ICDF(20981), - AOM_ICDF(22356), AOM_ICDF(22926), AOM_ICDF(24318), AOM_ICDF(28364), - AOM_ICDF(29020), AOM_ICDF(31328), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9216), AOM_ICDF(10861), AOM_ICDF(14850), AOM_ICDF(16471), - AOM_ICDF(18611), AOM_ICDF(19674), AOM_ICDF(21009), AOM_ICDF(23454), - AOM_ICDF(26078), AOM_ICDF(29272), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7808), AOM_ICDF(10132), AOM_ICDF(17327), AOM_ICDF(18472), - AOM_ICDF(20126), AOM_ICDF(20132), AOM_ICDF(21599), AOM_ICDF(23338), - AOM_ICDF(24514), AOM_ICDF(29843), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(9210), AOM_ICDF(19309), AOM_ICDF(20715), - AOM_ICDF(21833), AOM_ICDF(22262), AOM_ICDF(23353), AOM_ICDF(24942), - AOM_ICDF(25800), AOM_ICDF(28200), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(12288), AOM_ICDF(15040), AOM_ICDF(18401), AOM_ICDF(21071), - AOM_ICDF(22800), AOM_ICDF(23945), 
AOM_ICDF(25274), AOM_ICDF(26939), - AOM_ICDF(28554), AOM_ICDF(31328), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9344), AOM_ICDF(17170), AOM_ICDF(19325), AOM_ICDF(22119), - AOM_ICDF(23284), AOM_ICDF(24378), AOM_ICDF(24911), AOM_ICDF(26095), - AOM_ICDF(27781), AOM_ICDF(31121), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9344), AOM_ICDF(11650), AOM_ICDF(19788), AOM_ICDF(21928), - AOM_ICDF(22916), AOM_ICDF(23571), AOM_ICDF(24362), AOM_ICDF(26633), - AOM_ICDF(27946), AOM_ICDF(31212), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12928), AOM_ICDF(14428), AOM_ICDF(17080), AOM_ICDF(20882), - AOM_ICDF(22104), AOM_ICDF(23149), AOM_ICDF(23715), AOM_ICDF(27167), - AOM_ICDF(28932), AOM_ICDF(31218), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9088), AOM_ICDF(11962), AOM_ICDF(13849), AOM_ICDF(16880), - AOM_ICDF(19818), AOM_ICDF(21895), AOM_ICDF(23000), AOM_ICDF(25923), - AOM_ICDF(27961), AOM_ICDF(31380), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10240), AOM_ICDF(13336), AOM_ICDF(15505), AOM_ICDF(18844), - AOM_ICDF(21646), AOM_ICDF(24723), AOM_ICDF(25832), AOM_ICDF(27802), - AOM_ICDF(31088), AOM_ICDF(31096), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8704), AOM_ICDF(10683), AOM_ICDF(14446), AOM_ICDF(17035), - AOM_ICDF(20211), AOM_ICDF(21577), AOM_ICDF(24370), AOM_ICDF(26477), - AOM_ICDF(28223), AOM_ICDF(31734), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12928), AOM_ICDF(17358), AOM_ICDF(19982), AOM_ICDF(22123), - AOM_ICDF(23335), AOM_ICDF(23948), AOM_ICDF(24890), AOM_ICDF(28884), - AOM_ICDF(30197), AOM_ICDF(32148), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(12429), AOM_ICDF(16401), AOM_ICDF(20493), - AOM_ICDF(21471), AOM_ICDF(22433), AOM_ICDF(23162), AOM_ICDF(24686), - AOM_ICDF(29027), AOM_ICDF(31115), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(12157), AOM_ICDF(14796), AOM_ICDF(17676), - AOM_ICDF(19754), AOM_ICDF(19762), AOM_ICDF(20641), AOM_ICDF(23274), - AOM_ICDF(25569), AOM_ICDF(31058), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7296), AOM_ICDF(11083), AOM_ICDF(15313), 
AOM_ICDF(20550), - AOM_ICDF(21783), AOM_ICDF(22727), AOM_ICDF(23461), AOM_ICDF(25072), - AOM_ICDF(27195), AOM_ICDF(30380), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(10880), AOM_ICDF(13214), AOM_ICDF(15829), AOM_ICDF(16866), - AOM_ICDF(20613), AOM_ICDF(22316), AOM_ICDF(24539), AOM_ICDF(27077), - AOM_ICDF(28116), AOM_ICDF(31485), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9984), AOM_ICDF(13868), AOM_ICDF(16397), AOM_ICDF(17486), - AOM_ICDF(20011), AOM_ICDF(22071), AOM_ICDF(23357), AOM_ICDF(24990), - AOM_ICDF(26336), AOM_ICDF(30276), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(8637), AOM_ICDF(17963), AOM_ICDF(18813), - AOM_ICDF(21065), AOM_ICDF(22052), AOM_ICDF(23502), AOM_ICDF(25702), - AOM_ICDF(26745), AOM_ICDF(30668), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(10682), AOM_ICDF(12496), AOM_ICDF(18240), - AOM_ICDF(20500), AOM_ICDF(21585), AOM_ICDF(23387), AOM_ICDF(25795), - AOM_ICDF(27119), AOM_ICDF(31001), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9856), AOM_ICDF(12056), AOM_ICDF(13722), AOM_ICDF(15196), - AOM_ICDF(19276), AOM_ICDF(21891), AOM_ICDF(23643), AOM_ICDF(25538), - AOM_ICDF(26854), AOM_ICDF(31515), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9984), AOM_ICDF(12963), AOM_ICDF(14960), AOM_ICDF(16734), - AOM_ICDF(21279), AOM_ICDF(25616), AOM_ICDF(27638), AOM_ICDF(28950), - AOM_ICDF(31161), AOM_ICDF(31166), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(8604), AOM_ICDF(12044), AOM_ICDF(13632), - AOM_ICDF(18931), AOM_ICDF(20553), AOM_ICDF(23452), AOM_ICDF(25800), - AOM_ICDF(27754), AOM_ICDF(31668), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11520), AOM_ICDF(13372), AOM_ICDF(16642), AOM_ICDF(18137), - AOM_ICDF(20232), AOM_ICDF(21510), AOM_ICDF(23052), AOM_ICDF(26792), - AOM_ICDF(27974), AOM_ICDF(31274), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10240), AOM_ICDF(12483), AOM_ICDF(14364), AOM_ICDF(16168), - AOM_ICDF(18668), AOM_ICDF(20707), AOM_ICDF(22158), AOM_ICDF(24410), - AOM_ICDF(26370), AOM_ICDF(30744), AOM_ICDF(32768), 0, - }, - { - 
AOM_ICDF(8064), AOM_ICDF(10798), AOM_ICDF(13829), AOM_ICDF(15128), - AOM_ICDF(19136), AOM_ICDF(19152), AOM_ICDF(21057), AOM_ICDF(22583), - AOM_ICDF(24513), AOM_ICDF(30645), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(11025), AOM_ICDF(16073), AOM_ICDF(17603), - AOM_ICDF(20094), AOM_ICDF(21468), AOM_ICDF(22971), AOM_ICDF(24628), - AOM_ICDF(26015), AOM_ICDF(29728), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(10368), AOM_ICDF(15372), AOM_ICDF(18442), AOM_ICDF(19576), - AOM_ICDF(22674), AOM_ICDF(27128), AOM_ICDF(28232), AOM_ICDF(29624), - AOM_ICDF(31363), AOM_ICDF(31368), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9472), AOM_ICDF(16687), AOM_ICDF(18957), AOM_ICDF(20272), - AOM_ICDF(22852), AOM_ICDF(27082), AOM_ICDF(27839), AOM_ICDF(28995), - AOM_ICDF(30943), AOM_ICDF(30948), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8064), AOM_ICDF(12334), AOM_ICDF(19197), AOM_ICDF(20956), - AOM_ICDF(24804), AOM_ICDF(26553), AOM_ICDF(27556), AOM_ICDF(29877), - AOM_ICDF(31311), AOM_ICDF(31320), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(14083), AOM_ICDF(16058), AOM_ICDF(19129), - AOM_ICDF(21136), AOM_ICDF(23635), AOM_ICDF(24870), AOM_ICDF(27577), - AOM_ICDF(31176), AOM_ICDF(31187), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9984), AOM_ICDF(14208), AOM_ICDF(15589), AOM_ICDF(17640), - AOM_ICDF(22080), AOM_ICDF(26660), AOM_ICDF(27947), AOM_ICDF(29400), - AOM_ICDF(31605), AOM_ICDF(31611), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9216), AOM_ICDF(15167), AOM_ICDF(16263), AOM_ICDF(17767), - AOM_ICDF(21531), AOM_ICDF(26689), AOM_ICDF(27607), AOM_ICDF(28880), - AOM_ICDF(31291), AOM_ICDF(31296), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8448), AOM_ICDF(12756), AOM_ICDF(15781), AOM_ICDF(17279), - AOM_ICDF(21198), AOM_ICDF(24057), AOM_ICDF(26171), AOM_ICDF(29200), - AOM_ICDF(31901), AOM_ICDF(31913), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9984), AOM_ICDF(15074), AOM_ICDF(18244), AOM_ICDF(19878), - AOM_ICDF(22246), AOM_ICDF(24436), AOM_ICDF(25560), AOM_ICDF(28991), - 
AOM_ICDF(31687), AOM_ICDF(31700), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10496), AOM_ICDF(15128), AOM_ICDF(17012), AOM_ICDF(18989), - AOM_ICDF(21294), AOM_ICDF(25011), AOM_ICDF(25999), AOM_ICDF(27784), - AOM_ICDF(30934), AOM_ICDF(30941), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(2944), AOM_ICDF(5875), AOM_ICDF(8846), AOM_ICDF(11817), - AOM_ICDF(14806), AOM_ICDF(17795), AOM_ICDF(20769), AOM_ICDF(23761), - AOM_ICDF(26747), AOM_ICDF(29739), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7424), AOM_ICDF(12915), AOM_ICDF(17544), AOM_ICDF(19392), - AOM_ICDF(23074), AOM_ICDF(25635), AOM_ICDF(26431), AOM_ICDF(28241), - AOM_ICDF(30088), AOM_ICDF(30095), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(11648), AOM_ICDF(13565), AOM_ICDF(18996), AOM_ICDF(19908), - AOM_ICDF(21897), AOM_ICDF(22852), AOM_ICDF(26656), AOM_ICDF(28172), - AOM_ICDF(28995), AOM_ICDF(31283), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10240), AOM_ICDF(14255), AOM_ICDF(18109), AOM_ICDF(19716), - AOM_ICDF(21521), AOM_ICDF(22859), AOM_ICDF(24613), AOM_ICDF(26161), - AOM_ICDF(27279), AOM_ICDF(30392), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(7848), AOM_ICDF(18820), AOM_ICDF(19447), - AOM_ICDF(22335), AOM_ICDF(22733), AOM_ICDF(25112), AOM_ICDF(28427), - AOM_ICDF(29013), AOM_ICDF(31550), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11904), AOM_ICDF(13581), AOM_ICDF(17695), AOM_ICDF(19311), - AOM_ICDF(21698), AOM_ICDF(22562), AOM_ICDF(24391), AOM_ICDF(26559), - AOM_ICDF(27779), AOM_ICDF(30567), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10624), AOM_ICDF(12334), AOM_ICDF(14643), AOM_ICDF(16255), - AOM_ICDF(20783), AOM_ICDF(22767), AOM_ICDF(24929), AOM_ICDF(26876), - AOM_ICDF(27998), AOM_ICDF(31470), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12032), AOM_ICDF(14415), AOM_ICDF(16715), AOM_ICDF(18712), - AOM_ICDF(21557), AOM_ICDF(25332), AOM_ICDF(27840), AOM_ICDF(29663), - AOM_ICDF(31708), AOM_ICDF(31715), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9728), AOM_ICDF(10683), AOM_ICDF(13955), AOM_ICDF(14786), - AOM_ICDF(18481), 
AOM_ICDF(19492), AOM_ICDF(26749), AOM_ICDF(28483), - AOM_ICDF(29116), AOM_ICDF(31958), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(10032), AOM_ICDF(15755), AOM_ICDF(16949), - AOM_ICDF(19144), AOM_ICDF(19744), AOM_ICDF(22082), AOM_ICDF(27608), - AOM_ICDF(28411), AOM_ICDF(31838), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(14592), AOM_ICDF(15937), AOM_ICDF(18518), AOM_ICDF(19566), - AOM_ICDF(21817), AOM_ICDF(23102), AOM_ICDF(24436), AOM_ICDF(26651), - AOM_ICDF(28100), AOM_ICDF(30993), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(10791), AOM_ICDF(14718), AOM_ICDF(16094), - AOM_ICDF(18560), AOM_ICDF(18570), AOM_ICDF(22120), AOM_ICDF(24188), - AOM_ICDF(25677), AOM_ICDF(31280), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11136), AOM_ICDF(13058), AOM_ICDF(19006), AOM_ICDF(20135), - AOM_ICDF(21463), AOM_ICDF(22159), AOM_ICDF(24042), AOM_ICDF(26348), - AOM_ICDF(27367), AOM_ICDF(30064), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(12544), AOM_ICDF(15384), AOM_ICDF(20327), AOM_ICDF(21555), - AOM_ICDF(23456), AOM_ICDF(24144), AOM_ICDF(25421), AOM_ICDF(27884), - AOM_ICDF(28875), AOM_ICDF(31188), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10368), AOM_ICDF(15009), AOM_ICDF(17631), AOM_ICDF(18970), - AOM_ICDF(20691), AOM_ICDF(21850), AOM_ICDF(22749), AOM_ICDF(25280), - AOM_ICDF(26570), AOM_ICDF(29530), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9088), AOM_ICDF(10956), AOM_ICDF(21554), AOM_ICDF(22698), - AOM_ICDF(23666), AOM_ICDF(24052), AOM_ICDF(25122), AOM_ICDF(27792), - AOM_ICDF(28612), AOM_ICDF(30825), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11520), AOM_ICDF(12888), AOM_ICDF(16374), AOM_ICDF(19132), - AOM_ICDF(21186), AOM_ICDF(21843), AOM_ICDF(22902), AOM_ICDF(26440), - AOM_ICDF(27928), AOM_ICDF(29946), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9984), AOM_ICDF(12199), AOM_ICDF(14625), AOM_ICDF(17321), - AOM_ICDF(20195), AOM_ICDF(21574), AOM_ICDF(23010), AOM_ICDF(25688), - AOM_ICDF(27600), AOM_ICDF(30988), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10112), 
AOM_ICDF(13705), AOM_ICDF(16847), AOM_ICDF(19242), - AOM_ICDF(22011), AOM_ICDF(24064), AOM_ICDF(26481), AOM_ICDF(29125), - AOM_ICDF(30545), AOM_ICDF(30555), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9344), AOM_ICDF(10994), AOM_ICDF(15018), AOM_ICDF(16915), - AOM_ICDF(20471), AOM_ICDF(21334), AOM_ICDF(24577), AOM_ICDF(27472), - AOM_ICDF(28592), AOM_ICDF(31578), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(12928), AOM_ICDF(14540), AOM_ICDF(18022), AOM_ICDF(19481), - AOM_ICDF(21028), AOM_ICDF(21825), AOM_ICDF(22728), AOM_ICDF(28191), - AOM_ICDF(29154), AOM_ICDF(31683), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10368), AOM_ICDF(12160), AOM_ICDF(14900), AOM_ICDF(17161), - AOM_ICDF(19379), AOM_ICDF(20521), AOM_ICDF(21747), AOM_ICDF(24534), - AOM_ICDF(26677), AOM_ICDF(30318), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8960), AOM_ICDF(11488), AOM_ICDF(16197), AOM_ICDF(18030), - AOM_ICDF(20010), AOM_ICDF(20018), AOM_ICDF(21347), AOM_ICDF(23948), - AOM_ICDF(25016), AOM_ICDF(30536), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7808), AOM_ICDF(10310), AOM_ICDF(15420), AOM_ICDF(18961), - AOM_ICDF(20114), AOM_ICDF(20772), AOM_ICDF(21721), AOM_ICDF(24599), - AOM_ICDF(26237), AOM_ICDF(29160), AOM_ICDF(32768), 0, - }, - }, - { - { - AOM_ICDF(9856), AOM_ICDF(13764), AOM_ICDF(16995), AOM_ICDF(19540), - AOM_ICDF(20802), AOM_ICDF(22302), AOM_ICDF(23113), AOM_ICDF(24519), - AOM_ICDF(27717), AOM_ICDF(31604), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8704), AOM_ICDF(15725), AOM_ICDF(17309), AOM_ICDF(20296), - AOM_ICDF(21257), AOM_ICDF(22573), AOM_ICDF(23165), AOM_ICDF(23893), - AOM_ICDF(27755), AOM_ICDF(31170), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7936), AOM_ICDF(11343), AOM_ICDF(19355), AOM_ICDF(21223), - AOM_ICDF(22121), AOM_ICDF(22978), AOM_ICDF(23703), AOM_ICDF(26079), - AOM_ICDF(27978), AOM_ICDF(31507), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11264), AOM_ICDF(14823), AOM_ICDF(17314), AOM_ICDF(20715), - AOM_ICDF(21999), AOM_ICDF(22982), AOM_ICDF(23728), AOM_ICDF(25229), - AOM_ICDF(28593), 
AOM_ICDF(31508), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(8704), AOM_ICDF(11788), AOM_ICDF(13666), AOM_ICDF(16523), - AOM_ICDF(18630), AOM_ICDF(20579), AOM_ICDF(21574), AOM_ICDF(23335), - AOM_ICDF(26298), AOM_ICDF(31264), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(9088), AOM_ICDF(14031), AOM_ICDF(15766), AOM_ICDF(18533), - AOM_ICDF(21457), AOM_ICDF(24078), AOM_ICDF(24973), AOM_ICDF(26102), - AOM_ICDF(31284), AOM_ICDF(31288), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7040), AOM_ICDF(9648), AOM_ICDF(12140), AOM_ICDF(14601), - AOM_ICDF(16742), AOM_ICDF(18070), AOM_ICDF(21154), AOM_ICDF(23582), - AOM_ICDF(27647), AOM_ICDF(31763), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(10240), AOM_ICDF(13466), AOM_ICDF(16837), AOM_ICDF(19351), - AOM_ICDF(20636), AOM_ICDF(21620), AOM_ICDF(22474), AOM_ICDF(25815), - AOM_ICDF(28364), AOM_ICDF(31976), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(11008), AOM_ICDF(13682), AOM_ICDF(15127), AOM_ICDF(18779), - AOM_ICDF(19841), AOM_ICDF(20792), AOM_ICDF(21954), AOM_ICDF(23365), - AOM_ICDF(29100), AOM_ICDF(31748), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(7168), AOM_ICDF(12260), AOM_ICDF(15037), AOM_ICDF(17152), - AOM_ICDF(18730), AOM_ICDF(18736), AOM_ICDF(19436), AOM_ICDF(20484), - AOM_ICDF(24465), AOM_ICDF(30868), AOM_ICDF(32768), 0, - }, - { - AOM_ICDF(6784), AOM_ICDF(12469), AOM_ICDF(15422), AOM_ICDF(19291), - AOM_ICDF(20301), AOM_ICDF(21344), AOM_ICDF(21894), AOM_ICDF(23415), - AOM_ICDF(27696), AOM_ICDF(31042), AOM_ICDF(32768), 0, - }, - }, - { { - AOM_ICDF(10112), AOM_ICDF(13929), AOM_ICDF(17880), AOM_ICDF(18857), - AOM_ICDF(20955), AOM_ICDF(20963), AOM_ICDF(21974), AOM_ICDF(23273), - AOM_ICDF(24734), AOM_ICDF(31352), AOM_ICDF(32768), 0, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { 
AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(1127, 12814, 22772, 27483) }, + { AOM_CDF5(145, 6761, 11980, 26667) }, + { AOM_CDF5(362, 5887, 11678, 16725) }, + { AOM_CDF5(385, 15213, 18587, 30693) }, + { AOM_CDF5(25, 2914, 23134, 27903) }, + { AOM_CDF5(60, 4470, 11749, 23991) }, + { AOM_CDF5(37, 3332, 14511, 21448) }, + { AOM_CDF5(157, 6320, 13036, 17439) }, + { AOM_CDF5(119, 6719, 12906, 29396) }, + { AOM_CDF5(47, 5537, 12576, 21499) }, + { AOM_CDF5(269, 6076, 11258, 23115) }, + { AOM_CDF5(83, 5615, 12001, 17228) }, + { AOM_CDF5(1968, 5556, 12023, 18547) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, }, + }; + +static const aom_cdf_prob + 
default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( + TX_TYPES)] = { { - AOM_ICDF(8064), AOM_ICDF(15826), AOM_ICDF(17929), AOM_ICDF(19017), - AOM_ICDF(21016), AOM_ICDF(21024), AOM_ICDF(21687), AOM_ICDF(22701), - AOM_ICDF(24242), AOM_ICDF(30645), AOM_ICDF(32768), 0, + { 0 }, + { 0 }, + { 0 }, + { 0 }, }, { - AOM_ICDF(6528), AOM_ICDF(9196), AOM_ICDF(20118), AOM_ICDF(21101), - AOM_ICDF(22227), AOM_ICDF(22231), AOM_ICDF(22997), AOM_ICDF(25070), - AOM_ICDF(25919), AOM_ICDF(30923), AOM_ICDF(32768), 0, + { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, + 22848, 23934, 25474, 27727, 28915, 30631) }, + { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, + 20408, 22517, 25010, 27116, 28856, 30749) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, }, { - AOM_ICDF(9600), AOM_ICDF(13218), AOM_ICDF(15898), AOM_ICDF(17780), - AOM_ICDF(19991), AOM_ICDF(20000), AOM_ICDF(21196), AOM_ICDF(23912), - AOM_ICDF(26044), AOM_ICDF(31139), AOM_ICDF(32768), 0, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, + 28526, 30529) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, }, { - AOM_ICDF(8960), AOM_ICDF(12037), AOM_ICDF(14178), AOM_ICDF(15681), - AOM_ICDF(20126), AOM_ICDF(20143), AOM_ICDF(21435), AOM_ICDF(23083), - AOM_ICDF(24675), AOM_ICDF(31466), AOM_ICDF(32768), 0, + { AOM_CDF2(16384) }, + { AOM_CDF2(4167) }, + { AOM_CDF2(1998) }, + { AOM_CDF2(748) }, }, + }; + +static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { + AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 
32294) +}; + +static const aom_cdf_prob + default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { + { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, + 32704, 32708, 32712, 32716, 32720, 32724) }, + { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, + 32647, 32668, 32672, 32676, 32680, 32684) }, + { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, + 32677, 32681, 32685, 32689, 32693, 32697) }, + { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, + 32712, 32716, 32720, 32724, 32728, 32732) }, + { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, + 32464, 32516, 32560, 32576, 32593, 32622) }, + { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, + 32413, 32520, 32594, 32622, 32656, 32660) } + }; + +static const aom_cdf_prob + default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( + SWITCHABLE_FILTERS)] = { + { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, + { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, + { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, + { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, + { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, + { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, + { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, + { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } + }; + +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] = + { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; + +static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; + +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] = + { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { 
AOM_CDF2(19923) } }; + +static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } +}; + +static const aom_cdf_prob + default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( + INTER_COMPOUND_MODES)] = { + { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } + }; + +static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + 2)] = { { AOM_CDF2(16384) }, + { AOM_CDF2(26887) }, + { AOM_CDF2(27597) }, + { AOM_CDF2(30237) } }; + +static const aom_cdf_prob + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] = + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; + +static const aom_cdf_prob + default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, + { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, + { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob + default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) 
}, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] = + { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359, + 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367, + 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332, + 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163, + 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730, + 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270, + 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638, + 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 
+ 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284, + 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057, + 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) } }; + +static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, + { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, + { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, + { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, + { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, + { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, + { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, + { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; + +static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) 
}, + { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, + { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, + { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, + { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, + { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, + { AOM_CDF2(26879) } +}; + +static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(806) }, + { AOM_CDF2(16662) }, + { AOM_CDF2(20186) }, + { AOM_CDF2(26538) } + }; + +static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(26828) }, + { AOM_CDF2(24035) }, + { AOM_CDF2(12031) }, + { AOM_CDF2(10640) }, + { AOM_CDF2(2901) } }; + +static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(1198) }, + { AOM_CDF2(2070) }, + { AOM_CDF2(9166) }, + { AOM_CDF2(7499) }, + { AOM_CDF2(22475) } + }; + +static const aom_cdf_prob + default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - + 1][CDF_SIZE(2)] = { + { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, + { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, + { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } + }; + +static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] + [CDF_SIZE(2)] = { + { { AOM_CDF2(4897) }, + { AOM_CDF2(1555) }, + { AOM_CDF2(4236) }, + { AOM_CDF2(8650) }, + { AOM_CDF2(904) }, + { AOM_CDF2(1444) } }, + { { AOM_CDF2(16973) }, + { AOM_CDF2(16751) }, + { AOM_CDF2(19647) }, + { AOM_CDF2(24773) }, + { AOM_CDF2(11014) }, + { AOM_CDF2(15087) } }, + { { AOM_CDF2(29744) }, + { AOM_CDF2(30279) }, + { AOM_CDF2(31194) }, + { AOM_CDF2(31895) }, + { AOM_CDF2(26875) }, + { AOM_CDF2(30304) } } + }; + +static const aom_cdf_prob + default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(4946) 
}, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, + { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, + { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } + }; + +static const aom_cdf_prob + default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, + { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, + { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } + }; + +static const aom_cdf_prob + default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, + { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, + { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) }, + { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, + { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, + { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, + { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } + }; + +static const aom_cdf_prob + default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, + { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, + { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, + { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, + { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, + { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, + { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } + }; + +static const aom_cdf_prob default_palette_y_mode_cdf + [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { + { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, + { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, + { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, + { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, + { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, + { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, + { { AOM_CDF2(32450) }, { 
AOM_CDF2(7946) }, { AOM_CDF2(129) } } + }; + +static const aom_cdf_prob + default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(32461) }, { AOM_CDF2(21488) } + }; + +static const aom_cdf_prob default_palette_y_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { { - AOM_ICDF(2944), AOM_ICDF(5875), AOM_ICDF(8846), AOM_ICDF(11817), - AOM_ICDF(14806), AOM_ICDF(17795), AOM_ICDF(20769), AOM_ICDF(23761), - AOM_ICDF(26747), AOM_ICDF(29739), AOM_ICDF(32768), 0, + { AOM_CDF2(28710) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10553) }, + { AOM_CDF2(27036) }, + { AOM_CDF2(31603) }, }, { - AOM_ICDF(9472), AOM_ICDF(12334), AOM_ICDF(15469), AOM_ICDF(16848), - AOM_ICDF(19972), AOM_ICDF(19984), AOM_ICDF(22292), AOM_ICDF(24384), - AOM_ICDF(25891), AOM_ICDF(31676), AOM_ICDF(32768), 0, + { AOM_CDF3(27877, 30490) }, + { AOM_CDF3(11532, 25697) }, + { AOM_CDF3(6544, 30234) }, + { AOM_CDF3(23018, 28072) }, + { AOM_CDF3(31915, 32385) }, }, { - AOM_ICDF(8448), AOM_ICDF(11176), AOM_ICDF(15497), AOM_ICDF(16676), - AOM_ICDF(18528), AOM_ICDF(18535), AOM_ICDF(19595), AOM_ICDF(24334), - AOM_ICDF(25725), AOM_ICDF(31723), AOM_ICDF(32768), 0, + { AOM_CDF4(25572, 28046, 30045) }, + { AOM_CDF4(9478, 21590, 27256) }, + { AOM_CDF4(7248, 26837, 29824) }, + { AOM_CDF4(19167, 24486, 28349) }, + { AOM_CDF4(31400, 31825, 32250) }, }, { - AOM_ICDF(8704), AOM_ICDF(12141), AOM_ICDF(14313), AOM_ICDF(15828), - AOM_ICDF(18358), AOM_ICDF(18368), AOM_ICDF(19469), AOM_ICDF(21089), - AOM_ICDF(24027), AOM_ICDF(30700), AOM_ICDF(32768), 0, + { AOM_CDF5(24779, 26955, 28576, 30282) }, + { AOM_CDF5(8669, 20364, 24073, 28093) }, + { AOM_CDF5(4255, 27565, 29377, 31067) }, + { AOM_CDF5(19864, 23674, 26716, 29530) }, + { AOM_CDF5(31646, 31893, 32147, 32426) }, }, { - AOM_ICDF(7680), AOM_ICDF(11689), AOM_ICDF(14556), AOM_ICDF(15548), - AOM_ICDF(17878), AOM_ICDF(17887), AOM_ICDF(18873), AOM_ICDF(20512), - AOM_ICDF(22152), AOM_ICDF(31004), AOM_ICDF(32768), 0, + { 
AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, + { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, + { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, + { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, + { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, }, { - AOM_ICDF(6656), AOM_ICDF(11476), AOM_ICDF(16600), AOM_ICDF(18052), - AOM_ICDF(19683), AOM_ICDF(19689), AOM_ICDF(20509), AOM_ICDF(22077), - AOM_ICDF(23496), AOM_ICDF(29504), AOM_ICDF(32768), 0, + { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, + { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, + { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, + { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, + { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, }, - }, - { { - AOM_ICDF(9728), AOM_ICDF(14651), AOM_ICDF(19394), AOM_ICDF(20550), - AOM_ICDF(21680), AOM_ICDF(22479), AOM_ICDF(23516), AOM_ICDF(24952), - AOM_ICDF(26183), AOM_ICDF(28538), AOM_ICDF(32768), 0, + { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, }, + }; + +static const aom_cdf_prob default_palette_uv_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { { - AOM_ICDF(8832), AOM_ICDF(18693), AOM_ICDF(20913), AOM_ICDF(21933), - AOM_ICDF(22956), AOM_ICDF(23831), AOM_ICDF(24341), AOM_ICDF(25317), - AOM_ICDF(26434), AOM_ICDF(29028), AOM_ICDF(32768), 0, + { AOM_CDF2(29089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(8713) }, + { AOM_CDF2(29257) }, + { AOM_CDF2(31610) }, }, { - AOM_ICDF(5888), AOM_ICDF(8413), AOM_ICDF(20542), AOM_ICDF(21609), - AOM_ICDF(22437), AOM_ICDF(22864), AOM_ICDF(23663), AOM_ICDF(26329), - AOM_ICDF(26900), AOM_ICDF(29828), AOM_ICDF(32768), 0, + { AOM_CDF3(25257, 29145) }, + { AOM_CDF3(12287, 27293) }, + { 
AOM_CDF3(7033, 27960) }, + { AOM_CDF3(20145, 25405) }, + { AOM_CDF3(30608, 31639) }, }, { - AOM_ICDF(9984), AOM_ICDF(13134), AOM_ICDF(16328), AOM_ICDF(18267), - AOM_ICDF(19814), AOM_ICDF(21461), AOM_ICDF(22393), AOM_ICDF(24944), - AOM_ICDF(26320), AOM_ICDF(29653), AOM_ICDF(32768), 0, + { AOM_CDF4(24210, 27175, 29903) }, + { AOM_CDF4(9888, 22386, 27214) }, + { AOM_CDF4(5901, 26053, 29293) }, + { AOM_CDF4(18318, 22152, 28333) }, + { AOM_CDF4(30459, 31136, 31926) }, }, { - AOM_ICDF(8448), AOM_ICDF(12425), AOM_ICDF(15474), AOM_ICDF(17031), - AOM_ICDF(19216), AOM_ICDF(20889), AOM_ICDF(23077), AOM_ICDF(25108), - AOM_ICDF(26548), AOM_ICDF(30108), AOM_ICDF(32768), 0, + { AOM_CDF5(22980, 25479, 27781, 29986) }, + { AOM_CDF5(8413, 21408, 24859, 28874) }, + { AOM_CDF5(2257, 29449, 30594, 31598) }, + { AOM_CDF5(19189, 21202, 25915, 28620) }, + { AOM_CDF5(31844, 32044, 32281, 32518) }, }, { - AOM_ICDF(9856), AOM_ICDF(15675), AOM_ICDF(19169), AOM_ICDF(20837), - AOM_ICDF(22638), AOM_ICDF(24556), AOM_ICDF(25438), AOM_ICDF(27114), - AOM_ICDF(29449), AOM_ICDF(29456), AOM_ICDF(32768), 0, + { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, + { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, + { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, + { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, + { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, }, { - AOM_ICDF(6784), AOM_ICDF(10294), AOM_ICDF(14542), AOM_ICDF(15724), - AOM_ICDF(19109), AOM_ICDF(19972), AOM_ICDF(24084), AOM_ICDF(26329), - AOM_ICDF(27637), AOM_ICDF(30433), AOM_ICDF(32768), 0, + { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, + { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, + { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, + { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, + { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, }, { - AOM_ICDF(8320), AOM_ICDF(10873), AOM_ICDF(17095), AOM_ICDF(18466), - AOM_ICDF(19674), AOM_ICDF(20129), AOM_ICDF(21230), AOM_ICDF(27562), - AOM_ICDF(28568), 
AOM_ICDF(30858), AOM_ICDF(32768), 0, + { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, }, + }; + +static const aom_cdf_prob + default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, + { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, + { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, + { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, + { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, + { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, + { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } + }; + +static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } +}; + +static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; + +static const aom_cdf_prob + default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, + { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } + }; + +static const aom_cdf_prob + default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, + { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } + }; + +static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 30531) }; + +static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( + FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; + +static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( + 
2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, + { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, + { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, + { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, + { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }; + +static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( + RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; + +static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 11570) }; + +static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 16855) }; + +static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( + DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) } }; +static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +// FIXME(someone) need real defaults here +static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = { + AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672) +}; + +static const aom_cdf_prob + default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { + { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } + }; + +static const aom_cdf_prob + default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( + MAX_SEGMENTS)] = { { - AOM_ICDF(9088), AOM_ICDF(13196), AOM_ICDF(15898), AOM_ICDF(17566), - AOM_ICDF(19210), AOM_ICDF(20354), AOM_ICDF(21186), AOM_ICDF(23647), - AOM_ICDF(26235), AOM_ICDF(30548), AOM_ICDF(32768), 0, + AOM_CDF8(5622, 7893, 
16093, 18233, 27809, 28373, 32533), }, { - AOM_ICDF(6912), AOM_ICDF(11512), AOM_ICDF(16390), AOM_ICDF(17479), - AOM_ICDF(19065), AOM_ICDF(19071), AOM_ICDF(19740), AOM_ICDF(21715), - AOM_ICDF(23208), AOM_ICDF(29132), AOM_ICDF(32768), 0, + AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), }, { - AOM_ICDF(6656), AOM_ICDF(11485), AOM_ICDF(16060), AOM_ICDF(17734), - AOM_ICDF(19099), AOM_ICDF(19814), AOM_ICDF(21018), AOM_ICDF(23053), - AOM_ICDF(24333), AOM_ICDF(27260), AOM_ICDF(32768), 0, + AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), }, - }, -#endif // CONFIG_SMOOTH_HV -}; -#endif // CONFIG_KF_CTX + }; -#if CONFIG_LPF_SB -static const aom_cdf_prob default_lpf_reuse_cdf[LPF_REUSE_CONTEXT][CDF_SIZE( - 2)] = { { AOM_ICDF(8192), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(32768), 0 } }; +static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)] = { + { { AOM_CDF2(19968) }, + { AOM_CDF2(19968) }, + { AOM_CDF2(24320) } }, + { { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(18677, 30848) } }, + { { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(24302, 25602) } }, + { { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(16803, 22759) } }, + }; -static const aom_cdf_prob - default_lpf_delta_cdf[LPF_DELTA_CONTEXT][CDF_SIZE(DELTA_RANGE)] = { - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), 
AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 } - }; +#define MAX_COLOR_CONTEXT_HASH 8 +// Negative values are invalid +static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + + 1] = { -1, -1, 0, -1, -1, + 4, 3, 2, 1 }; -static const aom_cdf_prob - default_lpf_sign_cdf[LPF_REUSE_CONTEXT][LPF_SIGN_CONTEXT][CDF_SIZE(2)] = { - { { AOM_ICDF(6554), AOM_ICDF(32768), 0 }, - { AOM_ICDF(26214), AOM_ICDF(32768), 0 } }, - { { AOM_ICDF(16384), AOM_ICDF(32768), 0 }, - { AOM_ICDF(16384), AOM_ICDF(32768), 0 } } - }; -#endif // CONFIG_LPF_SB +#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx) { + assert(palette_size <= PALETTE_MAX_SIZE); + assert(r > 0 || c > 0); + + // Get color indices of neighbors. + int color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; + color_neighbors[1] = + (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; + color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; + + // The +10 below should not be needed. 
But we get a warning "array subscript + // is above array bounds [-Werror=array-bounds]" without it, possibly due to + // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 + int scores[PALETTE_MAX_SIZE + 10] = { 0 }; + int i; + static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + if (color_neighbors[i] >= 0) { + scores[color_neighbors[i]] += weights[i]; + } + } + + int inverse_color_order[PALETTE_MAX_SIZE]; + for (i = 0; i < PALETTE_MAX_SIZE; ++i) { + color_order[i] = i; + inverse_color_order[i] = i; + } + + // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + int max = scores[i]; + int max_idx = i; + for (int j = i + 1; j < palette_size; ++j) { + if (scores[j] > max) { + max = scores[j]; + max_idx = j; + } + } + if (max_idx != i) { + // Move the score at index 'max_idx' to index 'i', and shift the scores + // from 'i' to 'max_idx - 1' by 1. + const int max_score = scores[max_idx]; + const uint8_t max_color_order = color_order[max_idx]; + for (int k = max_idx; k > i; --k) { + scores[k] = scores[k - 1]; + color_order[k] = color_order[k - 1]; + inverse_color_order[color_order[k]] = k; + } + scores[i] = max_score; + color_order[i] = max_color_order; + inverse_color_order[color_order[i]] = i; + } + } + + if (color_idx != NULL) + *color_idx = inverse_color_order[color_map[r * stride + c]]; + + // Get hash value of context. + int color_index_ctx_hash = 0; + static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + color_index_ctx_hash += scores[i] * hash_multipliers[i]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
+ const int color_index_ctx = + palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} +#undef NUM_PALETTE_NEIGHBORS +#undef MAX_COLOR_CONTEXT_HASH static void init_mode_probs(FRAME_CONTEXT *fc) { - av1_copy(fc->partition_prob, default_partition_probs); - av1_copy(fc->intra_inter_prob, default_intra_inter_p); - av1_copy(fc->comp_inter_prob, default_comp_inter_p); av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); -#if CONFIG_MRC_TX - av1_copy(fc->mrc_mask_inter_cdf, default_mrc_mask_inter_cdf); - av1_copy(fc->mrc_mask_intra_cdf, default_mrc_mask_intra_cdf); -#endif // CONFIG_MRC_TX -#if CONFIG_NEW_MULTISYMBOL + av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); -#endif // CONFIG_NEW_MULTISYMBOL -#if CONFIG_EXT_COMP_REFS - av1_copy(fc->comp_ref_type_prob, default_comp_ref_type_p); - av1_copy(fc->uni_comp_ref_prob, default_uni_comp_ref_p); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_EXT_COMP_REFS - av1_copy(fc->comp_ref_prob, default_comp_ref_p); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); -#endif -#if CONFIG_LV_MAP - av1_copy(fc->txb_skip, default_txb_skip); - av1_copy(fc->nz_map, default_nz_map); - av1_copy(fc->eob_flag, default_eob_flag); - av1_copy(fc->dc_sign, default_dc_sign); - av1_copy(fc->coeff_base, 
default_coeff_base); - av1_copy(fc->coeff_lps, default_coeff_lps); -#if BR_NODE - av1_copy(fc->coeff_br, default_coeff_br); -#endif -#if CONFIG_CTX1D - av1_copy(fc->eob_mode, default_eob_mode); - av1_copy(fc->empty_line, default_empty_line); - av1_copy(fc->hv_eob, default_hv_eob); -#endif // CONFIG_CTX1D - -#if LV_MAP_PROB - av1_init_txb_probs(fc); -#endif // LV_MAP_PROB -#endif -#if CONFIG_EXT_REFS - av1_copy(fc->comp_bwdref_prob, default_comp_bwdref_p); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); -#endif -#endif // CONFIG_EXT_REFS - av1_copy(fc->single_ref_prob, default_single_ref_p); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->single_ref_cdf, default_single_ref_cdf); -#endif -#if CONFIG_COMPOUND_SINGLEREF - av1_copy(fc->comp_inter_mode_prob, default_comp_inter_mode_p); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - fc->quarter_tx_size_prob = default_quarter_tx_size_prob; -#if CONFIG_NEW_MULTISYMBOL - av1_copy(fc->quarter_tx_size_cdf, default_quarter_tx_size_cdf); -#endif // CONFIG_NEW_MULTISYMBOL -#endif -#if CONFIG_VAR_TX - av1_copy(fc->txfm_partition_prob, default_txfm_partition_probs); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); -#endif -#endif - av1_copy(fc->skip_probs, default_skip_probs); - av1_copy(fc->newmv_prob, default_newmv_prob); - av1_copy(fc->zeromv_prob, default_zeromv_prob); - av1_copy(fc->refmv_prob, default_refmv_prob); - av1_copy(fc->drl_prob, default_drl_prob); -#if CONFIG_NEW_MULTISYMBOL + av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); + av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); av1_copy(fc->newmv_cdf, default_newmv_cdf); av1_copy(fc->zeromv_cdf, default_zeromv_cdf); av1_copy(fc->refmv_cdf, default_refmv_cdf); av1_copy(fc->drl_cdf, default_drl_cdf); -#endif -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - av1_copy(fc->motion_mode_prob, default_motion_mode_prob); 
av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - av1_copy(fc->ncobmc_mode_prob, default_ncobmc_mode_prob); - av1_copy(fc->ncobmc_mode_cdf, default_ncobmc_mode_cdf); -#endif -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - av1_copy(fc->obmc_prob, default_obmc_prob); -#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT av1_copy(fc->obmc_cdf, default_obmc_cdf); -#endif -#if CONFIG_NCOBMC_ADAPT_WEIGHT - av1_copy(fc->ncobmc_prob, default_ncobmc_prob); - av1_copy(fc->ncobmc_cdf, default_ncobmc_cdf); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - av1_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs); av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); -#if CONFIG_COMPOUND_SINGLEREF - av1_copy(fc->inter_singleref_comp_mode_probs, - default_inter_singleref_comp_mode_probs); - av1_copy(fc->inter_singleref_comp_mode_cdf, - default_inter_singleref_comp_mode_cdf); -#endif // CONFIG_COMPOUND_SINGLEREF - av1_copy(fc->compound_type_prob, default_compound_type_probs); -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT av1_copy(fc->compound_type_cdf, default_compound_type_cdf); -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_INTERINTRA - av1_copy(fc->interintra_prob, default_interintra_prob); - av1_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob); -#if CONFIG_NEW_MULTISYMBOL + av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); av1_copy(fc->interintra_cdf, default_interintra_cdf); av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); -#endif // CONFIG_NEW_MULTISYMBOL - av1_copy(fc->interintra_mode_prob, default_interintra_mode_prob); av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); -#endif // CONFIG_INTERINTRA -#if CONFIG_SUPERTX - av1_copy(fc->supertx_prob, default_supertx_prob); -#endif // CONFIG_SUPERTX - 
av1_copy(fc->seg.tree_probs, default_segment_tree_probs); - av1_copy(fc->seg.pred_probs, default_segment_pred_probs); -#if CONFIG_NEW_MULTISYMBOL av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); -#endif -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - av1_copy(fc->intra_filter_probs, default_intra_filter_probs); -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - av1_copy(fc->filter_intra_probs, default_filter_intra_probs); -#endif // CONFIG_FILTER_INTRA -#if CONFIG_LGT_FROM_PRED - av1_copy(fc->intra_lgt_prob, default_intra_lgt_prob); - av1_copy(fc->inter_lgt_prob, default_inter_lgt_prob); -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_LOOP_RESTORATION - av1_copy(fc->switchable_restore_prob, default_switchable_restore_prob); -#endif // CONFIG_LOOP_RESTORATION + av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf); + av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); + av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); + av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); + av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); + av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); av1_copy(fc->partition_cdf, default_partition_cdf); av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); -#if CONFIG_NEW_MULTISYMBOL + av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); av1_copy(fc->skip_cdfs, default_skip_cdfs); av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); -#endif -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - av1_copy(fc->intra_filter_cdf, default_intra_filter_cdf); -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf); + for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) + 
av1_copy(fc->seg.spatial_pred_seg_cdf[i], + default_spatial_pred_seg_tree_cdf[i]); av1_copy(fc->tx_size_cdf, default_tx_size_cdf); - av1_copy(fc->delta_q_prob, default_delta_q_probs); av1_copy(fc->delta_q_cdf, default_delta_q_cdf); -#if CONFIG_EXT_DELTA_Q - av1_copy(fc->delta_lf_prob, default_delta_lf_probs); av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); -#if CONFIG_LOOPFILTER_LEVEL av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif -#if CONFIG_CFL av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf); -#endif -#if CONFIG_INTRABC av1_copy(fc->intrabc_cdf, default_intrabc_cdf); -#endif -#if CONFIG_LPF_SB - av1_copy(fc->lpf_reuse_cdf, default_lpf_reuse_cdf); - av1_copy(fc->lpf_delta_cdf, default_lpf_delta_cdf); - av1_copy(fc->lpf_sign_cdf, default_lpf_sign_cdf); -#endif // CONFIG_LPF_SB } -void av1_adapt_inter_frame_probs(AV1_COMMON *cm) { - int i, j; - FRAME_CONTEXT *fc = cm->fc; - const FRAME_CONTEXT *pre_fc = cm->pre_fc; - const FRAME_COUNTS *counts = &cm->counts; - - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = av1_mode_mv_merge_probs( - pre_fc->intra_inter_prob[i], counts->intra_inter[i]); - - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = av1_mode_mv_merge_probs(pre_fc->comp_inter_prob[i], - counts->comp_inter[i]); - -#if CONFIG_EXT_COMP_REFS - for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++) - fc->comp_ref_type_prob[i] = av1_mode_mv_merge_probs( - pre_fc->comp_ref_type_prob[i], counts->comp_ref_type[i]); - - for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++) - for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++) - fc->uni_comp_ref_prob[i][j] = av1_mode_mv_merge_probs( - pre_fc->uni_comp_ref_prob[i][j], counts->uni_comp_ref[i][j]); -#endif // CONFIG_EXT_COMP_REFS - -#if CONFIG_EXT_REFS - for (i = 0; i < REF_CONTEXTS; i++) - for (j = 0; j < (FWD_REFS - 1); j++) - fc->comp_ref_prob[i][j] = 
mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j], - counts->comp_ref[i][j]); - for (i = 0; i < REF_CONTEXTS; i++) - for (j = 0; j < (BWD_REFS - 1); j++) - fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs( - pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]); -#else - for (i = 0; i < REF_CONTEXTS; i++) - for (j = 0; j < (COMP_REFS - 1); j++) - fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j], - counts->comp_ref[i][j]); -#endif // CONFIG_EXT_REFS - - for (i = 0; i < REF_CONTEXTS; i++) - for (j = 0; j < (SINGLE_REFS - 1); j++) - fc->single_ref_prob[i][j] = av1_mode_mv_merge_probs( - pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); - -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++) - fc->comp_inter_mode_prob[i] = av1_mode_mv_merge_probs( - pre_fc->comp_inter_mode_prob[i], counts->comp_inter_mode[i]); - -#endif // CONFIG_COMPOUND_SINGLEREF - - for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) - fc->newmv_prob[i] = - av1_mode_mv_merge_probs(pre_fc->newmv_prob[i], counts->newmv_mode[i]); - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) - fc->zeromv_prob[i] = - av1_mode_mv_merge_probs(pre_fc->zeromv_prob[i], counts->zeromv_mode[i]); - for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) - fc->refmv_prob[i] = - av1_mode_mv_merge_probs(pre_fc->refmv_prob[i], counts->refmv_mode[i]); - - for (i = 0; i < DRL_MODE_CONTEXTS; ++i) - fc->drl_prob[i] = - av1_mode_mv_merge_probs(pre_fc->drl_prob[i], counts->drl_mode[i]); - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i) - aom_tree_merge_probs(av1_motion_mode_tree, pre_fc->motion_mode_prob[i], - counts->motion_mode[i], fc->motion_mode_prob[i]); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - for (i = 0; i < ADAPT_OVERLAP_BLOCKS; ++i) - aom_tree_merge_probs(av1_ncobmc_mode_tree, pre_fc->ncobmc_mode_prob[i], - counts->ncobmc_mode[i], fc->ncobmc_mode_prob[i]); -#if CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i) - 
aom_tree_merge_probs(av1_ncobmc_tree, pre_fc->ncobmc_prob[i], - counts->ncobmc[i], fc->ncobmc_prob[i]); -#endif -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i) - fc->obmc_prob[i] = - av1_mode_mv_merge_probs(pre_fc->obmc_prob[i], counts->obmc[i]); -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - -#if CONFIG_SUPERTX - for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - fc->supertx_prob[i][j] = av1_mode_mv_merge_probs( - pre_fc->supertx_prob[i][j], counts->supertx[i][j]); - } - } -#endif // CONFIG_SUPERTX - - for (i = 0; i < INTER_MODE_CONTEXTS; i++) - aom_tree_merge_probs( - av1_inter_compound_mode_tree, pre_fc->inter_compound_mode_probs[i], - counts->inter_compound_mode[i], fc->inter_compound_mode_probs[i]); -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < INTER_MODE_CONTEXTS; i++) - aom_tree_merge_probs(av1_inter_singleref_comp_mode_tree, - pre_fc->inter_singleref_comp_mode_probs[i], - counts->inter_singleref_comp_mode[i], - fc->inter_singleref_comp_mode_probs[i]); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA - if (cm->allow_interintra_compound) { - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { - if (is_interintra_allowed_bsize_group(i)) - fc->interintra_prob[i] = av1_mode_mv_merge_probs( - pre_fc->interintra_prob[i], counts->interintra[i]); - } - for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { - aom_tree_merge_probs( - av1_interintra_mode_tree, pre_fc->interintra_mode_prob[i], - counts->interintra_mode[i], fc->interintra_mode_prob[i]); - } -#if CONFIG_WEDGE - for (i = 0; i < BLOCK_SIZES_ALL; ++i) { - if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) - fc->wedge_interintra_prob[i] = av1_mode_mv_merge_probs( - pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]); - } -#endif // CONFIG_WEDGE - } -#endif // CONFIG_INTERINTRA - -#if CONFIG_COMPOUND_SEGMENT 
|| CONFIG_WEDGE - if (cm->allow_masked_compound) { - for (i = 0; i < BLOCK_SIZES_ALL; ++i) { - aom_tree_merge_probs( - av1_compound_type_tree, pre_fc->compound_type_prob[i], - counts->compound_interinter[i], fc->compound_type_prob[i]); - } - } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +void av1_set_default_ref_deltas(int8_t *ref_deltas) { + assert(ref_deltas != NULL); + + ref_deltas[INTRA_FRAME] = 1; + ref_deltas[LAST_FRAME] = 0; + ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[GOLDEN_FRAME] = -1; + ref_deltas[ALTREF2_FRAME] = -1; + ref_deltas[ALTREF_FRAME] = -1; } -void av1_adapt_intra_frame_probs(AV1_COMMON *cm) { - int i; - FRAME_CONTEXT *fc = cm->fc; - const FRAME_CONTEXT *pre_fc = cm->pre_fc; - const FRAME_COUNTS *counts = &cm->counts; - - if (cm->tx_mode == TX_MODE_SELECT) { -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - fc->quarter_tx_size_prob = av1_mode_mv_merge_probs( - pre_fc->quarter_tx_size_prob, counts->quarter_tx_size); -#endif - } - -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT) { - for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) - fc->txfm_partition_prob[i] = av1_mode_mv_merge_probs( - pre_fc->txfm_partition_prob[i], counts->txfm_partition[i]); - } -#endif - - for (i = 0; i < SKIP_CONTEXTS; ++i) - fc->skip_probs[i] = - av1_mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]); - -#if CONFIG_LGT_FROM_PRED - int j; - if (LGT_FROM_PRED_INTRA) { - for (i = TX_4X4; i < LGT_SIZES; ++i) { - for (j = 0; j < INTRA_MODES; ++j) - fc->intra_lgt_prob[i][j] = av1_mode_mv_merge_probs( - pre_fc->intra_lgt_prob[i][j], counts->intra_lgt[i][j]); - } - } - if (LGT_FROM_PRED_INTER) { - for (i = TX_4X4; i < LGT_SIZES; ++i) { - fc->inter_lgt_prob[i] = av1_mode_mv_merge_probs(pre_fc->inter_lgt_prob[i], - counts->inter_lgt[i]); - } - } -#endif // CONFIG_LGT_FROM_PRED - - if (cm->seg.temporal_update) { - for (i = 0; i 
< PREDICTION_PROBS; i++) - fc->seg.pred_probs[i] = av1_mode_mv_merge_probs(pre_fc->seg.pred_probs[i], - counts->seg.pred[i]); - - aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs, - counts->seg.tree_mispred, fc->seg.tree_probs); - } else { - aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs, - counts->seg.tree_total, fc->seg.tree_probs); - } +void av1_set_default_mode_deltas(int8_t *mode_deltas) { + assert(mode_deltas != NULL); -#if CONFIG_EXT_PARTITION_TYPES - for (i = 0; i < PARTITION_PLOFFSET; ++i) - aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i], - counts->partition[i], fc->partition_prob[i]); - for (; i < PARTITION_CONTEXTS_PRIMARY; ++i) - aom_tree_merge_probs(av1_ext_partition_tree, pre_fc->partition_prob[i], - counts->partition[i], fc->partition_prob[i]); -#else - for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) { - aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i], - counts->partition[i], fc->partition_prob[i]); - } -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_UNPOISON_PARTITION_CTX - for (i = PARTITION_CONTEXTS_PRIMARY; - i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { - unsigned int ct[2] = { counts->partition[i][PARTITION_VERT], - counts->partition[i][PARTITION_SPLIT] }; - assert(counts->partition[i][PARTITION_NONE] == 0); - assert(counts->partition[i][PARTITION_HORZ] == 0); - assert(fc->partition_prob[i][PARTITION_NONE] == 0); - assert(fc->partition_prob[i][PARTITION_HORZ] == 0); - fc->partition_prob[i][PARTITION_VERT] = - av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_VERT], ct); - } - for (i = PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; - i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { - unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ], - counts->partition[i][PARTITION_SPLIT] }; - assert(counts->partition[i][PARTITION_NONE] == 0); - assert(counts->partition[i][PARTITION_VERT] == 0); - 
assert(fc->partition_prob[i][PARTITION_NONE] == 0); - assert(fc->partition_prob[i][PARTITION_VERT] == 0); - fc->partition_prob[i][PARTITION_HORZ] = - av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_HORZ], ct); - } -#endif - for (i = 0; i < DELTA_Q_PROBS; ++i) - fc->delta_q_prob[i] = - mode_mv_merge_probs(pre_fc->delta_q_prob[i], counts->delta_q[i]); -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - for (i = 0; i < FRAME_LF_COUNT; ++i) - for (int j = 0; j < DELTA_LF_PROBS; ++j) - fc->delta_lf_multi_prob[i][j] = mode_mv_merge_probs( - pre_fc->delta_lf_multi_prob[i][j], counts->delta_lf_multi[i][j]); -#endif // CONFIG_LOOPFILTER_LEVEL - for (i = 0; i < DELTA_LF_PROBS; ++i) - fc->delta_lf_prob[i] = - mode_mv_merge_probs(pre_fc->delta_lf_prob[i], counts->delta_lf[i]); -#endif // CONFIG_EXT_DELTA_Q -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - for (i = 0; i < INTRA_FILTERS + 1; ++i) { - aom_tree_merge_probs(av1_intra_filter_tree, pre_fc->intra_filter_probs[i], - counts->intra_filter[i], fc->intra_filter_probs[i]); - } -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - for (i = 0; i < PLANE_TYPES; ++i) { - fc->filter_intra_probs[i] = av1_mode_mv_merge_probs( - pre_fc->filter_intra_probs[i], counts->filter_intra[i]); - } -#endif // CONFIG_FILTER_INTRA + mode_deltas[0] = 0; + mode_deltas[1] = 0; } static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_ref_delta_enabled = 1; lf->mode_ref_delta_update = 1; - lf->ref_deltas[INTRA_FRAME] = 1; - lf->ref_deltas[LAST_FRAME] = 0; -#if CONFIG_EXT_REFS - lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME]; - lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME]; - lf->ref_deltas[BWDREF_FRAME] = lf->ref_deltas[LAST_FRAME]; -#endif // CONFIG_EXT_REFS - lf->ref_deltas[GOLDEN_FRAME] = -1; -#if CONFIG_EXT_REFS - lf->ref_deltas[ALTREF2_FRAME] = -1; -#endif // CONFIG_EXT_REFS - lf->ref_deltas[ALTREF_FRAME] = -1; - - lf->mode_deltas[0] = 0; - lf->mode_deltas[1] = 0; 
+ av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); +} - av1_copy(lf->last_ref_deltas, lf->ref_deltas); - av1_copy(lf->last_mode_deltas, lf->mode_deltas); +void av1_setup_frame_contexts(AV1_COMMON *cm) { + // Store the frame context into a special slot (not associated with any + // reference buffer), so that we can set up cm->pre_fc correctly later + // This function must ONLY be called when cm->fc has been initialized with + // default probs, either by av1_setup_past_independence or after manually + // initializing them + cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc; + if (cm->large_scale_tile) { + for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; + } } void av1_setup_past_independence(AV1_COMMON *cm) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). - struct loopfilter *const lf = &cm->lf; - - int i; av1_clearall_segfeatures(&cm->seg); - cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) - memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + cm->current_frame_seg_map = cm->cur_frame->seg_map; if (cm->current_frame_seg_map) memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); - // Reset the mode ref deltas for loop filter - av1_zero(lf->last_ref_deltas); - av1_zero(lf->last_mode_deltas); - set_default_lf_deltas(lf); - - // To force update of the sharpness - lf->last_sharpness_level = -1; + // reset mode ref deltas + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + set_default_lf_deltas(&cm->lf); av1_default_coef_probs(cm); init_mode_probs(cm->fc); av1_init_mv_probs(cm); -#if CONFIG_LV_MAP av1_init_lv_map(cm); -#endif -#if CONFIG_PVQ - av1_default_pvq_probs(cm); -#endif // CONFIG_PVQ -#if CONFIG_ADAPT_SCAN - av1_init_scan_order(cm); -#endif - av1_convolve_init(cm); cm->fc->initialized = 
1; - -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (cm->frame_type == KEY_FRAME) { - // Reset all frame contexts, as all reference frames will be lost. - for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; - } -#else - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) { - // Reset all frame contexts. - for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; - } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - // Reset the frame context of the first specified ref frame. - if (cm->frame_refs[0].idx >= 0) { - cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc; - } -#else - // Reset only the frame context specified in the frame header. - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING - } -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + av1_setup_frame_contexts(cm); // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, - cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->frame_context_idx = 0; -#endif // !CONFIG_NO_FRAME_CONTEXT_SIGNALING + cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip)); } diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h index 3452241b0..0bd2e20a1 100644 --- a/third_party/aom/av1/common/entropymode.h +++ b/third_party/aom/av1/common/entropymode.h @@ -18,25 +18,16 @@ #include "av1/common/seg_common.h" #include "aom_dsp/aom_filter.h" -#if CONFIG_PVQ -#include "av1/common/pvq.h" -#include "av1/common/pvq_state.h" -#include "av1/common/generic_code.h" -#endif // CONFIG_PVQ - #ifdef __cplusplus extern "C" { #endif #define BLOCK_SIZE_GROUPS 4 -#define TX_SIZE_CONTEXTS 2 +#define TX_SIZE_CONTEXTS 3 #define INTER_OFFSET(mode) ((mode)-NEARESTMV) -#if CONFIG_COMPOUND_SINGLEREF -#define INTER_SINGLEREF_COMP_OFFSET(mode) ((mode)-SR_NEAREST_NEARMV) -#endif // CONFIG_COMPOUND_SINGLEREF -#define INTER_COMPOUND_OFFSET(mode) ((mode)-NEAREST_NEARESTMV) +#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) // Number of possible contexts for a color index. // As can be seen from av1_get_palette_color_index_context(), the possible @@ -44,14 +35,6 @@ extern "C" { // a value from 0 to 4 using 'palette_color_index_context_lookup' table. #define PALETTE_COLOR_INDEX_CONTEXTS 5 -// Maximum number of colors in a palette. -#define PALETTE_MAX_SIZE 8 -// Minimum number of colors in a palette. -#define PALETTE_MIN_SIZE 2 - -// Palette mode is available for block sizes >= 8x8. -#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1) - // Palette Y mode context for a block is determined by number of neighboring // blocks (top and/or left) using a palette for Y plane. 
So, possible Y mode' // context values are: @@ -66,11 +49,14 @@ extern "C" { // 1 if this block uses palette for Y plane (i.e. Y palette size > 0). #define PALETTE_UV_MODE_CONTEXTS 2 -#define PALETTE_MAX_BLOCK_SIZE (64 * 64) +// Map the number of pixels in a block size to a context +// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0 +// 128(BLOCK_8X16, BLOCK_16x8) -> 1 +// ... +// 4096(BLOCK_64X64) -> 6 +#define PALATTE_BSIZE_CTXS 7 -#if CONFIG_KF_CTX #define KF_MODE_CONTEXTS 5 -#endif struct AV1Common; @@ -80,643 +66,128 @@ typedef struct { const int16_t *neighbors; } SCAN_ORDER; -struct seg_counts { - unsigned int tree_total[MAX_SEGMENTS]; - unsigned int tree_mispred[MAX_SEGMENTS]; - unsigned int pred[PREDICTION_PROBS][2]; -}; - typedef struct frame_contexts { - aom_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; - aom_prob uv_mode_prob[INTRA_MODES][UV_INTRA_MODES - 1]; -#if CONFIG_EXT_PARTITION_TYPES - aom_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1]; -#else - aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; -#endif - coeff_cdf_model coef_tail_cdfs[TX_SIZES][PLANE_TYPES]; - coeff_cdf_model coef_head_cdfs[TX_SIZES][PLANE_TYPES]; -#if CONFIG_ADAPT_SCAN -// TODO(angiebird): try aom_prob -#if CONFIG_CHROMA_2X2 - uint32_t non_zero_prob_2x2[TX_TYPES][4]; -#endif - uint32_t non_zero_prob_4X4[TX_TYPES][16]; - uint32_t non_zero_prob_8X8[TX_TYPES][64]; - uint32_t non_zero_prob_16X16[TX_TYPES][256]; - uint32_t non_zero_prob_32X32[TX_TYPES][1024]; - - uint32_t non_zero_prob_4X8[TX_TYPES][32]; - uint32_t non_zero_prob_8X4[TX_TYPES][32]; - uint32_t non_zero_prob_16X8[TX_TYPES][128]; - uint32_t non_zero_prob_8X16[TX_TYPES][128]; - uint32_t non_zero_prob_32X16[TX_TYPES][512]; - uint32_t non_zero_prob_16X32[TX_TYPES][512]; - -#if CONFIG_CHROMA_2X2 - DECLARE_ALIGNED(16, int16_t, scan_2x2[TX_TYPES][4]); -#endif - DECLARE_ALIGNED(16, int16_t, scan_4X4[TX_TYPES][16]); - DECLARE_ALIGNED(16, int16_t, scan_8X8[TX_TYPES][64]); - DECLARE_ALIGNED(16, 
int16_t, scan_16X16[TX_TYPES][256]); - DECLARE_ALIGNED(16, int16_t, scan_32X32[TX_TYPES][1024]); - - DECLARE_ALIGNED(16, int16_t, scan_4X8[TX_TYPES][32]); - DECLARE_ALIGNED(16, int16_t, scan_8X4[TX_TYPES][32]); - DECLARE_ALIGNED(16, int16_t, scan_8X16[TX_TYPES][128]); - DECLARE_ALIGNED(16, int16_t, scan_16X8[TX_TYPES][128]); - DECLARE_ALIGNED(16, int16_t, scan_16X32[TX_TYPES][512]); - DECLARE_ALIGNED(16, int16_t, scan_32X16[TX_TYPES][512]); - -#if CONFIG_CHROMA_2X2 - DECLARE_ALIGNED(16, int16_t, iscan_2x2[TX_TYPES][4]); -#endif - DECLARE_ALIGNED(16, int16_t, iscan_4X4[TX_TYPES][16]); - DECLARE_ALIGNED(16, int16_t, iscan_8X8[TX_TYPES][64]); - DECLARE_ALIGNED(16, int16_t, iscan_16X16[TX_TYPES][256]); - DECLARE_ALIGNED(16, int16_t, iscan_32X32[TX_TYPES][1024]); - - DECLARE_ALIGNED(16, int16_t, iscan_4X8[TX_TYPES][32]); - DECLARE_ALIGNED(16, int16_t, iscan_8X4[TX_TYPES][32]); - DECLARE_ALIGNED(16, int16_t, iscan_8X16[TX_TYPES][128]); - DECLARE_ALIGNED(16, int16_t, iscan_16X8[TX_TYPES][128]); - DECLARE_ALIGNED(16, int16_t, iscan_16X32[TX_TYPES][512]); - DECLARE_ALIGNED(16, int16_t, iscan_32X16[TX_TYPES][512]); - -#if CONFIG_CHROMA_2X2 - int16_t nb_2x2[TX_TYPES][(4 + 1) * 2]; -#endif - int16_t nb_4X4[TX_TYPES][(16 + 1) * 2]; - int16_t nb_8X8[TX_TYPES][(64 + 1) * 2]; - int16_t nb_16X16[TX_TYPES][(256 + 1) * 2]; - int16_t nb_32X32[TX_TYPES][(1024 + 1) * 2]; - - int16_t nb_4X8[TX_TYPES][(32 + 1) * 2]; - int16_t nb_8X4[TX_TYPES][(32 + 1) * 2]; - int16_t nb_8X16[TX_TYPES][(128 + 1) * 2]; - int16_t nb_16X8[TX_TYPES][(128 + 1) * 2]; - int16_t nb_16X32[TX_TYPES][(512 + 1) * 2]; - int16_t nb_32X16[TX_TYPES][(512 + 1) * 2]; - - SCAN_ORDER sc[TX_SIZES_ALL][TX_TYPES]; - - int16_t eob_threshold[TX_SIZES_ALL][TX_TYPES][EOB_THRESHOLD_NUM]; -#endif // CONFIG_ADAPT_SCAN - -#if CONFIG_LV_MAP - aom_prob txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS]; - aom_prob nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]; - aom_prob eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]; - aom_prob 
dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS]; - aom_prob coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS] - [COEFF_BASE_CONTEXTS]; - aom_prob coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]; -#if BR_NODE - aom_prob coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS]; -#endif -#if CONFIG_CTX1D - aom_prob eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES]; - aom_prob empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES][EMPTY_LINE_CONTEXTS]; - aom_prob hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS]; -#endif // CONFIG_CTX1D - -#if LV_MAP_PROB aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; - aom_cdf_prob nz_map_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] - [CDF_SIZE(2)]; - aom_cdf_prob eob_flag_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] - [CDF_SIZE(2)]; - aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; - aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS] - [COEFF_BASE_CONTEXTS][CDF_SIZE(2)]; - aom_cdf_prob coeff_lps_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] [CDF_SIZE(2)]; -#if BR_NODE - aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS] - [LEVEL_CONTEXTS][CDF_SIZE(2)]; -#endif -#if CONFIG_CTX1D - aom_cdf_prob eob_mode_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES][CDF_SIZE(2)]; - aom_cdf_prob empty_line_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES] - [EMPTY_LINE_CONTEXTS][CDF_SIZE(2)]; - aom_cdf_prob hv_eob_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS] - [CDF_SIZE(2)]; -#endif // CONFIG_CTX1D -#endif // LV_MAP_PROB -#endif + aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; + aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; + aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; + aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; + aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; + aom_cdf_prob 
eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; + aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; + aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] + [CDF_SIZE(3)]; + aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(4)]; + aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)]; - aom_prob newmv_prob[NEWMV_MODE_CONTEXTS]; - aom_prob zeromv_prob[ZEROMV_MODE_CONTEXTS]; - aom_prob refmv_prob[REFMV_MODE_CONTEXTS]; - aom_prob drl_prob[DRL_MODE_CONTEXTS]; -#if CONFIG_NEW_MULTISYMBOL aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; - aom_cdf_prob zeromv_cdf[ZEROMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; -#endif - aom_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS] - [INTER_COMPOUND_MODES - 1]; aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] [CDF_SIZE(INTER_COMPOUND_MODES)]; -#if CONFIG_COMPOUND_SINGLEREF - aom_prob inter_singleref_comp_mode_probs[INTER_MODE_CONTEXTS] - [INTER_SINGLEREF_COMP_MODES - 1]; - aom_cdf_prob inter_singleref_comp_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( - INTER_SINGLEREF_COMP_MODES)]; -#endif // CONFIG_COMPOUND_SINGLEREF - aom_prob compound_type_prob[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)]; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_INTERINTRA - aom_prob interintra_prob[BLOCK_SIZE_GROUPS]; - aom_prob wedge_interintra_prob[BLOCK_SIZES_ALL]; - aom_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1]; -#if CONFIG_NEW_MULTISYMBOL + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]; + aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; aom_cdf_prob 
interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; -#endif aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] [CDF_SIZE(INTERINTRA_MODES)]; -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - aom_prob motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1]; aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - aom_prob ncobmc_mode_prob[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES - 1]; - aom_cdf_prob ncobmc_mode_cdf[ADAPT_OVERLAP_BLOCKS] - [CDF_SIZE(MAX_NCOBMC_MODES)]; -#endif -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT - aom_prob ncobmc_prob[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES - 1]; - aom_cdf_prob ncobmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(OBMC_FAMILY_MODES)]; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - aom_prob obmc_prob[BLOCK_SIZES_ALL]; -#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - aom_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - aom_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - aom_cdf_prob palette_y_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE(PALETTE_SIZES)]; - aom_cdf_prob palette_uv_size_cdf[PALETTE_BLOCK_SIZES] - [CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [CDF_SIZE(PALETTE_COLORS)]; aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [CDF_SIZE(PALETTE_COLORS)]; -#if CONFIG_MRC_TX - aom_cdf_prob mrc_mask_inter_cdf[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [CDF_SIZE(PALETTE_COLORS)]; - aom_cdf_prob 
mrc_mask_intra_cdf[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [CDF_SIZE(PALETTE_COLORS)]; -#endif // CONFIG_MRC_TX -#if CONFIG_NEW_MULTISYMBOL - aom_cdf_prob palette_y_mode_cdf[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] + aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] [CDF_SIZE(2)]; aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; -#endif -#if CONFIG_EXT_COMP_REFS - aom_prob comp_ref_type_prob[COMP_REF_TYPE_CONTEXTS]; - aom_prob uni_comp_ref_prob[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]; -#if CONFIG_NEW_MULTISYMBOL aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] [CDF_SIZE(2)]; -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_EXT_COMP_REFS - aom_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS - 1]; -#if CONFIG_EXT_REFS - aom_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS - 1]; - aom_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS - 1]; -#else - aom_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS - 1]; -#endif // CONFIG_EXT_REFS -#if CONFIG_NEW_MULTISYMBOL -#if CONFIG_EXT_REFS aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; -#else - aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][COMP_REFS - 1][CDF_SIZE(2)]; -#endif // CONFIG_EXT_REFS -#endif -#if CONFIG_COMPOUND_SINGLEREF - aom_prob comp_inter_mode_prob[COMP_INTER_MODE_CONTEXTS]; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - aom_prob quarter_tx_size_prob; -#if CONFIG_NEW_MULTISYMBOL - aom_cdf_prob quarter_tx_size_cdf[CDF_SIZE(2)]; -#endif -#endif -#if CONFIG_VAR_TX - aom_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS]; -#if CONFIG_NEW_MULTISYMBOL aom_cdf_prob 
txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; -#endif -#endif // CONFIG_VAR_TX - aom_prob skip_probs[SKIP_CONTEXTS]; -#if CONFIG_NEW_MULTISYMBOL + aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; -#endif - nmv_context nmvc[NMV_CONTEXTS]; -#if CONFIG_INTRABC + nmv_context nmvc; nmv_context ndvc; aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; -#endif - int initialized; -#if CONFIG_SUPERTX - aom_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES]; -#endif // CONFIG_SUPERTX struct segmentation_probs seg; -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - aom_prob intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - aom_prob filter_intra_probs[PLANE_TYPES]; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_LOOP_RESTORATION - aom_prob switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1]; -#endif // CONFIG_LOOP_RESTORATION + aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; + aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; + aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; - aom_cdf_prob uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)]; -#if CONFIG_EXT_PARTITION_TYPES + aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] + [CDF_SIZE(UV_INTRA_MODES)]; aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; -#else - aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)]; -#endif aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] 
[CDF_SIZE(SWITCHABLE_FILTERS)]; -/* kf_y_cdf is discarded after use, so does not require persistent storage. - However, we keep it with the other CDFs in this struct since it needs to - be copied to each tile to support parallelism just like the others. -*/ -#if CONFIG_KF_CTX + /* kf_y_cdf is discarded after use, so does not require persistent storage. + However, we keep it with the other CDFs in this struct since it needs to + be copied to each tile to support parallelism just like the others. + */ aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] [CDF_SIZE(INTRA_MODES)]; -#else - aom_cdf_prob kf_y_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(INTRA_MODES)]; -#endif - aom_cdf_prob tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS] + + aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] + [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; + + aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] [CDF_SIZE(MAX_TX_DEPTH + 1)]; aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; -#endif // CONFIG_LOOPFILTER_LEVEL aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; -#endif -#if CONFIG_EXT_TX aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [CDF_SIZE(TX_TYPES)]; aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] [CDF_SIZE(TX_TYPES)]; -#else - aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)]; - aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]; -#endif // CONFIG_EXT_TX -#if CONFIG_LGT_FROM_PRED - aom_prob intra_lgt_prob[LGT_SIZES][INTRA_MODES]; - aom_prob inter_lgt_prob[LGT_SIZES]; -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - aom_cdf_prob intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)]; -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - aom_prob delta_q_prob[DELTA_Q_PROBS]; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - 
aom_prob delta_lf_multi_prob[FRAME_LF_COUNT][DELTA_LF_PROBS]; -#endif // CONFIG_LOOPFILTER_LEVEL - aom_prob delta_lf_prob[DELTA_LF_PROBS]; -#endif -#if CONFIG_PVQ - // TODO(any): If PVQ is enabled, most of coefficient related cdf, - // such as coef_cdfs[], coef_tail_cdfs[], and coef_heaf_cdfs[] can be removed. - od_adapt_ctx pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_CFL aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; -#endif -#if CONFIG_LPF_SB - aom_cdf_prob lpf_reuse_cdf[LPF_REUSE_CONTEXT][CDF_SIZE(2)]; - aom_cdf_prob lpf_delta_cdf[LPF_DELTA_CONTEXT][CDF_SIZE(DELTA_RANGE)]; - aom_cdf_prob lpf_sign_cdf[LPF_REUSE_CONTEXT][LPF_SIGN_CONTEXT][CDF_SIZE(2)]; -#endif // CONFIG_LPF_SB + int initialized; } FRAME_CONTEXT; -typedef struct FRAME_COUNTS { -// Note: This structure should only contain 'unsigned int' fields, or -// aggregates built solely from 'unsigned int' fields/elements -#if CONFIG_ENTROPY_STATS - unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES]; - unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; - unsigned int uv_mode[INTRA_MODES][UV_INTRA_MODES]; -#endif // CONFIG_ENTROPY_STATS -#if CONFIG_EXT_PARTITION_TYPES - unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; -#else - unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; -#endif - unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS]; -#if CONFIG_ADAPT_SCAN -#if CONFIG_CHROMA_2X2 - unsigned int non_zero_count_2x2[TX_TYPES][4]; -#endif // CONFIG_CHROMA_2X2 - unsigned int non_zero_count_4X4[TX_TYPES][16]; - unsigned int non_zero_count_8X8[TX_TYPES][64]; - unsigned int non_zero_count_16X16[TX_TYPES][256]; - unsigned int non_zero_count_32X32[TX_TYPES][1024]; - - unsigned int non_zero_count_4x8[TX_TYPES][32]; - unsigned int non_zero_count_8x4[TX_TYPES][32]; - unsigned int non_zero_count_8x16[TX_TYPES][128]; - unsigned int non_zero_count_16x8[TX_TYPES][128]; - 
unsigned int non_zero_count_16x32[TX_TYPES][512]; - unsigned int non_zero_count_32x16[TX_TYPES][512]; - - unsigned int txb_count[TX_SIZES_ALL][TX_TYPES]; -#endif // CONFIG_ADAPT_SCAN - -#if CONFIG_LV_MAP - unsigned int txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS][2]; - unsigned int nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS][2]; - unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; - unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; - unsigned int coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS] - [COEFF_BASE_CONTEXTS][2]; - unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS][2]; - unsigned int coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS] - [2]; -#if CONFIG_CTX1D - unsigned int eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES][2]; - unsigned int empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES] - [EMPTY_LINE_CONTEXTS][2]; - unsigned int hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS][2]; -#endif // CONFIG_CTX1D -#endif // CONFIG_LV_MAP - -#if CONFIG_SYMBOLRATE - unsigned int coeff_num[2]; // 0: zero coeff 1: non-zero coeff - unsigned int symbol_num[2]; // 0: entropy symbol 1: non-entropy symbol -#endif - - unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; - unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2]; - unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; - unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; - - unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; -#if CONFIG_COMPOUND_SINGLEREF - unsigned int inter_singleref_comp_mode[INTER_MODE_CONTEXTS] - [INTER_SINGLEREF_COMP_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA - unsigned int interintra[BLOCK_SIZE_GROUPS][2]; - unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; - unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; -#endif // CONFIG_INTERINTRA - unsigned int compound_interinter[BLOCK_SIZES_ALL][COMPOUND_TYPES]; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - unsigned int 
motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - unsigned int ncobmc_mode[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES]; -#endif -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT - unsigned int ncobmc[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES]; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - unsigned int obmc[BLOCK_SIZES_ALL][2]; -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; -#if CONFIG_EXT_COMP_REFS - unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; - unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; -#endif // CONFIG_EXT_COMP_REFS - unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; -#if CONFIG_EXT_REFS - unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; - unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; -#else - unsigned int comp_ref[REF_CONTEXTS][COMP_REFS - 1][2]; -#endif // CONFIG_EXT_REFS -#if CONFIG_COMPOUND_SINGLEREF - unsigned int comp_inter_mode[COMP_INTER_MODE_CONTEXTS][2]; -#endif // CONFIG_COMPOUND_SINGLEREF - // TODO(urvang): Only needed for !CONFIG_VAR_TX case. So can be removed when - // CONFIG_VAR_TX flag is removed. 
- unsigned int tx_size[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1]; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - unsigned int quarter_tx_size[2]; -#endif -#if CONFIG_VAR_TX - unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2]; -#endif - unsigned int skip[SKIP_CONTEXTS][2]; - nmv_context_counts mv[NMV_CONTEXTS]; -#if CONFIG_INTRABC - unsigned int intrabc[2]; - nmv_context_counts dv; -#endif -#if CONFIG_LGT_FROM_PRED - unsigned int intra_lgt[LGT_SIZES][INTRA_MODES][2]; - unsigned int inter_lgt[LGT_SIZES][2]; -#endif // CONFIG_LGT_FROM_PRED - unsigned int delta_q[DELTA_Q_PROBS][2]; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL - unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2]; -#endif // CONFIG_LOOPFILTER_LEVEL - unsigned int delta_lf[DELTA_LF_PROBS][2]; -#endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX - unsigned int tx_size_implied[TX_SIZES][TX_SIZES]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX -#if CONFIG_ENTROPY_STATS -#if CONFIG_EXT_TX - unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; - unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] - [TX_TYPES]; -#else - unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; - unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES]; -#endif // CONFIG_EXT_TX -#endif // CONFIG_ENTROPY_STATS -#if CONFIG_SUPERTX - unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2]; - unsigned int supertx_size[TX_SIZES]; -#endif // CONFIG_SUPERTX - struct seg_counts seg; -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - unsigned int intra_filter[INTRA_FILTERS + 1][INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - unsigned int filter_intra[PLANE_TYPES][2]; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_LPF_SB - unsigned int lpf_reuse[LPF_REUSE_CONTEXT][2]; - unsigned int lpf_delta[LPF_DELTA_CONTEXT][DELTA_RANGE]; - unsigned int lpf_sign[LPF_SIGN_CONTEXT][2]; -#endif // CONFIG_LPF_SB -} FRAME_COUNTS; - 
-#if CONFIG_KF_CTX -extern const aom_cdf_prob default_kf_y_mode_cdf[KF_MODE_CONTEXTS] - [KF_MODE_CONTEXTS] - [CDF_SIZE(INTRA_MODES)]; -#else -extern const aom_cdf_prob default_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES] - [CDF_SIZE(INTRA_MODES)]; -#endif - -extern const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES] - [PALETTE_Y_MODE_CONTEXTS]; -extern const aom_prob - av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS]; - -#if CONFIG_EXT_TX static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, -#if CONFIG_MRC_TX - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, - }, -#endif // CONFIG_MRC_TX - { - 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, - }, - { - 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0, - }, - { - 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, - }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, + { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, + { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, }; static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, -#if CONFIG_MRC_TX - { - 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 9, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, -#endif // CONFIG_MRC_TX - { - 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - { - 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0, - }, - { - 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8, - }, + { 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 }, }; -#else -#if CONFIG_MRC_TX -static const int av1_ext_tx_ind[TX_TYPES] = { - 0, 3, 4, 2, 1, -}; -static const int av1_ext_tx_inv[TX_TYPES] = { - 0, 4, 3, 1, 2, -}; -#else -static const int av1_ext_tx_ind[TX_TYPES] = { - 0, 2, 3, 1, -}; -static const int av1_ext_tx_inv[TX_TYPES] = { - 0, 3, 1, 2, -}; -#endif // CONFIG_MRC_TX -#endif // CONFIG_EXT_TX - -#if CONFIG_INTERINTRA -extern const aom_tree_index - av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)]; -#endif -extern const aom_tree_index - av1_inter_compound_mode_tree[TREE_SIZE(INTER_COMPOUND_MODES)]; -#if CONFIG_COMPOUND_SINGLEREF -extern const aom_tree_index - av1_inter_singleref_comp_mode_tree[TREE_SIZE(INTER_SINGLEREF_COMP_MODES)]; -#endif // CONFIG_COMPOUND_SINGLEREF -extern const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)]; -extern const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)]; -#if CONFIG_EXT_PARTITION_TYPES -extern const aom_tree_index - av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)]; -#endif -extern const aom_tree_index - av1_palette_color_index_tree[PALETTE_SIZES][TREE_SIZE(PALETTE_COLORS)]; -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)]; -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -#if CONFIG_EXT_TX -extern const aom_tree_index av1_ext_tx_tree[EXT_TX_SET_TYPES] - [TREE_SIZE(TX_TYPES)]; -#else -extern const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)]; -#endif // CONFIG_EXT_TX -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -extern const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)]; -#endif // CONFIG_MOTION_VAR || 
CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT -extern const aom_tree_index av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)]; -#if CONFIG_WARPED_MOTION -extern const aom_tree_index av1_ncobmc_tree[TREE_SIZE(OBMC_FAMILY_MODES)]; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#if CONFIG_LOOP_RESTORATION -#define RESTORE_NONE_SGRPROJ_PROB 64 -#define RESTORE_NONE_BILATERAL_PROB 16 -#define RESTORE_NONE_WIENER_PROB 64 -#define RESTORE_NONE_DOMAINTXFMRF_PROB 64 -extern const aom_tree_index - av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)]; -#endif // CONFIG_LOOP_RESTORATION +void av1_set_default_ref_deltas(int8_t *ref_deltas); +void av1_set_default_mode_deltas(int8_t *mode_deltas); +void av1_setup_frame_contexts(struct AV1Common *cm); void av1_setup_past_independence(struct AV1Common *cm); -void av1_adapt_intra_frame_probs(struct AV1Common *cm); -void av1_adapt_inter_frame_probs(struct AV1Common *cm); - static INLINE int av1_ceil_log2(int n) { + if (n < 2) return 0; int i = 1, p = 2; while (p < n) { i++; diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c index 2d0191366..446aa433c 100644 --- a/third_party/aom/av1/common/entropymv.c +++ b/third_party/aom/av1/common/entropymv.c @@ -12,100 +12,51 @@ #include "av1/common/onyxc_int.h" #include "av1/common/entropymv.h" -// Integer pel reference mv threshold for use of high-precision 1/8 mv -#define COMPANDED_MVREF_THRESH 8 - -const aom_tree_index av1_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { - -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ -}; - -/* clang-format off */ -const aom_tree_index av1_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { - -MV_CLASS_0, 2, - -MV_CLASS_1, 4, - 6, 8, - -MV_CLASS_2, -MV_CLASS_3, - 10, 12, - -MV_CLASS_4, -MV_CLASS_5, - -MV_CLASS_6, 14, - 16, 18, - -MV_CLASS_7, -MV_CLASS_8, - -MV_CLASS_9, -MV_CLASS_10, -}; -/* clang-format on */ - -const aom_tree_index 
av1_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; - -const aom_tree_index av1_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, - 4, -2, -3 }; - static const nmv_context default_nmv_context = { - { 32, 64, 96 }, // joints - { AOM_ICDF(4096), AOM_ICDF(11264), AOM_ICDF(19328), AOM_ICDF(32768), - 0 }, // joint_cdf + { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf { { // Vertical component - 128, // sign - { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, // class - { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320), - AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757), - AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, // class_cdf - { 216 }, // class0 - { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits - { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp - { 64, 96, 64 }, // fp - { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768), - 0 } }, // class0_fp_cdf - { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768), - 0 }, // fp_cdf - 160, // class0_hp bit - 128, // hp -#if CONFIG_NEW_MULTISYMBOL - { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(216 * 128), AOM_ICDF(32768), 0 }, - { { AOM_ICDF(128 * 196), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 198), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 208), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 224), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 245), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 } }, // bits_cdf -#endif + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // 
hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf }, { // Horizontal component - 128, // sign - { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, // class - { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320), - AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757), - AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, // class_cdf - { 208 }, // class0 - { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits - { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp - { 64, 96, 64 }, // fp - { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768), - 0 }, - { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768), - 0 } }, // class0_fp_cdf - { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768), - 0 }, // fp_cdf - 160, // class0_hp bit - 128, // hp -#if CONFIG_NEW_MULTISYMBOL - { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }, - { AOM_ICDF(216 * 128), AOM_ICDF(32768), 0 }, - { { AOM_ICDF(128 * 196), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 198), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 208), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 224), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 245), AOM_ICDF(32768), 0 }, - { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 } }, // bits_cdf -#endif + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // 
class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf } }, }; @@ -164,104 +115,8 @@ MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { return c; } -static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, - MvSubpelPrecision precision) { - int s, z, c, o, d, e, f; - assert(v != 0); /* should not be zero */ - s = v < 0; - comp_counts->sign[s] += incr; - z = (s ? -v : v) - 1; /* magnitude - 1 */ - - c = av1_get_mv_class(z, &o); - comp_counts->classes[c] += incr; - - d = (o >> 3); /* int mv data */ - f = (o >> 1) & 3; /* fractional pel mv data */ - e = (o & 1); /* high precision mv data */ - - if (c == MV_CLASS_0) { - comp_counts->class0[d] += incr; -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif - comp_counts->class0_fp[d][f] += incr; - if (precision > MV_SUBPEL_LOW_PRECISION) comp_counts->class0_hp[e] += incr; - } else { - int i; - int b = c + CLASS0_BITS - 1; // number of bits - for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr; -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif - comp_counts->fp[f] += incr; - if (precision > MV_SUBPEL_LOW_PRECISION) comp_counts->hp[e] += incr; - } -} - -void av1_inc_mv(const MV *mv, nmv_context_counts *counts, - MvSubpelPrecision precision) { - if (counts != NULL) { - const MV_JOINT_TYPE j = av1_get_mv_joint(mv); - ++counts->joints[j]; - - if (mv_joint_vertical(j)) - inc_mv_component(mv->row, &counts->comps[0], 1, precision); - - if (mv_joint_horizontal(j)) - inc_mv_component(mv->col, &counts->comps[1], 1, precision); - } -} - -void av1_adapt_mv_probs(AV1_COMMON *cm, int allow_hp) { - int i, j; - int idx; - for (idx = 0; idx < NMV_CONTEXTS; ++idx) { - nmv_context *nmvc = &cm->fc->nmvc[idx]; - 
const nmv_context *pre_nmvc = &cm->pre_fc->nmvc[idx]; - const nmv_context_counts *counts = &cm->counts.mv[idx]; - aom_tree_merge_probs(av1_mv_joint_tree, pre_nmvc->joints, counts->joints, - nmvc->joints); - for (i = 0; i < 2; ++i) { - nmv_component *comp = &nmvc->comps[i]; - const nmv_component *pre_comp = &pre_nmvc->comps[i]; - const nmv_component_counts *c = &counts->comps[i]; - - comp->sign = av1_mode_mv_merge_probs(pre_comp->sign, c->sign); - aom_tree_merge_probs(av1_mv_class_tree, pre_comp->classes, c->classes, - comp->classes); - aom_tree_merge_probs(av1_mv_class0_tree, pre_comp->class0, c->class0, - comp->class0); - - for (j = 0; j < MV_OFFSET_BITS; ++j) - comp->bits[j] = av1_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]); -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level == 0) { -#endif - for (j = 0; j < CLASS0_SIZE; ++j) - aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->class0_fp[j], - c->class0_fp[j], comp->class0_fp[j]); - - aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); - - if (allow_hp) { - comp->class0_hp = - av1_mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp); - comp->hp = av1_mode_mv_merge_probs(pre_comp->hp, c->hp); - } -#if CONFIG_AMVR - } -#endif - } - } -} - void av1_init_mv_probs(AV1_COMMON *cm) { - int i; - for (i = 0; i < NMV_CONTEXTS; ++i) { - // NB: this sets CDFs too - cm->fc->nmvc[i] = default_nmv_context; - } -#if CONFIG_INTRABC + // NB: this sets CDFs too + cm->fc->nmvc = default_nmv_context; cm->fc->ndvc = default_nmv_context; -#endif // CONFIG_INTRABC } diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h index 9ce089f7d..02ca7b66b 100644 --- a/third_party/aom/av1/common/entropymv.h +++ b/third_party/aom/av1/common/entropymv.h @@ -12,7 +12,7 @@ #ifndef AV1_COMMON_ENTROPYMV_H_ #define AV1_COMMON_ENTROPYMV_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom_dsp/prob.h" @@ -26,8 +26,6 @@ struct AV1Common; void av1_init_mv_probs(struct 
AV1Common *cm); -void av1_adapt_mv_probs(struct AV1Common *cm, int usehp); - #define MV_UPDATE_PROB 252 /* Symbols for coding which components are zero jointly */ @@ -66,9 +64,7 @@ typedef enum { #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) -#if CONFIG_NEW_MULTISYMBOL #define MV_BITS_CONTEXTS 6 -#endif #define MV_FP_SIZE 4 #define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) @@ -76,37 +72,22 @@ typedef enum { #define MV_VALS ((MV_MAX << 1) + 1) #define MV_IN_USE_BITS 14 -#define MV_UPP ((1 << MV_IN_USE_BITS) - 1) +#define MV_UPP (1 << MV_IN_USE_BITS) #define MV_LOW (-(1 << MV_IN_USE_BITS)) -extern const aom_tree_index av1_mv_joint_tree[]; -extern const aom_tree_index av1_mv_class_tree[]; -extern const aom_tree_index av1_mv_class0_tree[]; -extern const aom_tree_index av1_mv_fp_tree[]; - typedef struct { - aom_prob sign; - aom_prob classes[MV_CLASSES - 1]; - aom_cdf_prob class_cdf[CDF_SIZE(MV_CLASSES)]; - aom_prob class0[CLASS0_SIZE - 1]; - aom_prob bits[MV_OFFSET_BITS]; - aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; - aom_prob fp[MV_FP_SIZE - 1]; + aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; - aom_prob class0_hp; - aom_prob hp; -#if CONFIG_NEW_MULTISYMBOL + aom_cdf_prob sign_cdf[CDF_SIZE(2)]; aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; aom_cdf_prob hp_cdf[CDF_SIZE(2)]; aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; - aom_cdf_prob bits_cdf[MV_BITS_CONTEXTS][CDF_SIZE(2)]; -#endif + aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; } nmv_component; typedef struct { - aom_prob joints[MV_JOINTS - 1]; - aom_cdf_prob joint_cdf[CDF_SIZE(MV_JOINTS)]; + aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; nmv_component comps[2]; } nmv_context; @@ -120,33 +101,12 @@ static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { MV_CLASS_TYPE av1_get_mv_class(int 
z, int *offset); -typedef struct { - unsigned int sign[2]; - unsigned int classes[MV_CLASSES]; - unsigned int class0[CLASS0_SIZE]; - unsigned int bits[MV_OFFSET_BITS][2]; - unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; - unsigned int fp[MV_FP_SIZE]; - unsigned int class0_hp[2]; - unsigned int hp[2]; -} nmv_component_counts; - -typedef struct { - unsigned int joints[MV_JOINTS]; - nmv_component_counts comps[2]; -} nmv_context_counts; - typedef enum { -#if CONFIG_INTRABC || CONFIG_AMVR MV_SUBPEL_NONE = -1, -#endif MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, } MvSubpelPrecision; -void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx, - MvSubpelPrecision precision); - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h index e8c4003cc..a37ee9f24 100644 --- a/third_party/aom/av1/common/enums.h +++ b/third_party/aom/av1/common/enums.h @@ -12,7 +12,8 @@ #ifndef AV1_COMMON_ENUMS_H_ #define AV1_COMMON_ENUMS_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_codec.h" #include "aom/aom_integer.h" @@ -22,22 +23,8 @@ extern "C" { #undef MAX_SB_SIZE -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#define TWO_MODE -#endif - -#if CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT -#define NC_MODE_INFO 1 -#else -#define NC_MODE_INFO 0 -#endif - // Max superblock size -#if CONFIG_EXT_PARTITION #define MAX_SB_SIZE_LOG2 7 -#else -#define MAX_SB_SIZE_LOG2 6 -#endif // CONFIG_EXT_PARTITION #define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) @@ -45,11 +32,7 @@ extern "C" { #define MIN_SB_SIZE_LOG2 6 // Pixels per Mode Info (MI) unit -#if CONFIG_CB4X4 #define MI_SIZE_LOG2 2 -#else -#define MI_SIZE_LOG2 3 -#endif #define MI_SIZE (1 << MI_SIZE_LOG2) // MI-units per max superblock (MI Block - MIB) @@ -63,73 +46,78 @@ extern "C" { #define MAX_MIB_MASK (MAX_MIB_SIZE - 1) // Maximum number of tile rows and tile columns -#if CONFIG_EXT_TILE -#define 
MAX_TILE_ROWS 1024 -#define MAX_TILE_COLS 1024 -#else -#if CONFIG_MAX_TILE #define MAX_TILE_ROWS 64 #define MAX_TILE_COLS 64 -#else -#define MAX_TILE_ROWS 4 -#define MAX_TILE_COLS 64 -#endif -#endif // CONFIG_EXT_TILE -#if CONFIG_VAR_TX #define MAX_VARTX_DEPTH 2 -#define SQR_VARTX_DEPTH_INIT 0 -#define RECT_VARTX_DEPTH_INIT 0 -#endif #define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) +#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) + +#define MAX_PALETTE_SQUARE (64 * 64) +// Maximum number of colors in a palette. +#define PALETTE_MAX_SIZE 8 +// Minimum number of colors in a palette. +#define PALETTE_MIN_SIZE 2 + +#define FRAME_OFFSET_BITS 5 +#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) + +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) + +// 4 scratch frames for the new frames to support a maximum of 4 cores decoding +// in parallel, 3 for scaled references on the encoder. +// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number +// of framebuffers. +// TODO(jkoleszar): These 3 extra references could probably come from the +// normal reference pool. +#define FRAME_BUFFERS (REF_FRAMES + 7) -#if CONFIG_LOOPFILTER_LEVEL // 4 frame filter levels: y plane vertical, y plane horizontal, // u plane, and v plane #define FRAME_LF_COUNT 4 #define DEFAULT_DELTA_LF_MULTI 0 -#endif // CONFIG_LOOPFILTER_LEVEL - -#if CONFIG_LPF_SB -#define LPF_DELTA_BITS 3 -#define LPF_STEP 2 -#define DELTA_RANGE (1 << LPF_DELTA_BITS) -#define MAX_LPF_OFFSET (LPF_STEP * ((1 << LPF_DELTA_BITS) - 1)) - -#define LPF_REUSE_CONTEXT 2 -#define LPF_DELTA_CONTEXT DELTA_RANGE -#define LPF_SIGN_CONTEXT 2 - -// Half of maximum loop filter length (15-tap) -#define FILT_BOUNDARY_OFFSET 8 -#define FILT_BOUNDARY_MI_OFFSET (FILT_BOUNDARY_OFFSET >> MI_SIZE_LOG2) -#endif // CONFIG_LPF_SB - -// Bitstream profiles indicated by 2-3 bits in the uncompressed header. -// 00: Profile 0. 8-bit 4:2:0 only. -// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0. -// 01: Profile 2. 
10-bit and 12-bit color only, with 4:2:0 sampling. -// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0 -// sampling. -// 111: Undefined profile. +#define MAX_MODE_LF_DELTAS 2 + +#define DIST_PRECISION_BITS 4 +#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 + +// TODO(chengchen): Temporal flag serve as experimental flag for WIP +// bitmask construction. +// Shall be removed when bitmask code is completely checkedin +#define LOOP_FILTER_BITMASK 0 + +#define PROFILE_BITS 3 +// The following three profiles are currently defined. +// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. +// Profile 1. 8-bit and 10-bit 4:4:4 +// Profile 2. 8-bit and 10-bit 4:2:2 +// 12-bit 4:0:0, 4:2:2 and 4:4:4 +// Since we have three bits for the profiles, it can be extended later. typedef enum BITSTREAM_PROFILE { PROFILE_0, PROFILE_1, PROFILE_2, - PROFILE_3, - MAX_PROFILES + MAX_PROFILES, } BITSTREAM_PROFILE; +#define LEVEL_MAJOR_BITS 3 +#define LEVEL_MINOR_BITS 2 +#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS) + +#define LEVEL_MAJOR_MIN 2 +#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN) +#define LEVEL_MINOR_MIN 0 +#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1) + +#define OP_POINTS_CNT_MINUS_1_BITS 5 +#define OP_POINTS_IDC_BITS 12 + // Note: Some enums use the attribute 'packed' to use smallest possible integer // type, so that we can save memory when they are used in structs/arrays. 
typedef enum ATTRIBUTE_PACKED { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, - BLOCK_2X4, - BLOCK_4X2, -#endif BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, @@ -143,33 +131,29 @@ typedef enum ATTRIBUTE_PACKED { BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, -#if CONFIG_EXT_PARTITION BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, -#endif // CONFIG_EXT_PARTITION BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, -#if CONFIG_EXT_PARTITION - BLOCK_32X128, - BLOCK_128X32, -#endif // CONFIG_EXT_PARTITION BLOCK_SIZES_ALL, BLOCK_SIZES = BLOCK_4X16, BLOCK_INVALID = 255, BLOCK_LARGEST = (BLOCK_SIZES - 1) } BLOCK_SIZE; -typedef enum { +// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 +#define SQR_BLOCK_SIZES 6 + +typedef enum ATTRIBUTE_PACKED { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, -#if CONFIG_EXT_PARTITION_TYPES PARTITION_HORZ_A, // HORZ split and the top partition is split again PARTITION_HORZ_B, // HORZ split and the bottom partition is split again PARTITION_VERT_A, // VERT split and the left partition is split again @@ -177,134 +161,104 @@ typedef enum { PARTITION_HORZ_4, // 4:1 horizontal partition PARTITION_VERT_4, // 4:1 vertical partition EXT_PARTITION_TYPES, -#endif // CONFIG_EXT_PARTITION_TYPES PARTITION_TYPES = PARTITION_SPLIT + 1, PARTITION_INVALID = 255 } PARTITION_TYPE; typedef char PARTITION_CONTEXT; #define PARTITION_PLOFFSET 4 // number of probability models per block size -#define PARTITION_BLOCK_SIZES (4 + CONFIG_EXT_PARTITION) -#define PARTITION_CONTEXTS_PRIMARY (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) -#if CONFIG_UNPOISON_PARTITION_CTX -#define INVALID_PARTITION_CTX (-1) -#define PARTITION_CONTEXTS \ - (PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES) -#else -#define PARTITION_CONTEXTS PARTITION_CONTEXTS_PRIMARY -#endif +#define PARTITION_BLOCK_SIZES 5 +#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) // block transform size +#if defined(_MSC_VER) +typedef uint8_t TX_SIZE; +enum 
ATTRIBUTE_PACKED { +#else typedef enum ATTRIBUTE_PACKED { -#if CONFIG_CHROMA_2X2 - TX_2X2, // 2x2 transform #endif - TX_4X4, // 4x4 transform - TX_8X8, // 8x8 transform - TX_16X16, // 16x16 transform - TX_32X32, // 32x32 transform -#if CONFIG_TX64X64 - TX_64X64, // 64x64 transform -#endif // CONFIG_TX64X64 - TX_4X8, // 4x8 transform - TX_8X4, // 8x4 transform - TX_8X16, // 8x16 transform - TX_16X8, // 16x8 transform - TX_16X32, // 16x32 transform - TX_32X16, // 32x16 transform -#if CONFIG_TX64X64 + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform TX_32X64, // 32x64 transform TX_64X32, // 64x32 transform -#endif // CONFIG_TX64X64 TX_4X16, // 4x16 transform TX_16X4, // 16x4 transform TX_8X32, // 8x32 transform TX_32X8, // 32x8 transform + TX_16X64, // 16x64 transform + TX_64X16, // 64x16 transform TX_SIZES_ALL, // Includes rectangular transforms TX_SIZES = TX_4X8, // Does NOT include rectangular transforms - TX_INVALID = 255 // Invalid transform size + TX_SIZES_LARGEST = TX_64X64, + TX_INVALID = 255 // Invalid transform size +#if defined(_MSC_VER) +}; +#else } TX_SIZE; +#endif #define TX_SIZE_LUMA_MIN (TX_4X4) /* We don't need to code a transform size unless the allowed size is at least one more than the minimum. 
*/ #define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) -#define MAX_TX_DEPTH (TX_SIZES - TX_SIZE_CTX_MIN) +// Maximum tx_size categories +#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) +#define MAX_TX_DEPTH 2 -#if CONFIG_CTX1D -#define MAX_HVTX_SIZE (1 << 5) -#endif // CONFIG_CTX1D - -#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64) +#define MAX_TX_SIZE_LOG2 (6) #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) #define MIN_TX_SIZE_LOG2 2 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) #define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) +// Pad 4 extra columns to remove horizontal availability check. +#define TX_PAD_HOR_LOG2 2 +#define TX_PAD_HOR 4 +// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability +// check. +#define TX_PAD_TOP 2 +#define TX_PAD_BOTTOM 4 +#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) +// Pad 16 extra bytes to avoid reading overflow in SIMD optimization. +#define TX_PAD_END 16 +#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END) + // Number of maxium size transform blocks in the maximum size superblock #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2) #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) -#if CONFIG_NCOBMC_ADAPT_WEIGHT -typedef enum ATTRIBUTE_PACKED { - NCOBMC_MODE_0, - NCOBMC_MODE_1, - NCOBMC_MODE_2, - NCOBMC_MODE_3, - NCOBMC_MODE_4, - NCOBMC_MODE_5, - NCOBMC_MODE_6, - NCOBMC_MODE_7, - ALL_NCOBMC_MODES, -#ifdef TWO_MODE - MAX_NCOBMC_MODES = NCOBMC_MODE_1 + 1, -#else - MAX_NCOBMC_MODES = ALL_NCOBMC_MODES, -#endif - NO_OVERLAP = MAX_NCOBMC_MODES + 1 -} NCOBMC_MODE; - -typedef enum { - ADAPT_OVERLAP_BLOCK_8X8, - ADAPT_OVERLAP_BLOCK_16X16, - ADAPT_OVERLAP_BLOCK_32X32, - ADAPT_OVERLAP_BLOCK_64X64, - ADAPT_OVERLAP_BLOCKS, - ADAPT_OVERLAP_BLOCK_INVALID = 255 -} ADAPT_OVERLAP_BLOCK; -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - // frame transform mode -typedef enum { - ONLY_4X4, // only 4x4 transform used - ALLOW_8X8, // allow block transform size up to 8x8 - 
ALLOW_16X16, // allow block transform size up to 16x16 - ALLOW_32X32, // allow block transform size up to 32x32 -#if CONFIG_TX64X64 - ALLOW_64X64, // allow block transform size up to 64x64 -#endif - TX_MODE_SELECT, // transform specified for each block +typedef enum ATTRIBUTE_PACKED { + ONLY_4X4, // use only 4x4 transform + TX_MODE_LARGEST, // transform size is the largest possible for pu size + TX_MODE_SELECT, // transform specified for each block TX_MODES, } TX_MODE; // 1D tx types -typedef enum { +typedef enum ATTRIBUTE_PACKED { DCT_1D, ADST_1D, FLIPADST_1D, IDTX_1D, - // TODO(sarahparker) need to eventually put something here for the - // mrc experiment to make this work with the ext-tx pruning functions TX_TYPES_1D, } TX_TYPE_1D; -typedef enum { +typedef enum ATTRIBUTE_PACKED { DCT_DCT, // DCT in both horizontal and vertical ADST_DCT, // ADST in vertical, DCT in horizontal DCT_ADST, // DCT in vertical, ADST in horizontal ADST_ADST, // ADST in both directions -#if CONFIG_EXT_TX FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, @@ -317,25 +271,26 @@ typedef enum { H_ADST, V_FLIPADST, H_FLIPADST, -#endif // CONFIG_EXT_TX -#if CONFIG_MRC_TX - MRC_DCT, // DCT in both directions with mrc based bitmask -#endif // CONFIG_MRC_TX TX_TYPES, } TX_TYPE; -#if CONFIG_EXT_TX typedef enum { + REG_REG, + REG_SMOOTH, + REG_SHARP, + SMOOTH_REG, + SMOOTH_SMOOTH, + SMOOTH_SHARP, + SHARP_REG, + SHARP_SMOOTH, + SHARP_SHARP, +} DUAL_FILTER_TYPE; + +typedef enum ATTRIBUTE_PACKED { // DCT only EXT_TX_SET_DCTONLY, // DCT + Identity only EXT_TX_SET_DCT_IDTX, -#if CONFIG_MRC_TX - // DCT + MRC_DCT - EXT_TX_SET_MRC_DCT, - // DCT + MRC_DCT + IDTX - EXT_TX_SET_MRC_DCT_IDTX, -#endif // CONFIG_MRC_TX // Discrete Trig transforms w/o flip (4) + Identity (1) EXT_TX_SET_DTT4_IDTX, // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) @@ -348,45 +303,13 @@ typedef enum { } TxSetType; #define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX) -#else -#define IS_2D_TRANSFORM(tx_type) 1 
-#endif -typedef enum { - TILE_LEFT_BOUNDARY = 1, - TILE_RIGHT_BOUNDARY = 2, - TILE_ABOVE_BOUNDARY = 4, - TILE_BOTTOM_BOUNDARY = 8, - FRAME_LEFT_BOUNDARY = 16, - FRAME_RIGHT_BOUNDARY = 32, - FRAME_ABOVE_BOUNDARY = 64, - FRAME_BOTTOM_BOUNDARY = 128, -} BOUNDARY_TYPE; - -#if CONFIG_EXT_TX -#if CONFIG_CHROMA_2X2 -#define EXT_TX_SIZES 5 // number of sizes that use extended transforms -#else -#define EXT_TX_SIZES 4 // number of sizes that use extended transforms -#endif // CONFIG_CHROMA_2X2 -#if CONFIG_MRC_TX -#define EXT_TX_SETS_INTER 5 // Sets of transform selections for INTER -#define EXT_TX_SETS_INTRA 4 // Sets of transform selections for INTRA -#else // CONFIG_MRC_TX +#define EXT_TX_SIZES 4 // number of sizes that use extended transforms #define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER #define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA -#endif // CONFIG_MRC_TX -#else -#if CONFIG_CHROMA_2X2 -#define EXT_TX_SIZES 4 // number of sizes that use extended transforms -#else -#define EXT_TX_SIZES 3 // number of sizes that use extended transforms -#endif -#endif // CONFIG_EXT_TX -typedef enum { +typedef enum ATTRIBUTE_PACKED { AOM_LAST_FLAG = 1 << 0, -#if CONFIG_EXT_REFS AOM_LAST2_FLAG = 1 << 1, AOM_LAST3_FLAG = 1 << 2, AOM_GOLD_FLAG = 1 << 3, @@ -394,43 +317,45 @@ typedef enum { AOM_ALT2_FLAG = 1 << 5, AOM_ALT_FLAG = 1 << 6, AOM_REFFRAME_ALL = (1 << 7) - 1 -#else // !CONFIG_EXT_REFS - AOM_GOLD_FLAG = 1 << 1, - AOM_ALT_FLAG = 1 << 2, - AOM_REFFRAME_ALL = (1 << 3) - 1 -#endif // CONFIG_EXT_REFS } AOM_REFFRAME; -#if CONFIG_EXT_COMP_REFS -#define USE_UNI_COMP_REFS 1 - -typedef enum { +typedef enum ATTRIBUTE_PACKED { UNIDIR_COMP_REFERENCE, BIDIR_COMP_REFERENCE, COMP_REFERENCE_TYPES, } COMP_REFERENCE_TYPE; -#else // !CONFIG_EXT_COMP_REFS -#define USE_UNI_COMP_REFS 0 -#endif // CONFIG_EXT_COMP_REFS -typedef enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } PLANE_TYPE; +typedef enum ATTRIBUTE_PACKED { + PLANE_TYPE_Y, + PLANE_TYPE_UV, + 
PLANE_TYPES +} PLANE_TYPE; -#if CONFIG_CFL #define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) -typedef enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } CFL_PRED_TYPE; +typedef enum ATTRIBUTE_PACKED { + CFL_PRED_U, + CFL_PRED_V, + CFL_PRED_PLANES +} CFL_PRED_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { CFL_SIGN_ZERO, CFL_SIGN_NEG, CFL_SIGN_POS, CFL_SIGNS } CFL_SIGN_TYPE; +typedef enum ATTRIBUTE_PACKED { + CFL_DISALLOWED, + CFL_ALLOWED, + CFL_ALLOWED_TYPES +} CFL_ALLOWED_TYPE; + // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) // CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8 @@ -445,17 +370,13 @@ typedef enum { // Also, the contexts are symmetric under swapping the planes. #define CFL_CONTEXT_V(js) \ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) -#endif -typedef enum { +typedef enum ATTRIBUTE_PACKED { PALETTE_MAP, -#if CONFIG_MRC_TX - MRC_MAP, -#endif // CONFIG_MRC_TX COLOR_MAP_TYPES, } COLOR_MAP_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { TWO_COLORS, THREE_COLORS, FOUR_COLORS, @@ -466,7 +387,7 @@ typedef enum { PALETTE_SIZES } PALETTE_SIZE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, @@ -478,36 +399,26 @@ typedef enum { PALETTE_COLORS } PALETTE_COLOR; -// Note: All directional predictors must be between V_PRED and D63_PRED (both +// Note: All directional predictors must be between V_PRED and D67_PRED (both // inclusive). 
typedef enum ATTRIBUTE_PACKED { - DC_PRED, // Average of above and left pixels - V_PRED, // Vertical - H_PRED, // Horizontal - D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) - D135_PRED, // Directional 135 deg = 180 - 45 - D117_PRED, // Directional 117 deg = 180 - 63 - D153_PRED, // Directional 153 deg = 180 - 27 - D207_PRED, // Directional 207 deg = 180 + 27 - D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) - SMOOTH_PRED, // Combination of horizontal and vertical interpolation -#if CONFIG_SMOOTH_HV + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 degree + D135_PRED, // Directional 135 degree + D113_PRED, // Directional 113 degree + D157_PRED, // Directional 157 degree + D203_PRED, // Directional 203 degree + D67_PRED, // Directional 67 degree + SMOOTH_PRED, // Combination of horizontal and vertical interpolation SMOOTH_V_PRED, // Vertical interpolation SMOOTH_H_PRED, // Horizontal interpolation -#endif // CONFIG_SMOOTH_HV - TM_PRED, // True-motion + PAETH_PRED, // Predict from the direction of smallest gradient NEARESTMV, NEARMV, - ZEROMV, + GLOBALMV, NEWMV, -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound modes - SR_NEAREST_NEARMV, - // SR_NEAREST_NEWMV, - SR_NEAR_NEWMV, - SR_ZERO_NEWMV, - SR_NEW_NEWMV, -#endif // CONFIG_COMPOUND_SINGLEREF // Compound ref compound modes NEAREST_NEARESTMV, NEAR_NEARMV, @@ -515,175 +426,131 @@ typedef enum ATTRIBUTE_PACKED { NEW_NEARESTMV, NEAR_NEWMV, NEW_NEARMV, - ZERO_ZEROMV, + GLOBAL_GLOBALMV, NEW_NEWMV, MB_MODE_COUNT, - INTRA_MODES = TM_PRED + 1, // TM_PRED has to be the last intra mode. 
+ INTRA_MODE_START = DC_PRED, + INTRA_MODE_END = NEARESTMV, + INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, + SINGLE_INTER_MODE_START = NEARESTMV, + SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, + SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, + COMP_INTER_MODE_START = NEAREST_NEARESTMV, + COMP_INTER_MODE_END = MB_MODE_COUNT, + COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks } PREDICTION_MODE; -#if CONFIG_CFL // TODO(ltrudeau) Do we really want to pack this? // TODO(ltrudeau) Do we match with PREDICTION_MODE? typedef enum ATTRIBUTE_PACKED { - UV_DC_PRED, // Average of above and left pixels - UV_V_PRED, // Vertical - UV_H_PRED, // Horizontal - UV_D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) - UV_D135_PRED, // Directional 135 deg = 180 - 45 - UV_D117_PRED, // Directional 117 deg = 180 - 63 - UV_D153_PRED, // Directional 153 deg = 180 - 27 - UV_D207_PRED, // Directional 207 deg = 180 + 27 - UV_D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) - UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation -#if CONFIG_SMOOTH_HV + UV_DC_PRED, // Average of above and left pixels + UV_V_PRED, // Vertical + UV_H_PRED, // Horizontal + UV_D45_PRED, // Directional 45 degree + UV_D135_PRED, // Directional 135 degree + UV_D113_PRED, // Directional 113 degree + UV_D157_PRED, // Directional 157 degree + UV_D203_PRED, // Directional 203 degree + UV_D67_PRED, // Directional 67 degree + UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation UV_SMOOTH_V_PRED, // Vertical interpolation UV_SMOOTH_H_PRED, // Horizontal interpolation -#endif // CONFIG_SMOOTH_HV - UV_TM_PRED, // True-motion + UV_PAETH_PRED, // Predict from the direction of smallest gradient UV_CFL_PRED, // Chroma-from-Luma UV_INTRA_MODES, UV_MODE_INVALID, // For uv_mode in inter 
blocks } UV_PREDICTION_MODE; -#else -#define UV_INTRA_MODES (INTRA_MODES) -#define UV_PREDICTION_MODE PREDICTION_MODE -#define UV_DC_PRED (DC_PRED) -#define UV_MODE_INVALID (INTRA_INVALID) -#endif // CONFIG_CFL -typedef enum { +typedef enum ATTRIBUTE_PACKED { SIMPLE_TRANSLATION, -#if CONFIG_MOTION_VAR - OBMC_CAUSAL, // 2-sided OBMC -#if CONFIG_NCOBMC_ADAPT_WEIGHT - NCOBMC_ADAPT_WEIGHT, -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR -#if CONFIG_WARPED_MOTION + OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED -#endif // CONFIG_WARPED_MOTION MOTION_MODES -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - , - OBMC_FAMILY_MODES = NCOBMC_ADAPT_WEIGHT + 1 -#endif } MOTION_MODE; -#if CONFIG_INTERINTRA -typedef enum { +typedef enum ATTRIBUTE_PACKED { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES } INTERINTRA_MODE; -#endif typedef enum { COMPOUND_AVERAGE, -#if CONFIG_WEDGE COMPOUND_WEDGE, -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - COMPOUND_SEG, -#endif // CONFIG_COMPOUND_SEGMENT + COMPOUND_DIFFWTD, COMPOUND_TYPES, } COMPOUND_TYPE; -// TODO(huisu): Consider adding FILTER_SMOOTH_PRED to "FILTER_INTRA_MODE". 
-#if CONFIG_FILTER_INTRA -typedef enum { +typedef enum ATTRIBUTE_PACKED { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, - FILTER_D45_PRED, - FILTER_D135_PRED, - FILTER_D117_PRED, - FILTER_D153_PRED, - FILTER_D207_PRED, - FILTER_D63_PRED, - FILTER_TM_PRED, + FILTER_D157_PRED, + FILTER_PAETH_PRED, FILTER_INTRA_MODES, } FILTER_INTRA_MODE; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA #define DIRECTIONAL_MODES 8 -#endif // CONFIG_EXT_INTRA +#define MAX_ANGLE_DELTA 3 +#define ANGLE_STEP 3 #define INTER_MODES (1 + NEWMV - NEARESTMV) -#if CONFIG_COMPOUND_SINGLEREF -#define INTER_SINGLEREF_COMP_MODES (1 + SR_NEW_NEWMV - SR_NEAREST_NEARMV) -#endif // CONFIG_COMPOUND_SINGLEREF - #define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) #define SKIP_CONTEXTS 3 +#define SKIP_MODE_CONTEXTS 3 + +#define COMP_INDEX_CONTEXTS 6 +#define COMP_GROUP_IDX_CONTEXTS 6 #define NMV_CONTEXTS 3 -#define NEWMV_MODE_CONTEXTS 7 -#define ZEROMV_MODE_CONTEXTS 2 -#define REFMV_MODE_CONTEXTS 9 -#define DRL_MODE_CONTEXTS 5 +#define NEWMV_MODE_CONTEXTS 6 +#define GLOBALMV_MODE_CONTEXTS 2 +#define REFMV_MODE_CONTEXTS 6 +#define DRL_MODE_CONTEXTS 3 -#define ZEROMV_OFFSET 3 +#define GLOBALMV_OFFSET 3 #define REFMV_OFFSET 4 -#define NEWMV_CTX_MASK ((1 << ZEROMV_OFFSET) - 1) -#define ZEROMV_CTX_MASK ((1 << (REFMV_OFFSET - ZEROMV_OFFSET)) - 1) +#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) +#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) #define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) -#define ALL_ZERO_FLAG_OFFSET 8 -#define SKIP_NEARESTMV_OFFSET 9 -#define SKIP_NEARMV_OFFSET 10 -#define SKIP_NEARESTMV_SUB8X8_OFFSET 11 +#define COMP_NEWMV_CTXS 5 +#define INTER_MODE_CONTEXTS 8 -#define INTER_MODE_CONTEXTS 7 #define DELTA_Q_SMALL 3 #define DELTA_Q_PROBS (DELTA_Q_SMALL) #define DEFAULT_DELTA_Q_RES 4 -#if CONFIG_EXT_DELTA_Q #define DELTA_LF_SMALL 3 #define DELTA_LF_PROBS (DELTA_LF_SMALL) #define DEFAULT_DELTA_LF_RES 2 -#endif /* Segment Feature Masks */ 
#define MAX_MV_REF_CANDIDATES 2 -#define MAX_REF_MV_STACK_SIZE 16 -#if CONFIG_EXT_PARTITION +#define MAX_REF_MV_STACK_SIZE 8 #define REF_CAT_LEVEL 640 -#else -#define REF_CAT_LEVEL 255 -#endif // CONFIG_EXT_PARTITION #define INTRA_INTER_CONTEXTS 4 #define COMP_INTER_CONTEXTS 5 -#define REF_CONTEXTS 5 +#define REF_CONTEXTS 3 -#if CONFIG_EXT_COMP_REFS #define COMP_REF_TYPE_CONTEXTS 5 #define UNI_COMP_REF_CONTEXTS 3 -#endif // CONFIG_EXT_COMP_REFS - -#if CONFIG_COMPOUND_SINGLEREF -#define COMP_INTER_MODE_CONTEXTS 4 -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_VAR_TX -#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 2) +#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) typedef uint8_t TXFM_CONTEXT; -#endif #define NONE_FRAME -1 #define INTRA_FRAME 0 #define LAST_FRAME 1 - -#if CONFIG_EXT_REFS #define LAST2_FRAME 2 #define LAST3_FRAME 3 #define GOLDEN_FRAME 4 @@ -691,94 +558,55 @@ typedef uint8_t TXFM_CONTEXT; #define ALTREF2_FRAME 6 #define ALTREF_FRAME 7 #define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1) -#else // !CONFIG_EXT_REFS -#define GOLDEN_FRAME 2 -#define ALTREF_FRAME 3 -#endif // CONFIG_EXT_REFS #define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1) -#define TOTAL_REFS_PER_FRAME (ALTREF_FRAME - INTRA_FRAME + 1) #define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1) #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) -#if CONFIG_EXT_REFS #define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1) #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) -#else -#define BWD_REFS 1 -#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME) -#endif // CONFIG_EXT_REFS #define SINGLE_REFS (FWD_REFS + BWD_REFS) -#if CONFIG_EXT_COMP_REFS -typedef enum { - LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } - LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } - LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } - BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME } - UNIDIR_COMP_REFS + +typedef enum ATTRIBUTE_PACKED { + LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } + 
LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } + LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } + BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME } + LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME } + LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME } + LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME } + BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME } + ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME } + TOTAL_UNIDIR_COMP_REFS, + // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs + // that are explicitly signaled. + UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, } UNIDIR_COMP_REF; -#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS) -#else // !CONFIG_EXT_COMP_REFS -#define COMP_REFS (FWD_REFS * BWD_REFS) -#endif // CONFIG_EXT_COMP_REFS -#define MODE_CTX_REF_FRAMES (TOTAL_REFS_PER_FRAME + COMP_REFS) +#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) -#if CONFIG_SUPERTX -#define PARTITION_SUPERTX_CONTEXTS 2 -#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32 -#endif // CONFIG_SUPERTX +#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS) -#if CONFIG_LOOP_RESTORATION -typedef enum { +// NOTE: A limited number of unidirectional reference pairs can be signalled for +// compound prediction. The use of skip mode, on the other hand, makes it +// possible to have a reference pair not listed for explicit signaling. 
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS) + +typedef enum ATTRIBUTE_PACKED { RESTORE_NONE, RESTORE_WIENER, RESTORE_SGRPROJ, RESTORE_SWITCHABLE, RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, - RESTORE_TYPES, + RESTORE_TYPES = 4, } RestorationType; -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_FRAME_SUPERRES #define SUPERRES_SCALE_BITS 3 -#define SUPERRES_SCALE_DENOMINATOR_MIN 8 -#endif // CONFIG_FRAME_SUPERRES - -#if CONFIG_LPF_DIRECT -typedef enum { - VERT_HORZ, - DEGREE_30, - DEGREE_45, - DEGREE_60, - DEGREE_120, - DEGREE_135, - DEGREE_150, - FILTER_DEGREES, -} FILTER_DEGREE; -#endif // CONFIG_LPF_DIRECT - -#if CONFIG_OBU -// R19 -typedef enum { - OBU_SEQUENCE_HEADER = 1, - OBU_TD = 2, - OBU_FRAME_HEADER = 3, - OBU_TILE_GROUP = 4, - OBU_METADATA = 5, - OBU_PADDING = 15, -} OBU_TYPE; -#endif +#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) -#if CONFIG_LGT_FROM_PRED -#define LGT_SIZES 2 -// Note: at least one of LGT_FROM_PRED_INTRA and LGT_FROM_PRED_INTER must be 1 -#define LGT_FROM_PRED_INTRA 1 -#define LGT_FROM_PRED_INTER 1 -// LGT_SL_INTRA: LGTs with a mode-dependent first self-loop and a break point -#define LGT_SL_INTRA 0 -#endif // CONFIG_LGT_FROM_PRED +// In large_scale_tile coding, external references are used. 
+#define MAX_EXTERNAL_REFERENCES 128 #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c index 135132316..a7e67ea4a 100644 --- a/third_party/aom/av1/common/filter.c +++ b/third_party/aom/av1/common/filter.c @@ -25,153 +25,6 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } }; -#if USE_TEMPORALFILTER_12TAP -DECLARE_ALIGNED(16, static const int16_t, - sub_pel_filters_temporalfilter_12[SUBPEL_SHIFTS][12]) = { - // intfilt 0.8 - { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, - { 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0 }, - { 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0 }, - { -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1 }, - { -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1 }, - { -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1 }, - { -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1 }, - { -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1 }, - { -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1 }, - { -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1 }, - { -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1 }, - { -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1 }, - { -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1 }, - { -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1 }, - { 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0 }, - { 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0 }, -}; -#endif // USE_TEMPORALFILTER_12TAP - -#if USE_EXTRA_FILTER -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, - { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, - { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, - { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, - { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, - { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, - { 0, 2, -10, 38, 110, -14, 2, 0 }, { 
0, 2, -8, 28, 116, -12, 2, 0 }, - { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_regular_uv[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, - { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, - { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, - { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, - { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, - { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, - { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, - { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } -}; - -#if USE_12TAP_FILTER -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { - // intfilt 0.8 - { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 2, -6, 127, 9, -4, 2, -1 }, - { -2, 5, -12, 124, 18, -7, 4, -2 }, { -2, 7, -16, 119, 28, -11, 5, -2 }, - { -3, 8, -19, 114, 38, -14, 7, -3 }, { -3, 9, -22, 107, 49, -17, 8, -3 }, - { -4, 10, -23, 99, 60, -20, 10, -4 }, { -4, 11, -23, 90, 70, -22, 10, -4 }, - { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -22, 70, 90, -23, 11, -4 }, - { -4, 10, -20, 60, 99, -23, 10, -4 }, { -3, 8, -17, 49, 107, -22, 9, -3 }, - { -3, 7, -14, 38, 114, -19, 8, -3 }, { -2, 5, -11, 28, 119, -16, 7, -2 }, - { -2, 4, -7, 18, 124, -12, 5, -2 }, { -1, 2, -4, 9, 127, -6, 2, -1 }, -}; - -DECLARE_ALIGNED(256, static const int16_t, - sub_pel_filters_10sharp[SUBPEL_SHIFTS][12]) = { - // intfilt 0.85 - { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, - { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 }, - { 0, 1, -3, 6, -13, 124, 18, -8, 4, -2, 1, 0 }, - { 0, 2, -4, 8, -18, 120, 28, -12, 6, -4, 2, 0 }, - { 0, 2, -5, 10, -21, 114, 38, -15, 8, -5, 2, 0 }, - { 0, 3, -6, 11, -24, 107, 49, -19, 10, -6, 3, 0 }, - { 0, 3, -7, 12, -25, 99, 59, -21, 11, -6, 3, 0 }, - { 0, 3, -7, 12, -25, 90, 70, -23, 
12, -7, 3, 0 }, - { 0, 3, -7, 12, -25, 81, 81, -25, 12, -7, 3, 0 }, - { 0, 3, -7, 12, -23, 70, 90, -25, 12, -7, 3, 0 }, - { 0, 3, -6, 11, -21, 59, 99, -25, 12, -7, 3, 0 }, - { 0, 3, -6, 10, -19, 49, 107, -24, 11, -6, 3, 0 }, - { 0, 2, -5, 8, -15, 38, 114, -21, 10, -5, 2, 0 }, - { 0, 2, -4, 6, -12, 28, 120, -18, 8, -4, 2, 0 }, - { 0, 1, -2, 4, -8, 18, 124, -13, 6, -3, 1, 0 }, - { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }, -}; -#else -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, - { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, - { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, - { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, - { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, - { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, - { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, - { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } -}; -#endif - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = { - // freqmultiplier = 0.2 - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 9, 30, 44, 32, 11, 2, 0 }, - { 0, 8, 28, 44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 }, - { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 }, - { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 }, - { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 }, - { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 }, - { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 }, - { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 }, -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_smooth2_uv[SUBPEL_SHIFTS]) = { - // freqmultiplier = 0.2 - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 9, 30, 44, 32, 11, 2, 0 }, - { 0, 8, 28, 
44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 }, - { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 }, - { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 }, - { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 }, - { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 }, - { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 }, - { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 }, -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, - { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, - { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, - { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, - { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, - { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, - { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, - { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_smooth_uv[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, - { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, - { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, - { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, - { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, - { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, - { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, - { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } -}; -#else // USE_EXTRA_FILTER - DECLARE_ALIGNED(256, static const InterpKernel, sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, @@ -207,49 +60,7 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 
}, { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } }; -#endif // USE_EXTRA_FILTER -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS] = { - bilinear_filters, // INTRA_FILTER_LINEAR - sub_pel_filters_8, // INTRA_FILTER_8TAP - sub_pel_filters_8sharp, // INTRA_FILTER_8TAP_SHARP - sub_pel_filters_8smooth, // INTRA_FILTER_8TAP_SMOOTH -}; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA - -#if USE_EXTRA_FILTER -static const InterpFilterParams - av1_interp_filter_params_list[SWITCHABLE_FILTERS + EXTRA_FILTERS] = { - { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_REGULAR }, - { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SMOOTH }, -#if USE_12TAP_FILTER - { (const int16_t *)sub_pel_filters_10sharp, 12, SUBPEL_SHIFTS, - MULTITAP_SHARP }, -#else - { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SHARP }, -#endif - { (const int16_t *)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SMOOTH2 }, - { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, - BILINEAR }, - { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SHARP }, - { (const int16_t *)sub_pel_filters_regular_uv, SUBPEL_TAPS, SUBPEL_SHIFTS, - FILTER_REGULAR_UV }, - { (const int16_t *)sub_pel_filters_smooth_uv, SUBPEL_TAPS, SUBPEL_SHIFTS, - FILTER_SMOOTH_UV }, - { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, - FILTER_SHARP_UV }, - { (const int16_t *)sub_pel_filters_smooth2_uv, SUBPEL_TAPS, SUBPEL_SHIFTS, - FILTER_SMOOTH2_UV }, - }; -#else static const InterpFilterParams av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = { { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, @@ -261,62 +72,49 @@ static const InterpFilterParams { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, BILINEAR } }; -#endif // USE_EXTRA_FILTER -#if 
USE_TEMPORALFILTER_12TAP -static const InterpFilterParams av1_interp_temporalfilter_12tap = { - (const int16_t *)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS, - TEMPORALFILTER_12TAP +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } }; -#endif // USE_TEMPORALFILTER_12TAP +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +static const InterpFilterParams av1_interp_4tap[2] = { + { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_SMOOTH }, +}; + +InterpFilterParams av1_get_interp_filter_params_with_block_size( + const InterpFilter interp_filter, const int w) { + if (w <= 4 && + (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR)) + return av1_interp_4tap[0]; + else if (w <= 4 && 
interp_filter == EIGHTTAP_SMOOTH) + return av1_interp_4tap[1]; -InterpFilterParams av1_get_interp_filter_params( - const InterpFilter interp_filter) { -#if USE_TEMPORALFILTER_12TAP - if (interp_filter == TEMPORALFILTER_12TAP) - return av1_interp_temporalfilter_12tap; -#endif // USE_TEMPORALFILTER_12TAP return av1_interp_filter_params_list[interp_filter]; } const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) { -#if USE_TEMPORALFILTER_12TAP - if (interp_filter == TEMPORALFILTER_12TAP) - return av1_interp_temporalfilter_12tap.filter_ptr; -#endif // USE_TEMPORALFILTER_12TAP return (const int16_t *)av1_interp_filter_params_list[interp_filter] .filter_ptr; } - -#if CONFIG_DUAL_FILTER -InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter, - int plane) { -#if USE_TEMPORALFILTER_12TAP -#if USE_EXTRA_FILTER - assert(interp_filter <= EIGHTTAP_SHARP || - interp_filter == TEMPORALFILTER_12TAP); -#else // USE_EXTRA_FILTER - assert(interp_filter <= SWITCHABLE_FILTERS || - interp_filter == TEMPORALFILTER_12TAP); -#endif // USE_EXTRA_FILTER -#else - assert(interp_filter <= EIGHTTAP_SHARP); -#endif -#if USE_EXTRA_FILTER - if (plane == 0) { - return interp_filter; - } else { - switch (interp_filter) { - case EIGHTTAP_REGULAR: return FILTER_REGULAR_UV; - case EIGHTTAP_SMOOTH: return FILTER_SMOOTH_UV; - case MULTITAP_SHARP: return FILTER_SHARP_UV; - case EIGHTTAP_SMOOTH2: return FILTER_SMOOTH2_UV; - default: return interp_filter; - } - } -#else // USE_EXTRA_FILTER - (void)plane; - return interp_filter; -#endif // USE_EXTRA_FILTER -} -#endif diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h index 343e87560..0c24ad9d0 100644 --- a/third_party/aom/av1/common/filter.h +++ b/third_party/aom/av1/common/filter.h @@ -14,7 +14,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" @@ -23,34 +24,17 @@ extern "C" 
{ #endif -#define USE_TEMPORALFILTER_12TAP 1 -#define MAX_FILTER_TAP 12 - -#define USE_12TAP_FILTER 0 -#define USE_EXTRA_FILTER 0 +#define MAX_FILTER_TAP 8 -typedef enum { +typedef enum ATTRIBUTE_PACKED { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH, -#if USE_EXTRA_FILTER - EIGHTTAP_SMOOTH2, -#endif // USE_EXTRA_FILTER MULTITAP_SHARP, BILINEAR, -#if USE_EXTRA_FILTER - EIGHTTAP_SHARP, - FILTER_REGULAR_UV, - FILTER_SMOOTH_UV, - FILTER_SHARP_UV, - FILTER_SMOOTH2_UV, -#endif // USE_EXTRA_FILTER INTERP_FILTERS_ALL, SWITCHABLE_FILTERS = BILINEAR, SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, -#if USE_TEMPORALFILTER_12TAP - TEMPORALFILTER_12TAP = SWITCHABLE_FILTERS + EXTRA_FILTERS, -#endif } InterpFilter; // With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since @@ -59,73 +43,34 @@ typedef enum { // setting a (pair of) filters. // // Without CONFIG_DUAL_FILTER, -#if CONFIG_DUAL_FILTER typedef uint32_t InterpFilters; static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters, int x_filter) { - return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xffff); + return (InterpFilter)((filters >> (x_filter ? 
16 : 0)) & 0xf); } static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter, InterpFilter x_filter) { - uint16_t y16 = y_filter & 0xffff; - uint16_t x16 = x_filter & 0xffff; + uint16_t y16 = y_filter & 0xf; + uint16_t x16 = x_filter & 0xf; return y16 | ((uint32_t)x16 << 16); } static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) { return av1_make_interp_filters(filter, filter); } -#else -typedef InterpFilter InterpFilters; -static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters, - int x_filter) { -#ifdef NDEBUG - (void)x_filter; -#endif - assert(!x_filter); - return filters; -} - -static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) { - return filter; -} -#endif static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter; } -#if USE_EXTRA_FILTER -#define LOG_SWITCHABLE_FILTERS \ - 3 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ -#else #define LOG_SWITCHABLE_FILTERS \ 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ -#endif -#if CONFIG_DUAL_FILTER #define MAX_SUBPEL_TAPS 12 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) -#else // CONFIG_DUAL_FILTER -#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) -#endif // CONFIG_DUAL_FILTER - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -typedef enum { - INTRA_FILTER_LINEAR, - INTRA_FILTER_8TAP, - INTRA_FILTER_8TAP_SHARP, - INTRA_FILTER_8TAP_SMOOTH, - INTRA_FILTERS, -} INTRA_FILTER; - -extern const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA typedef struct InterpFilterParams { const int16_t *filter_ptr; @@ -134,26 +79,16 @@ typedef struct InterpFilterParams { InterpFilter interp_filter; } InterpFilterParams; -InterpFilterParams 
av1_get_interp_filter_params( - const InterpFilter interp_filter); - const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter); +InterpFilterParams av1_get_interp_filter_params_with_block_size( + const InterpFilter interp_filter, const int w); + static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( const InterpFilterParams filter_params, const int subpel) { return filter_params.filter_ptr + filter_params.taps * subpel; } -static INLINE int av1_is_interpolating_filter( - const InterpFilter interp_filter) { - const InterpFilterParams ip = av1_get_interp_filter_params(interp_filter); - return (ip.filter_ptr[ip.taps / 2 - 1] == 128); -} - -#if CONFIG_DUAL_FILTER -InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter, int plane); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c index 0b6b78e3d..502ccd27d 100644 --- a/third_party/aom/av1/common/frame_buffers.c +++ b/third_party/aom/av1/common/frame_buffers.c @@ -75,5 +75,6 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; (void)cb_priv; if (int_fb) int_fb->in_use = 0; + fb->priv = NULL; return 0; } diff --git a/third_party/aom/av1/common/generic_code.c b/third_party/aom/av1/common/generic_code.c deleted file mode 100644 index 7285decc9..000000000 --- a/third_party/aom/av1/common/generic_code.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "generic_code.h" - -void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size) { - int i; - for (i = 0; i < nsyms; i++) - cdf[i] = AOM_ICDF((i + 1)*CDF_PROB_TOP/nsyms); - - cdf[cdf_size - 1] = 0; -} - -/** Adapts a Q15 cdf after encoding/decoding a symbol. */ -void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate) { - int i; - *count = OD_MINI(*count + 1, 1 << rate); - OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768); - if (*count >= 1 << rate) { - /* Steady-state adaptation based on a simple IIR with dyadic rate. */ - for (i = 0; i < n; i++) { - int tmp; - /* When (i < val), we want the adjustment ((cdf[i] - tmp) >> rate) to be - positive so long as (cdf[i] > i + 1), and 0 when (cdf[i] == i + 1), - to ensure we don't drive any probabilities to 0. Replacing cdf[i] with - (i + 2) and solving ((i + 2 - tmp) >> rate == 1) for tmp produces - tmp == i + 2 - (1 << rate). Using this value of tmp with - cdf[i] == i + 1 instead gives an adjustment of 0 as desired. - - When (i >= val), we want ((cdf[i] - tmp) >> rate) to be negative so - long as cdf[i] < 32768 - (n - 1 - i), and 0 when - cdf[i] == 32768 - (n - 1 - i), again to ensure we don't drive any - probabilities to 0. Since right-shifting any negative value is still - negative, we can solve (32768 - (n - 1 - i) - tmp == 0) for tmp, - producing tmp = 32769 - n + i. Using this value of tmp with smaller - values of cdf[i] instead gives negative adjustments, as desired. - - Combining the two cases gives the expression below. These could be - stored in a lookup table indexed by n and rate to avoid the - arithmetic. 
*/ - tmp = 2 - (1<= val); - cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i]) - ((AOM_ICDF(cdf[i]) - tmp) >> rate)); - } - } - else { - int alpha; - /* Initial adaptation for the first symbols. The adaptation rate is - computed to be equivalent to what od_{en,de}code_cdf_adapt() does - when the initial cdf is set to increment/4. */ - alpha = 4*32768/(n + 4**count); - for (i = 0; i < n; i++) { - int tmp; - tmp = (32768 - n)*(i >= val) + i + 1; - cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i]) - - (((AOM_ICDF(cdf[i]) - tmp)*alpha) >> 15)); - } - } - OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768); -} - -/** Takes the base-2 log of E(x) in Q1. - * - * @param [in] ExQ16 expectation of x in Q16 - * - * @retval 2*log2(ExQ16/2^16) - */ -int log_ex(int ex_q16) { - int lg; - int lg_q1; - int odd; - lg = OD_ILOG(ex_q16); - if (lg < 15) { - odd = ex_q16*ex_q16 > 2 << 2*lg; - } - else { - int tmp; - tmp = ex_q16 >> (lg - 8); - odd = tmp*tmp > (1 << 15); - } - lg_q1 = OD_MAXI(0, 2*lg - 33 + odd); - return lg_q1; -} - -/** Updates the probability model based on the encoded/decoded value - * - * @param [in,out] model generic prob model - * @param [in,out] ExQ16 expectation of x - * @param [in] x variable encoded/decoded (used for ExQ16) - * @param [in] xs variable x after shift (used for the model) - * @param [in] id id of the icdf to adapt - * @param [in] integration integration period of ExQ16 (leaky average over - * 1<> (shift)) - -void generic_model_init(generic_encoder *model); - -/* Initialize a CDF for use by aom_write_symbol_pvq()/aom_read_symbol_pvq(). - This is used for CDFs whose size might not match the declared array size. - The only real requirement is that the first value of every CDF be zero. - Then aom_cdf_init_q15_1D() will be called with the real size the first time - the CDF is used. */ -#define OD_CDFS_INIT_DYNAMIC(cdf) (memset(cdf, 0, sizeof(cdf))) - -// WARNING: DO NOT USE this init function, -// if the size of cdf is different from what is declared by code. 
-#define OD_CDFS_INIT_Q15(cdfs) \ - { int n_cdfs = sizeof(cdfs)/sizeof(cdfs[0]); \ - int cdf_size = sizeof(cdfs[0])/sizeof(cdfs[0][0]); \ - int nsyms = cdf_size - 1; \ - int i_; \ - for (i_ = 0; i_ < n_cdfs; i_++) \ - aom_cdf_init_q15_1D(cdfs[i_], nsyms, cdf_size); \ - } - -void aom_cdf_init(uint16_t *cdf, int ncdfs, int nsyms, int val, int first); - -void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size); - -void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate); - -void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n, - int *count, int rate); - -void generic_encode(aom_writer *w, generic_encoder *model, int x, - int *ex_q16, int integration); -double generic_encode_cost(generic_encoder *model, int x, int *ex_q16); - -double od_encode_cdf_cost(int val, uint16_t *cdf, int n); - -int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n, - int *count, int rate ACCT_STR_PARAM); - -int generic_decode_(aom_reader *r, generic_encoder *model, - int *ex_q16, int integration ACCT_STR_PARAM); - -int log_ex(int ex_q16); - -void generic_model_update(int *ex_q16, int x, int integration); - -#endif diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c index 53c2ba1f0..bc758eb57 100644 --- a/third_party/aom/av1/common/idct.c +++ b/third_party/aom/av1/common/idct.c @@ -11,2623 +11,33 @@ #include -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" -#include "av1/common/av1_inv_txfm1d_cfg.h" -#include "av1/common/blockd.h" -#include "av1/common/enums.h" -#include "av1/common/idct.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 -#include "av1/common/daala_tx.h" -#endif - -int av1_get_tx_scale(const TX_SIZE tx_size) { - const int pels = tx_size_2d[tx_size]; - return (pels > 256) + (pels > 1024) + (pels > 4096); -} - -// NOTE: The implementation of all inverses need to 
be aware of the fact -// that input and output could be the same buffer. - -#if CONFIG_EXT_TX -static void iidtx4_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 4; ++i) { - output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2); - } -} - -static void iidtx8_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 8; ++i) { - output[i] = input[i] * 2; - } -} - -static void iidtx16_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 16; ++i) { - output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2); - } -} - -static void iidtx32_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 32; ++i) { - output[i] = input[i] * 4; - } -} - -#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 -static void iidtx64_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 64; ++i) { - output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2); - } -} -#endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX - -// For use in lieu of ADST -static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[16]; - // Multiply input by sqrt(2) - for (i = 0; i < 16; ++i) { - inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2); - } - for (i = 0; i < 16; ++i) { - output[i] = input[16 + i] * 4; - } - aom_idct16_c(inputhalf, output + 16); - // Note overall scaling factor is 4 times orthogonal -} - -#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 -static void idct64_col_c(const tran_low_t *input, tran_low_t *output) { - int32_t in[64], out[64]; - int i; - for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - av1_idct64_new(in, out, inv_cos_bit_col_dct_64, inv_stage_range_col_dct_64); - for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; -} - -static void idct64_row_c(const tran_low_t *input, tran_low_t *output) { - int32_t in[64], out[64]; - int i; - for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - 
av1_idct64_new(in, out, inv_cos_bit_row_dct_64, inv_stage_range_row_dct_64); - for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; -} - -// For use in lieu of ADST -static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[32]; - // Multiply input by sqrt(2) - for (i = 0; i < 32; ++i) { - inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2); - } - for (i = 0; i < 32; ++i) { - output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2); - } - aom_idct32_c(inputhalf, output + 32); - // Note overall scaling factor is 4 * sqrt(2) times orthogonal -} -#endif // CONFIG_TX64X64 - -// Inverse identity transform and add. -#if CONFIG_EXT_TX -static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int bsx, int bsy, TX_TYPE tx_type) { - int r, c; - const int pels = bsx * bsy; - const int shift = 3 - ((pels > 256) + (pels > 1024)); - if (tx_type == IDTX) { - for (r = 0; r < bsy; ++r) { - for (c = 0; c < bsx; ++c) - dest[c] = clip_pixel_add(dest[c], input[c] >> shift); - dest += stride; - input += bsx; - } - } -} -#endif // CONFIG_EXT_TX - -#define FLIPUD_PTR(dest, stride, size) \ - do { \ - (dest) = (dest) + ((size)-1) * (stride); \ - (stride) = -(stride); \ - } while (0) - -#if CONFIG_EXT_TX -static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src, - int *sstride, TX_TYPE tx_type, int sizey, - int sizex) { - // Note that the transpose of src will be added to dst. In order to LR - // flip the addends (in dst coordinates), we UD flip the src. To UD flip - // the addends, we UD flip the dst. 
- switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case IDTX: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: break; - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: - // flip UD - FLIPUD_PTR(*dst, *dstride, sizey); - break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - // flip LR - FLIPUD_PTR(*src, *sstride, sizex); - break; - case FLIPADST_FLIPADST: - // flip UD - FLIPUD_PTR(*dst, *dstride, sizey); - // flip LR - FLIPUD_PTR(*src, *sstride, sizex); - break; - default: assert(0); break; - } -} -#endif // CONFIG_EXT_TX - -#if CONFIG_HIGHBITDEPTH -#if CONFIG_EXT_TX && CONFIG_TX64X64 -static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bsx, int bsy, TX_TYPE tx_type, - int bd) { - int r, c; - const int pels = bsx * bsy; - const int shift = 3 - ((pels > 256) + (pels > 1024)); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - if (tx_type == IDTX) { - for (r = 0; r < bsy; ++r) { - for (c = 0; c < bsx; ++c) - dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd); - dest += stride; - input += bsx; - } - } -} -#endif // CONFIG_EXT_TX && CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_LGT || CONFIG_LGT_FROM_PRED -void ilgt4(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) { - if (!lgtmtx) assert(0); -#if CONFIG_LGT_FROM_PRED - // For DCT/ADST, use butterfly implementations - if (lgtmtx[0] == DCT4) { - aom_idct4_c(input, output); - return; - } else if (lgtmtx[0] == ADST4) { - aom_iadst4_c(input, output); - return; - } -#endif // CONFIG_LGT_FROM_PRED - - // evaluate s[j] = sum of all lgtmtx[j]*input[i] over i=1,...,4 - tran_high_t s[4] = { 0 }; - for (int i = 0; i < 4; ++i) - for (int j = 0; j < 4; ++j) s[j] += lgtmtx[i * 4 + j] * input[i]; - - for (int i = 0; i < 4; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i])); -} - -void ilgt8(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) 
{ - if (!lgtmtx) assert(0); -#if CONFIG_LGT_FROM_PRED - // For DCT/ADST, use butterfly implementations - if (lgtmtx[0] == DCT8) { - aom_idct8_c(input, output); - return; - } else if (lgtmtx[0] == ADST8) { - aom_iadst8_c(input, output); - return; - } -#endif // CONFIG_LGT_FROM_PRED - - // evaluate s[j] = sum of all lgtmtx[j]*input[i] over i=1,...,8 - tran_high_t s[8] = { 0 }; - for (int i = 0; i < 8; ++i) - for (int j = 0; j < 8; ++j) s[j] += lgtmtx[i * 8 + j] * input[i]; - - for (int i = 0; i < 8; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i])); -} -#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED - -#if CONFIG_LGT -// get_lgt4 and get_lgt8 return 1 and pick a lgt matrix if LGT is chosen to -// apply. Otherwise they return 0 -int get_lgt4(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx) { - if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D || - vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) { - lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0]; - return 1; - } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D || - htx_tab[txfm_param->tx_type] == FLIPADST_1D)) { - lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0]; - return 1; - } - lgtmtx[0] = NULL; - return 0; -} - -int get_lgt8(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx) { - if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D || - vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) { - lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0]; - return 1; - } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D || - htx_tab[txfm_param->tx_type] == FLIPADST_1D)) { - lgtmtx[0] = txfm_param->is_inter ? 
&lgt8_170[0][0] : &lgt8_150[0][0]; - return 1; - } - lgtmtx[0] = NULL; - return 0; -} -#endif // CONFIG_LGT - -#if CONFIG_LGT_FROM_PRED -void ilgt16up(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) { - if (lgtmtx[0] == DCT16) { - aom_idct16_c(input, output); - return; - } else if (lgtmtx[0] == ADST16) { - aom_iadst16_c(input, output); - return; - } else if (lgtmtx[0] == DCT32) { - aom_idct32_c(input, output); - return; - } else if (lgtmtx[0] == ADST32) { - ihalfright32_c(input, output); - return; - } else { - assert(0); - } -} - -void get_discontinuity_1d(uint8_t *arr, int n, int *idx_max_diff) { - *idx_max_diff = -1; - - int temp = 0, max_diff = 0, min_diff = INT_MAX; - for (int i = 1; i < n; ++i) { - temp = abs(arr[i] - arr[i - 1]); - if (temp > max_diff) { - max_diff = temp; - *idx_max_diff = i; - } - if (temp < min_diff) min_diff = temp; - } -} - -void get_discontinuity_2d(uint8_t *dst, int stride, int n, int is_col, - int *idx_max_diff, int ntx) { - *idx_max_diff = -1; - - int diff = 0, temp = 0, max_diff = 0, min_diff = INT_MAX; - for (int i = 1; i < n; ++i) { - temp = 0; - for (int j = 0; j < ntx; ++j) { - if (is_col) // vertical diff - diff = dst[i * stride + j] - dst[(i - 1) * stride + j]; - else // horizontal diff - diff = dst[j * stride + i] - dst[j * stride + i - 1]; - temp += diff * diff; - } - // temp/w is the i-th avg square diff - if (temp > max_diff) { - max_diff = temp; - *idx_max_diff = i; - } - if (temp < min_diff) min_diff = temp; - } -} - -int idx_selfloop_wrt_mode(PREDICTION_MODE mode, int is_col) { - // 0: no self-loop - // 1: small self-loop - // 2: medium self-loop - // 3: large self-loop - switch (mode) { - case DC_PRED: - case SMOOTH_PRED: - // predition is good for both directions: large SLs for row and col - return 3; - case TM_PRED: return 0; -#if CONFIG_SMOOTH_HV - case SMOOTH_H_PRED: -#endif - case H_PRED: - // prediction is good for H direction: large SL for row only - return is_col ? 
0 : 3; -#if CONFIG_SMOOTH_HV - case SMOOTH_V_PRED: -#endif - case V_PRED: - // prediction is good for V direction: large SL for col only - return is_col ? 3 : 0; -#if LGT_SL_INTRA - // directional mode: choose SL based on the direction - case D45_PRED: return is_col ? 2 : 0; - case D63_PRED: return is_col ? 3 : 0; - case D117_PRED: return is_col ? 3 : 1; - case D135_PRED: return 2; - case D153_PRED: return is_col ? 1 : 3; - case D207_PRED: return is_col ? 0 : 3; -#else - case D45_PRED: - case D63_PRED: - case D117_PRED: return is_col ? 3 : 0; - case D135_PRED: - case D153_PRED: - case D207_PRED: return is_col ? 0 : 3; -#endif - // inter: no SL - default: return 0; - } -} - -void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx) { - PREDICTION_MODE mode = txfm_param->mode; - int stride = txfm_param->stride; - uint8_t *dst = txfm_param->dst; - int bp = -1; - uint8_t arr[4]; - - // Each lgt4mtx_arr[k][i] corresponds to a line graph with a self-loop on - // the first node, and possibly a weak edge within the line graph. i is - // the index of the weak edge (between the i-th and (i+1)-th pixels, i=0 - // means no weak edge). 
k corresponds to the first self-loop's weight - const tran_high_t *lgt4mtx_arr[4][4] = { - { &lgt4_000[0][0], &lgt4_000w1[0][0], &lgt4_000w2[0][0], - &lgt4_000w3[0][0] }, - { &lgt4_060[0][0], &lgt4_060_000w1[0][0], &lgt4_060_000w2[0][0], - &lgt4_060_000w3[0][0] }, - { &lgt4_100[0][0], &lgt4_100_000w1[0][0], &lgt4_100_000w2[0][0], - &lgt4_100_000w3[0][0] }, - { &lgt4_150[0][0], &lgt4_150_000w1[0][0], &lgt4_150_000w2[0][0], - &lgt4_150_000w3[0][0] }, - }; - - // initialize to DCT or some LGTs, and then change later if necessary - int idx_sl = idx_selfloop_wrt_mode(mode, is_col); - lgtmtx[0] = lgt4mtx_arr[idx_sl][0]; - - // find the break point and replace the line graph by the one with a - // break point - if (mode == DC_PRED || mode == SMOOTH_PRED) { - // Do not use break point, since 1) is_left_available and is_top_available - // in DC_PRED are not known by txfm_param for now, so accessing - // both boundaries anyway may cause a mismatch 2) DC prediciton - // typically yields very smooth residues so having the break point - // does not usually improve the RD result. 
- return; - } else if (mode == TM_PRED) { - // TM_PRED: use both 1D top boundary and 1D left boundary - if (is_col) - for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride]; - else - for (int i = 0; i < 4; ++i) arr[i] = dst[i]; - get_discontinuity_1d(&arr[0], 4, &bp); - } else if (mode == V_PRED) { - // V_PRED: use 1D top boundary only - if (is_col) return; - for (int i = 0; i < 4; ++i) arr[i] = dst[i]; - get_discontinuity_1d(&arr[0], 4, &bp); - } else if (mode == H_PRED) { - // H_PRED: use 1D left boundary only - if (!is_col) return; - for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride]; - get_discontinuity_1d(&arr[0], 4, &bp); -#if CONFIG_SMOOTH_HV - } else if (mode == SMOOTH_V_PRED) { - if (is_col) return; - for (int i = 0; i < 4; ++i) arr[i] = dst[-stride + i]; - get_discontinuity_1d(&arr[0], 4, &bp); - } else if (mode == SMOOTH_H_PRED) { - if (!is_col) return; - for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride - 1]; - get_discontinuity_1d(&arr[0], 4, &bp); -#endif - } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) { - // directional modes closer to vertical (maybe include D135 later) - if (!is_col) get_discontinuity_2d(dst, stride, 4, 0, &bp, ntx); - } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) { - // directional modes closer to horizontal - if (is_col) get_discontinuity_2d(dst, stride, 4, 1, &bp, ntx); - } else if (mode > TM_PRED) { - // inter - get_discontinuity_2d(dst, stride, 4, is_col, &bp, ntx); - } - -#if LGT_SL_INTRA - if (bp != -1) lgtmtx[0] = lgt4mtx_arr[idx_sl][bp]; -#else - if (bp != -1) lgtmtx[0] = lgt4mtx_arr[0][bp]; -#endif -} - -void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx) { - PREDICTION_MODE mode = txfm_param->mode; - int stride = txfm_param->stride; - uint8_t *dst = txfm_param->dst; - int bp = -1; - uint8_t arr[8]; - - const tran_high_t *lgt8mtx_arr[4][8] = { - { &lgt8_000[0][0], &lgt8_000w1[0][0], &lgt8_000w2[0][0], &lgt8_000w3[0][0], 
- &lgt8_000w4[0][0], &lgt8_000w5[0][0], &lgt8_000w6[0][0], - &lgt8_000w7[0][0] }, - { &lgt8_060[0][0], &lgt8_060_000w1[0][0], &lgt8_060_000w2[0][0], - &lgt8_060_000w3[0][0], &lgt8_060_000w4[0][0], &lgt8_060_000w5[0][0], - &lgt8_060_000w6[0][0], &lgt8_060_000w7[0][0] }, - { &lgt8_100[0][0], &lgt8_100_000w1[0][0], &lgt8_100_000w2[0][0], - &lgt8_100_000w3[0][0], &lgt8_100_000w4[0][0], &lgt8_100_000w5[0][0], - &lgt8_100_000w6[0][0], &lgt8_100_000w7[0][0] }, - { &lgt8_150[0][0], &lgt8_150_000w1[0][0], &lgt8_150_000w2[0][0], - &lgt8_150_000w3[0][0], &lgt8_150_000w4[0][0], &lgt8_150_000w5[0][0], - &lgt8_150_000w6[0][0], &lgt8_150_000w7[0][0] }, - }; - - int idx_sl = idx_selfloop_wrt_mode(mode, is_col); - lgtmtx[0] = lgt8mtx_arr[idx_sl][0]; - - if (mode == DC_PRED || mode == SMOOTH_PRED) { - return; - } else if (mode == TM_PRED) { - if (is_col) - for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride]; - else - for (int i = 0; i < 8; ++i) arr[i] = dst[i]; - get_discontinuity_1d(&arr[0], 8, &bp); - } else if (mode == V_PRED) { - if (is_col) return; - for (int i = 0; i < 8; ++i) arr[i] = dst[i]; - get_discontinuity_1d(&arr[0], 8, &bp); - } else if (mode == H_PRED) { - if (!is_col) return; - for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride]; - get_discontinuity_1d(&arr[0], 8, &bp); -#if CONFIG_SMOOTH_HV - } else if (mode == SMOOTH_V_PRED) { - if (is_col) return; - for (int i = 0; i < 8; ++i) arr[i] = dst[-stride + i]; - get_discontinuity_1d(&arr[0], 8, &bp); - } else if (mode == SMOOTH_H_PRED) { - if (!is_col) return; - for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride - 1]; - get_discontinuity_1d(&arr[0], 8, &bp); -#endif - } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) { - if (!is_col) get_discontinuity_2d(dst, stride, 8, 0, &bp, ntx); - } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) { - if (is_col) get_discontinuity_2d(dst, stride, 8, 1, &bp, ntx); - } else if (mode > TM_PRED) { - get_discontinuity_2d(dst, stride, 8, 
is_col, &bp, ntx); - } - -#if LGT_SL_INTRA - if (bp != -1) lgtmtx[0] = lgt8mtx_arr[idx_sl][bp]; -#else - if (bp != -1) lgtmtx[0] = lgt8mtx_arr[0][bp]; -#endif -} - -// Since LGTs with length >8 are not implemented now, the following function -// will just call DCT or ADST -void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx) { - int tx_length = is_col ? tx_size_high[txfm_param->tx_size] - : tx_size_wide[txfm_param->tx_size]; - assert(tx_length == 16 || tx_length == 32); - PREDICTION_MODE mode = txfm_param->mode; - - (void)ntx; - const tran_high_t *dctmtx = - tx_length == 16 ? &lgt16_000[0][0] : &lgt32_000[0][0]; - const tran_high_t *adstmtx = - tx_length == 16 ? &lgt16_200[0][0] : &lgt32_200[0][0]; - - switch (mode) { - case DC_PRED: - case TM_PRED: - case SMOOTH_PRED: - // prediction from both top and left -> ADST - lgtmtx[0] = adstmtx; - break; - case V_PRED: - case D45_PRED: - case D63_PRED: - case D117_PRED: -#if CONFIG_SMOOTH_HV - case SMOOTH_V_PRED: -#endif - // prediction from the top more than from the left -> ADST - lgtmtx[0] = is_col ? adstmtx : dctmtx; - break; - case H_PRED: - case D135_PRED: - case D153_PRED: - case D207_PRED: -#if CONFIG_SMOOTH_HV - case SMOOTH_H_PRED: -#endif - // prediction from the left more than from the top -> DCT - lgtmtx[0] = is_col ? 
dctmtx : adstmtx; - break; - default: lgtmtx[0] = dctmtx; break; - } -} - -typedef void (*IlgtFunc)(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx); - -static IlgtFunc ilgt_func[4] = { ilgt4, ilgt8, ilgt16up, ilgt16up }; - -typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx); - -static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred, - get_lgt16up_from_pred, - get_lgt16up_from_pred }; - -// this inline function corresponds to the up scaling before the transpose -// operation in the av1_iht* functions -static INLINE tran_low_t inv_upscale_wrt_txsize(const tran_high_t val, - const TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: - case TX_8X8: - case TX_4X16: - case TX_16X4: - case TX_8X32: - case TX_32X8: return (tran_low_t)val; - case TX_4X8: - case TX_8X4: - case TX_8X16: - case TX_16X8: return (tran_low_t)dct_const_round_shift(val * Sqrt2); - default: assert(0); break; - } - return 0; -} - -// This inline function corresponds to the bit shift before summing with the -// destination in the av1_iht* functions -static INLINE tran_low_t inv_downscale_wrt_txsize(const tran_low_t val, - const TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: return ROUND_POWER_OF_TWO(val, 4); - case TX_4X8: - case TX_8X4: - case TX_8X8: - case TX_4X16: - case TX_16X4: return ROUND_POWER_OF_TWO(val, 5); - case TX_8X16: - case TX_16X8: - case TX_8X32: - case TX_32X8: return ROUND_POWER_OF_TWO(val, 6); - default: assert(0); break; - } - return 0; -} - -void ilgt2d_from_pred_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_SIZE tx_size = txfm_param->tx_size; - const int w = tx_size_wide[tx_size]; - const int h = tx_size_high[tx_size]; - const int wlog2 = tx_size_wide_log2[tx_size]; - const int hlog2 = tx_size_high_log2[tx_size]; - assert(w <= 8 || h <= 8); - - int i, j; - // largest 1D size allowed for LGT: 32 - // largest 2D 
size allowed for LGT: 8x32=256 - tran_low_t tmp[256], out[256], temp1d[32]; - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w); - get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h); - -// for inverse transform, to be consistent with av1_iht functions, we always -// apply row transforms first and column transforms second, but both -// row-first and column-first versions are implemented here for future -// tests (use different lgtmtx_col[i], and choose row or column tx first -// depending on transforms). -#if 1 - // inverse column transforms - for (i = 0; i < w; ++i) { - // transpose - for (j = 0; j < h; ++j) tmp[i * h + j] = input[j * w + i]; - ilgt_func[hlog2 - 2](&tmp[i * h], temp1d, lgtmtx_col[0]); - // upscale, and store in place - for (j = 0; j < h; ++j) - tmp[i * h + j] = inv_upscale_wrt_txsize(temp1d[j], tx_size); - } - // inverse row transforms - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) temp1d[j] = tmp[j * h + i]; - ilgt_func[wlog2 - 2](temp1d, &out[i * w], lgtmtx_row[0]); - } - // downscale + sum with the destination - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int d = i * stride + j; - int s = i * w + j; - dest[d] = - clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size)); - } - } -#else - // inverse row transforms - for (i = 0; i < h; ++i) { - ilgt_func[wlog2 - 2](input, temp1d, lgtmtx_row[0]); - // upscale and transpose (tmp[j*h+i] <--> tmp[j][i]) - for (j = 0; j < w; ++j) - tmp[j * h + i] = inv_upscale_wrt_txsize(temp1d[j], tx_size); - input += w; - } - // inverse column transforms - for (i = 0; i < w; ++i) - ilgt_func[hlog2 - 2](&tmp[i * h], &out[i * h], lgtmtx_col[0]); - // here, out[] is the transpose of 2D block of transform coefficients - - // downscale + transform + sum with dest - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int d = i * stride + j; - int s = j * h + i; - dest[d] = - clip_pixel_add(dest[d], 
inv_downscale_wrt_txsize(out[s], tx_size)); - } - } -#endif -} -#endif // CONFIG_LGT_FROM_PRED - -void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if !CONFIG_DAALA_DCT4 - if (tx_type == DCT_DCT) { - aom_idct4x4_16_add(input, dest, stride); - return; - } -#endif - static const transform_2d IHT_4[] = { -#if CONFIG_DAALA_DCT4 - { daala_idct4, daala_idct4 }, // DCT_DCT = 0 - { daala_idst4, daala_idct4 }, // ADST_DCT = 1 - { daala_idct4, daala_idst4 }, // DCT_ADST = 2 - { daala_idst4, daala_idst4 }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { daala_idst4, daala_idct4 }, // FLIPADST_DCT - { daala_idct4, daala_idst4 }, // DCT_FLIPADST - { daala_idst4, daala_idst4 }, // FLIPADST_FLIPADST - { daala_idst4, daala_idst4 }, // ADST_FLIPADST - { daala_idst4, daala_idst4 }, // FLIPADST_ADST - { daala_idtx4, daala_idtx4 }, // IDTX - { daala_idct4, daala_idtx4 }, // V_DCT - { daala_idtx4, daala_idct4 }, // H_DCT - { daala_idst4, daala_idtx4 }, // V_ADST - { daala_idtx4, daala_idst4 }, // H_ADST - { daala_idst4, daala_idtx4 }, // V_FLIPADST - { daala_idtx4, daala_idst4 }, // H_FLIPADST -#endif -#else - { aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0 - { aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1 - { aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2 - { aom_iadst4_c, aom_iadst4_c }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { aom_iadst4_c, aom_idct4_c }, // FLIPADST_DCT - { aom_idct4_c, aom_iadst4_c }, // DCT_FLIPADST - { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_FLIPADST - { aom_iadst4_c, aom_iadst4_c }, // ADST_FLIPADST - { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_ADST - { iidtx4_c, iidtx4_c }, // IDTX - { aom_idct4_c, iidtx4_c }, // V_DCT - { iidtx4_c, aom_idct4_c }, // H_DCT - { aom_iadst4_c, iidtx4_c }, // V_ADST - { iidtx4_c, aom_iadst4_c }, // H_ADST - { aom_iadst4_c, iidtx4_c 
}, // V_FLIPADST - { iidtx4_c, aom_iadst4_c }, // H_FLIPADST -#endif -#endif - }; - - int i, j; - tran_low_t tmp[4][4]; - tran_low_t out[4][4]; - tran_low_t *outp = &out[0][0]; - int outstride = 4; - -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors - for (i = 0; i < 4; ++i) { -#if CONFIG_DAALA_DCT4 - tran_low_t temp_in[4]; - for (j = 0; j < 4; j++) temp_in[j] = input[j] * 2; - IHT_4[tx_type].rows(temp_in, out[i]); -#else -#if CONFIG_LGT - if (use_lgt_row) - ilgt4(input, out[i], lgtmtx_row[0]); - else -#endif - IHT_4[tx_type].rows(input, out[i]); -#endif - input += 4; - } - - // transpose - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - tmp[j][i] = out[i][j]; - } - } - - // inverse transform column vectors - for (i = 0; i < 4; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt4(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_4[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4); -#endif - - // Sum with the destination - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) { - int d = i * stride + j; - int s = j * outstride + i; -#if CONFIG_DAALA_DCT4 - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); -#else - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); -#endif - } - } -} - -void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_4x8[] = { - { aom_idct8_c, aom_idct4_c }, // DCT_DCT - { 
aom_iadst8_c, aom_idct4_c }, // ADST_DCT - { aom_idct8_c, aom_iadst4_c }, // DCT_ADST - { aom_iadst8_c, aom_iadst4_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst8_c, aom_idct4_c }, // FLIPADST_DCT - { aom_idct8_c, aom_iadst4_c }, // DCT_FLIPADST - { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_FLIPADST - { aom_iadst8_c, aom_iadst4_c }, // ADST_FLIPADST - { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_ADST - { iidtx8_c, iidtx4_c }, // IDTX - { aom_idct8_c, iidtx4_c }, // V_DCT - { iidtx8_c, aom_idct4_c }, // H_DCT - { aom_iadst8_c, iidtx4_c }, // V_ADST - { iidtx8_c, aom_iadst4_c }, // H_ADST - { aom_iadst8_c, iidtx4_c }, // V_FLIPADST - { iidtx8_c, aom_iadst4_c }, // H_FLIPADST -#endif - }; - - const int n = 4; - const int n2 = 8; - int i, j; - tran_low_t out[4][8], tmp[4][8], outtmp[4]; - tran_low_t *outp = &out[0][0]; - int outstride = n2; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n2; ++i) { -#if CONFIG_LGT - if (use_lgt_row) - ilgt4(input, outtmp, lgtmtx_row[0]); - else -#endif - IHT_4x8[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n; - } - - // inverse transform column vectors - for (i = 0; i < n; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt8(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_4x8[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n); -#endif - - // Sum with the destination - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride, - 
const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_8x4[] = { - { aom_idct4_c, aom_idct8_c }, // DCT_DCT - { aom_iadst4_c, aom_idct8_c }, // ADST_DCT - { aom_idct4_c, aom_iadst8_c }, // DCT_ADST - { aom_iadst4_c, aom_iadst8_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst4_c, aom_idct8_c }, // FLIPADST_DCT - { aom_idct4_c, aom_iadst8_c }, // DCT_FLIPADST - { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_FLIPADST - { aom_iadst4_c, aom_iadst8_c }, // ADST_FLIPADST - { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_ADST - { iidtx4_c, iidtx8_c }, // IDTX - { aom_idct4_c, iidtx8_c }, // V_DCT - { iidtx4_c, aom_idct8_c }, // H_DCT - { aom_iadst4_c, iidtx8_c }, // V_ADST - { iidtx4_c, aom_iadst8_c }, // H_ADST - { aom_iadst4_c, iidtx8_c }, // V_FLIPADST - { iidtx4_c, aom_iadst8_c }, // H_FLIPADST -#endif - }; - - const int n = 4; - const int n2 = 8; - - int i, j; - tran_low_t out[8][4], tmp[8][4], outtmp[8]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { -#if CONFIG_LGT - if (use_lgt_row) - ilgt8(input, outtmp, lgtmtx_row[0]); - else -#endif - IHT_8x4[tx_type].rows(input, outtmp); - for (j = 0; j < n2; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n2; - } - - // inverse transform column vectors - for (i = 0; i < n2; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt4(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_8x4[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, 
&outstride, tx_type, n, n2); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -void av1_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_4x16[] = { - { aom_idct16_c, aom_idct4_c }, // DCT_DCT - { aom_iadst16_c, aom_idct4_c }, // ADST_DCT - { aom_idct16_c, aom_iadst4_c }, // DCT_ADST - { aom_iadst16_c, aom_iadst4_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst16_c, aom_idct4_c }, // FLIPADST_DCT - { aom_idct16_c, aom_iadst4_c }, // DCT_FLIPADST - { aom_iadst16_c, aom_iadst4_c }, // FLIPADST_FLIPADST - { aom_iadst16_c, aom_iadst4_c }, // ADST_FLIPADST - { aom_iadst16_c, aom_iadst4_c }, // FLIPADST_ADST - { iidtx16_c, iidtx4_c }, // IDTX - { aom_idct16_c, iidtx4_c }, // V_DCT - { iidtx16_c, aom_idct4_c }, // H_DCT - { aom_iadst16_c, iidtx4_c }, // V_ADST - { iidtx16_c, aom_iadst4_c }, // H_ADST - { aom_iadst16_c, iidtx4_c }, // V_FLIPADST - { iidtx16_c, aom_iadst4_c }, // H_FLIPADST -#endif - }; - - const int n = 4; - const int n4 = 16; - int i, j; - tran_low_t out[4][16], tmp[4][16], outtmp[4]; - tran_low_t *outp = &out[0][0]; - int outstride = n4; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n4; ++i) { -#if CONFIG_LGT - if (use_lgt_row) - ilgt4(input, outtmp, lgtmtx_row[0]); - else -#endif - IHT_4x16[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j]; - input += n; - } - - // inverse transform column vectors - for (i = 0; 
i < n; ++i) { - IHT_4x16[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n); -#endif - - // Sum with the destination - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -void av1_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_16x4[] = { - { aom_idct4_c, aom_idct16_c }, // DCT_DCT - { aom_iadst4_c, aom_idct16_c }, // ADST_DCT - { aom_idct4_c, aom_iadst16_c }, // DCT_ADST - { aom_iadst4_c, aom_iadst16_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst4_c, aom_idct16_c }, // FLIPADST_DCT - { aom_idct4_c, aom_iadst16_c }, // DCT_FLIPADST - { aom_iadst4_c, aom_iadst16_c }, // FLIPADST_FLIPADST - { aom_iadst4_c, aom_iadst16_c }, // ADST_FLIPADST - { aom_iadst4_c, aom_iadst16_c }, // FLIPADST_ADST - { iidtx4_c, iidtx16_c }, // IDTX - { aom_idct4_c, iidtx16_c }, // V_DCT - { iidtx4_c, aom_idct16_c }, // H_DCT - { aom_iadst4_c, iidtx16_c }, // V_ADST - { iidtx4_c, aom_iadst16_c }, // H_ADST - { aom_iadst4_c, iidtx16_c }, // V_FLIPADST - { iidtx4_c, aom_iadst16_c }, // H_FLIPADST -#endif - }; - - const int n = 4; - const int n4 = 16; - - int i, j; - tran_low_t out[16][4], tmp[16][4], outtmp[16]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { - IHT_16x4[tx_type].rows(input, outtmp); - for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j]; - input += n4; - } - - // inverse 
transform column vectors - for (i = 0; i < n4; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt4(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_16x4[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_8x16[] = { - { aom_idct16_c, aom_idct8_c }, // DCT_DCT - { aom_iadst16_c, aom_idct8_c }, // ADST_DCT - { aom_idct16_c, aom_iadst8_c }, // DCT_ADST - { aom_iadst16_c, aom_iadst8_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst16_c, aom_idct8_c }, // FLIPADST_DCT - { aom_idct16_c, aom_iadst8_c }, // DCT_FLIPADST - { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_FLIPADST - { aom_iadst16_c, aom_iadst8_c }, // ADST_FLIPADST - { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_ADST - { iidtx16_c, iidtx8_c }, // IDTX - { aom_idct16_c, iidtx8_c }, // V_DCT - { iidtx16_c, aom_idct8_c }, // H_DCT - { aom_iadst16_c, iidtx8_c }, // V_ADST - { iidtx16_c, aom_iadst8_c }, // H_ADST - { aom_iadst16_c, iidtx8_c }, // V_FLIPADST - { iidtx16_c, aom_iadst8_c }, // H_FLIPADST -#endif - }; - - const int n = 8; - const int n2 = 16; - int i, j; - tran_low_t out[8][16], tmp[8][16], outtmp[8]; - tran_low_t *outp = &out[0][0]; - int outstride = n2; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n2; 
++i) { -#if CONFIG_LGT - if (use_lgt_row) - ilgt8(input, outtmp, lgtmtx_row[0]); - else -#endif - IHT_8x16[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n; - } - - // inverse transform column vectors - for (i = 0; i < n; ++i) { - IHT_8x16[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n); -#endif - - // Sum with the destination - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_16x8[] = { - { aom_idct8_c, aom_idct16_c }, // DCT_DCT - { aom_iadst8_c, aom_idct16_c }, // ADST_DCT - { aom_idct8_c, aom_iadst16_c }, // DCT_ADST - { aom_iadst8_c, aom_iadst16_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst8_c, aom_idct16_c }, // FLIPADST_DCT - { aom_idct8_c, aom_iadst16_c }, // DCT_FLIPADST - { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_FLIPADST - { aom_iadst8_c, aom_iadst16_c }, // ADST_FLIPADST - { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_ADST - { iidtx8_c, iidtx16_c }, // IDTX - { aom_idct8_c, iidtx16_c }, // V_DCT - { iidtx8_c, aom_idct16_c }, // H_DCT - { aom_iadst8_c, iidtx16_c }, // V_ADST - { iidtx8_c, aom_iadst16_c }, // H_ADST - { aom_iadst8_c, iidtx16_c }, // V_FLIPADST - { iidtx8_c, aom_iadst16_c }, // H_FLIPADST -#endif - }; - - const int n = 8; - const int n2 = 16; - - int i, j; - tran_low_t out[16][8], tmp[16][8], outtmp[16]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - -#if CONFIG_LGT - 
const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { - IHT_16x8[tx_type].rows(input, outtmp); - for (j = 0; j < n2; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n2; - } - - // inverse transform column vectors - for (i = 0; i < n2; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt8(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_16x8[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_8x32[] = { - { aom_idct32_c, aom_idct8_c }, // DCT_DCT - { ihalfright32_c, aom_idct8_c }, // ADST_DCT - { aom_idct32_c, aom_iadst8_c }, // DCT_ADST - { ihalfright32_c, aom_iadst8_c }, // ADST_ADST -#if CONFIG_EXT_TX - { ihalfright32_c, aom_idct8_c }, // FLIPADST_DCT - { aom_idct32_c, aom_iadst8_c }, // DCT_FLIPADST - { ihalfright32_c, aom_iadst8_c }, // FLIPADST_FLIPADST - { ihalfright32_c, aom_iadst8_c }, // ADST_FLIPADST - { ihalfright32_c, aom_iadst8_c }, // FLIPADST_ADST - { iidtx32_c, iidtx8_c }, // IDTX - { aom_idct32_c, iidtx8_c }, // V_DCT - { iidtx32_c, aom_idct8_c }, // H_DCT - { ihalfright32_c, iidtx8_c }, // V_ADST - { iidtx32_c, aom_iadst8_c }, // H_ADST - { ihalfright32_c, iidtx8_c }, // V_FLIPADST - { iidtx32_c, aom_iadst8_c }, // H_FLIPADST -#endif 
- }; - - const int n = 8; - const int n4 = 32; - int i, j; - tran_low_t out[8][32], tmp[8][32], outtmp[8]; - tran_low_t *outp = &out[0][0]; - int outstride = n4; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n4; ++i) { -#if CONFIG_LGT - if (use_lgt_row) - ilgt8(input, outtmp, lgtmtx_row[0]); - else -#endif - IHT_8x32[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j]; - input += n; - } - - // inverse transform column vectors - for (i = 0; i < n; ++i) { - IHT_8x32[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n); -#endif - - // Sum with the destination - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_32x8[] = { - { aom_idct8_c, aom_idct32_c }, // DCT_DCT - { aom_iadst8_c, aom_idct32_c }, // ADST_DCT - { aom_idct8_c, ihalfright32_c }, // DCT_ADST - { aom_iadst8_c, ihalfright32_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst8_c, aom_idct32_c }, // FLIPADST_DCT - { aom_idct8_c, ihalfright32_c }, // DCT_FLIPADST - { aom_iadst8_c, ihalfright32_c }, // FLIPADST_FLIPADST - { aom_iadst8_c, ihalfright32_c }, // ADST_FLIPADST - { aom_iadst8_c, ihalfright32_c }, // FLIPADST_ADST - { iidtx8_c, iidtx32_c }, // IDTX - { aom_idct8_c, iidtx32_c }, // V_DCT - { iidtx8_c, aom_idct32_c }, // H_DCT - { aom_iadst8_c, iidtx32_c }, // V_ADST - { 
iidtx8_c, ihalfright32_c }, // H_ADST - { aom_iadst8_c, iidtx32_c }, // V_FLIPADST - { iidtx8_c, ihalfright32_c }, // H_FLIPADST -#endif - }; - - const int n = 8; - const int n4 = 32; - - int i, j; - tran_low_t out[32][8], tmp[32][8], outtmp[32]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); -#endif - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { - IHT_32x8[tx_type].rows(input, outtmp); - for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j]; - input += n4; - } - - // inverse transform column vectors - for (i = 0; i < n4; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt8(tmp[i], out[i], lgtmtx_col[0]); - else -#endif - IHT_32x8[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_16x32[] = { - { aom_idct32_c, aom_idct16_c }, // DCT_DCT - { ihalfright32_c, aom_idct16_c }, // ADST_DCT - { aom_idct32_c, aom_iadst16_c }, // DCT_ADST - { ihalfright32_c, aom_iadst16_c }, // ADST_ADST -#if CONFIG_EXT_TX - { ihalfright32_c, aom_idct16_c }, // FLIPADST_DCT - { aom_idct32_c, aom_iadst16_c }, // DCT_FLIPADST - { ihalfright32_c, aom_iadst16_c }, // FLIPADST_FLIPADST - { ihalfright32_c, aom_iadst16_c }, // ADST_FLIPADST - { ihalfright32_c, aom_iadst16_c }, // FLIPADST_ADST - { 
iidtx32_c, iidtx16_c }, // IDTX - { aom_idct32_c, iidtx16_c }, // V_DCT - { iidtx32_c, aom_idct16_c }, // H_DCT - { ihalfright32_c, iidtx16_c }, // V_ADST - { iidtx32_c, aom_iadst16_c }, // H_ADST - { ihalfright32_c, iidtx16_c }, // V_FLIPADST - { iidtx32_c, aom_iadst16_c }, // H_FLIPADST -#endif - }; - - const int n = 16; - const int n2 = 32; - int i, j; - tran_low_t out[16][32], tmp[16][32], outtmp[16]; - tran_low_t *outp = &out[0][0]; - int outstride = n2; - - // inverse transform row vectors and transpose - for (i = 0; i < n2; ++i) { - IHT_16x32[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n; - } - - // inverse transform column vectors - for (i = 0; i < n; ++i) IHT_16x32[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n); -#endif - - // Sum with the destination - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_32x16[] = { - { aom_idct16_c, aom_idct32_c }, // DCT_DCT - { aom_iadst16_c, aom_idct32_c }, // ADST_DCT - { aom_idct16_c, ihalfright32_c }, // DCT_ADST - { aom_iadst16_c, ihalfright32_c }, // ADST_ADST -#if CONFIG_EXT_TX - { aom_iadst16_c, aom_idct32_c }, // FLIPADST_DCT - { aom_idct16_c, ihalfright32_c }, // DCT_FLIPADST - { aom_iadst16_c, ihalfright32_c }, // FLIPADST_FLIPADST - { aom_iadst16_c, ihalfright32_c }, // ADST_FLIPADST - { aom_iadst16_c, ihalfright32_c }, // FLIPADST_ADST - { iidtx16_c, 
iidtx32_c }, // IDTX - { aom_idct16_c, iidtx32_c }, // V_DCT - { iidtx16_c, aom_idct32_c }, // H_DCT - { aom_iadst16_c, iidtx32_c }, // V_ADST - { iidtx16_c, ihalfright32_c }, // H_ADST - { aom_iadst16_c, iidtx32_c }, // V_FLIPADST - { iidtx16_c, ihalfright32_c }, // H_FLIPADST -#endif - }; - const int n = 16; - const int n2 = 32; - - int i, j; - tran_low_t out[32][16], tmp[32][16], outtmp[32]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { - IHT_32x16[tx_type].rows(input, outtmp); - for (j = 0; j < n2; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); - input += n2; - } - - // inverse transform column vectors - for (i = 0; i < n2; ++i) IHT_32x16[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); - } - } -} - -void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_8[] = { -#if CONFIG_DAALA_DCT8 - { daala_idct8, daala_idct8 }, // DCT_DCT = 0 - { daala_idst8, daala_idct8 }, // ADST_DCT = 1 - { daala_idct8, daala_idst8 }, // DCT_ADST = 2 - { daala_idst8, daala_idst8 }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { daala_idst8, daala_idct8 }, // FLIPADST_DCT - { daala_idct8, daala_idst8 }, // DCT_FLIPADST - { daala_idst8, daala_idst8 }, // FLIPADST_FLIPADST - { daala_idst8, daala_idst8 }, // ADST_FLIPADST - { daala_idst8, daala_idst8 }, // FLIPADST_ADST - { daala_idtx8, daala_idtx8 }, 
// IDTX - { daala_idct8, daala_idtx8 }, // V_DCT - { daala_idtx8, daala_idct8 }, // H_DCT - { daala_idst8, daala_idtx8 }, // V_ADST - { daala_idtx8, daala_idst8 }, // H_ADST - { daala_idst8, daala_idtx8 }, // V_FLIPADST - { daala_idtx8, daala_idst8 }, // H_FLIPADST -#endif -#else - { aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0 - { aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1 - { aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2 - { aom_iadst8_c, aom_iadst8_c }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { aom_iadst8_c, aom_idct8_c }, // FLIPADST_DCT - { aom_idct8_c, aom_iadst8_c }, // DCT_FLIPADST - { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_FLIPADST - { aom_iadst8_c, aom_iadst8_c }, // ADST_FLIPADST - { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_ADST - { iidtx8_c, iidtx8_c }, // IDTX - { aom_idct8_c, iidtx8_c }, // V_DCT - { iidtx8_c, aom_idct8_c }, // H_DCT - { aom_iadst8_c, iidtx8_c }, // V_ADST - { iidtx8_c, aom_iadst8_c }, // H_ADST - { aom_iadst8_c, iidtx8_c }, // V_FLIPADST - { iidtx8_c, aom_iadst8_c }, // H_FLIPADST -#endif -#endif - }; - - int i, j; - tran_low_t tmp[8][8]; - tran_low_t out[8][8]; - tran_low_t *outp = &out[0][0]; - int outstride = 8; - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // inverse transform row vectors - for (i = 0; i < 8; ++i) { -#if CONFIG_DAALA_DCT8 - tran_low_t temp_in[8]; - for (j = 0; j < 8; j++) temp_in[j] = input[j] * 2; - IHT_8[tx_type].rows(temp_in, out[i]); -#else -#if CONFIG_LGT - if (use_lgt_row) - ilgt8(input, out[i], lgtmtx_row[0]); - else -#endif - IHT_8[tx_type].rows(input, out[i]); -#endif - input += 8; - } - - // transpose - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - tmp[j][i] = out[i][j]; - } - } - - // inverse transform column vectors - for (i = 0; i < 8; ++i) { -#if CONFIG_LGT - if (use_lgt_col) - ilgt8(tmp[i], out[i], lgtmtx_col[0]); - else 
-#endif - IHT_8[tx_type].cols(tmp[i], out[i]); - } - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8); -#endif - - // Sum with the destination - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) { - int d = i * stride + j; - int s = j * outstride + i; -#if CONFIG_DAALA_DCT8 - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); -#else - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); -#endif - } - } -} - -void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_16[] = { -#if CONFIG_DAALA_DCT16 - { daala_idct16, daala_idct16 }, // DCT_DCT = 0 - { daala_idst16, daala_idct16 }, // ADST_DCT = 1 - { daala_idct16, daala_idst16 }, // DCT_ADST = 2 - { daala_idst16, daala_idst16 }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { daala_idst16, daala_idct16 }, // FLIPADST_DCT - { daala_idct16, daala_idst16 }, // DCT_FLIPADST - { daala_idst16, daala_idst16 }, // FLIPADST_FLIPADST - { daala_idst16, daala_idst16 }, // ADST_FLIPADST - { daala_idst16, daala_idst16 }, // FLIPADST_ADST - { daala_idtx16, daala_idtx16 }, // IDTX - { daala_idct16, daala_idtx16 }, // V_DCT - { daala_idtx16, daala_idct16 }, // H_DCT - { daala_idst16, daala_idtx16 }, // V_ADST - { daala_idtx16, daala_idst16 }, // H_ADST - { daala_idst16, daala_idtx16 }, // V_FLIPADST - { daala_idtx16, daala_idst16 }, // H_FLIPADST -#endif -#else - { aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0 - { aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1 - { aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2 - { aom_iadst16_c, aom_iadst16_c }, // ADST_ADST = 3 -#if CONFIG_EXT_TX - { aom_iadst16_c, aom_idct16_c }, // FLIPADST_DCT - { aom_idct16_c, aom_iadst16_c }, // 
DCT_FLIPADST - { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_FLIPADST - { aom_iadst16_c, aom_iadst16_c }, // ADST_FLIPADST - { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_ADST - { iidtx16_c, iidtx16_c }, // IDTX - { aom_idct16_c, iidtx16_c }, // V_DCT - { iidtx16_c, aom_idct16_c }, // H_DCT - { aom_iadst16_c, iidtx16_c }, // V_ADST - { iidtx16_c, aom_iadst16_c }, // H_ADST - { aom_iadst16_c, iidtx16_c }, // V_FLIPADST - { iidtx16_c, aom_iadst16_c }, // H_FLIPADST -#endif -#endif - }; - - int i, j; - tran_low_t tmp[16][16]; - tran_low_t out[16][16]; - tran_low_t *outp = &out[0][0]; - int outstride = 16; - - // inverse transform row vectors - for (i = 0; i < 16; ++i) { -#if CONFIG_DAALA_DCT16 - tran_low_t temp_in[16]; - for (j = 0; j < 16; j++) temp_in[j] = input[j] * 2; - IHT_16[tx_type].rows(temp_in, out[i]); -#else - IHT_16[tx_type].rows(input, out[i]); -#endif - input += 16; - } - - // transpose - for (i = 0; i < 16; i++) { - for (j = 0; j < 16; j++) { - tmp[j][i] = out[i][j]; - } - } - - // inverse transform column vectors - for (i = 0; i < 16; ++i) IHT_16[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16); -#endif - - // Sum with the destination - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) { - int d = i * stride + j; - int s = j * outstride + i; -#if CONFIG_DAALA_DCT16 - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); -#else - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); -#endif - } - } -} - -#if CONFIG_EXT_TX || CONFIG_DAALA_DCT32 -void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_32[] = { -#if CONFIG_DAALA_DCT32 - { daala_idct32, daala_idct32 }, // DCT_DCT -#if CONFIG_EXT_TX - { daala_idst32, daala_idct32 }, // ADST_DCT - { daala_idct32, 
daala_idst32 }, // DCT_ADST - { daala_idst32, daala_idst32 }, // ADST_ADST - { daala_idst32, daala_idct32 }, // FLIPADST_DCT - { daala_idct32, daala_idst32 }, // DCT_FLIPADST - { daala_idst32, daala_idst32 }, // FLIPADST_FLIPADST - { daala_idst32, daala_idst32 }, // ADST_FLIPADST - { daala_idst32, daala_idst32 }, // FLIPADST_ADST - { daala_idtx32, daala_idtx32 }, // IDTX - { daala_idct32, daala_idtx32 }, // V_DCT - { daala_idtx32, daala_idct32 }, // H_DCT - { daala_idst32, daala_idtx32 }, // V_ADST - { daala_idtx32, daala_idst32 }, // H_ADST - { daala_idst32, daala_idtx32 }, // V_FLIPADST - { daala_idtx32, daala_idst32 }, // H_FLIPADST -#endif -#else - { aom_idct32_c, aom_idct32_c }, // DCT_DCT -#if CONFIG_EXT_TX - { ihalfright32_c, aom_idct32_c }, // ADST_DCT - { aom_idct32_c, ihalfright32_c }, // DCT_ADST - { ihalfright32_c, ihalfright32_c }, // ADST_ADST - { ihalfright32_c, aom_idct32_c }, // FLIPADST_DCT - { aom_idct32_c, ihalfright32_c }, // DCT_FLIPADST - { ihalfright32_c, ihalfright32_c }, // FLIPADST_FLIPADST - { ihalfright32_c, ihalfright32_c }, // ADST_FLIPADST - { ihalfright32_c, ihalfright32_c }, // FLIPADST_ADST - { iidtx32_c, iidtx32_c }, // IDTX - { aom_idct32_c, iidtx32_c }, // V_DCT - { iidtx32_c, aom_idct32_c }, // H_DCT - { ihalfright32_c, iidtx32_c }, // V_ADST - { iidtx32_c, ihalfright32_c }, // H_ADST - { ihalfright32_c, iidtx32_c }, // V_FLIPADST - { iidtx32_c, ihalfright32_c }, // H_FLIPADST -#endif -#endif - }; - - int i, j; - tran_low_t tmp[32][32]; - tran_low_t out[32][32]; - tran_low_t *outp = &out[0][0]; - int outstride = 32; - - // inverse transform row vectors - for (i = 0; i < 32; ++i) { -#if CONFIG_DAALA_DCT32 - tran_low_t temp_in[32]; - for (j = 0; j < 32; j++) temp_in[j] = input[j] * 2; - IHT_32[tx_type].rows(temp_in, out[i]); -#else - IHT_32[tx_type].rows(input, out[i]); -#endif - input += 32; - } - - // transpose - for (i = 0; i < 32; i++) { - for (j = 0; j < 32; j++) { -#if CONFIG_DAALA_DCT32 - tmp[j][i] = out[i][j] * 4; -#else 
- tmp[j][i] = out[i][j]; -#endif - } - } - - // inverse transform column vectors - for (i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]); - - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32); - - // Sum with the destination - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - int d = i * stride + j; - int s = j * outstride + i; -#if CONFIG_DAALA_DCT32 - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); -#else - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6)); -#endif - } - } -} -#endif // CONFIG_EXT_TX || CONFIG_DAALA_DCT32 - -#if CONFIG_TX64X64 -void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_64[] = { -#if CONFIG_DAALA_DCT64 - { daala_idct64, daala_idct64 }, // DCT_DCT - { daala_idst64, daala_idct64 }, // ADST_DCT - { daala_idct64, daala_idst64 }, // DCT_ADST - { daala_idst64, daala_idst64 }, // ADST_ADST -#if CONFIG_EXT_TX - { daala_idst64, daala_idct64 }, // FLIPADST_DCT - { daala_idct64, daala_idst64 }, // DCT_FLIPADST - { daala_idst64, daala_idst64 }, // FLIPADST_FLIPADST - { daala_idst64, daala_idst64 }, // ADST_FLIPADST - { daala_idst64, daala_idst64 }, // FLIPADST_ADST - { daala_idtx64, daala_idtx64 }, // IDTX - { daala_idct64, daala_idtx64 }, // V_DCT - { daala_idtx64, daala_idct64 }, // H_DCT - { daala_idst64, daala_idtx64 }, // V_ADST - { daala_idtx64, daala_idst64 }, // H_ADST - { daala_idst64, daala_idtx64 }, // V_FLIPADST - { daala_idtx64, daala_idst64 }, // H_FLIPADST -#endif -#else - { idct64_col_c, idct64_row_c }, // DCT_DCT - { ihalfright64_c, idct64_row_c }, // ADST_DCT - { idct64_col_c, ihalfright64_c }, // DCT_ADST - { ihalfright64_c, ihalfright64_c }, // ADST_ADST 
-#if CONFIG_EXT_TX - { ihalfright64_c, idct64_row_c }, // FLIPADST_DCT - { idct64_col_c, ihalfright64_c }, // DCT_FLIPADST - { ihalfright64_c, ihalfright64_c }, // FLIPADST_FLIPADST - { ihalfright64_c, ihalfright64_c }, // ADST_FLIPADST - { ihalfright64_c, ihalfright64_c }, // FLIPADST_ADST - { iidtx64_c, iidtx64_c }, // IDTX - { idct64_col_c, iidtx64_c }, // V_DCT - { iidtx64_c, idct64_row_c }, // H_DCT - { ihalfright64_c, iidtx64_c }, // V_ADST - { iidtx64_c, ihalfright64_c }, // H_ADST - { ihalfright64_c, iidtx64_c }, // V_FLIPADST - { iidtx64_c, ihalfright64_c }, // H_FLIPADST -#endif -#endif - }; - - int i, j; - tran_low_t tmp[64][64]; - tran_low_t out[64][64]; - tran_low_t *outp = &out[0][0]; - int outstride = 64; - - // inverse transform row vectors - for (i = 0; i < 64; ++i) { -#if CONFIG_DAALA_DCT64 - tran_low_t temp_in[64]; - for (j = 0; j < 64; j++) temp_in[j] = input[j] * 2; - IHT_64[tx_type].rows(temp_in, out[i]); -// Do not rescale intermediate for Daala -#else - IHT_64[tx_type].rows(input, out[i]); - for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1); -#endif - input += 64; - } - - // transpose - for (i = 0; i < 64; i++) { - for (j = 0; j < 64; j++) { - tmp[j][i] = out[i][j]; - } - } - - // inverse transform column vectors - for (i = 0; i < 64; ++i) IHT_64[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64); -#endif // CONFIG_EXT_TX - - // Sum with the destination - for (i = 0; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - int d = i * stride + j; - int s = j * outstride + i; -#if CONFIG_DAALA_DCT64 - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2)); -#else - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); -#endif - } - } -} - -void av1_iht64x32_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != 
MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_64x32[] = { - { aom_idct32_c, idct64_row_c }, // DCT_DCT - { ihalfright32_c, idct64_row_c }, // ADST_DCT - { aom_idct32_c, ihalfright64_c }, // DCT_ADST - { ihalfright32_c, ihalfright64_c }, // ADST_ADST -#if CONFIG_EXT_TX - { ihalfright32_c, idct64_row_c }, // FLIPADST_DCT - { aom_idct32_c, ihalfright64_c }, // DCT_FLIPADST - { ihalfright32_c, ihalfright64_c }, // FLIPADST_FLIPADST - { ihalfright32_c, ihalfright64_c }, // ADST_FLIPADST - { ihalfright32_c, ihalfright64_c }, // FLIPADST_ADST - { iidtx32_c, iidtx64_c }, // IDTX - { aom_idct32_c, iidtx64_c }, // V_DCT - { iidtx32_c, idct64_row_c }, // H_DCT - { ihalfright32_c, iidtx64_c }, // V_ADST - { iidtx32_c, ihalfright64_c }, // H_ADST - { ihalfright32_c, iidtx64_c }, // V_FLIPADST - { iidtx32_c, ihalfright64_c }, // H_FLIPADST -#endif - }; - const int n = 32; - const int n2 = 64; - - int i, j; - tran_low_t out[64][32], tmp[64][32], outtmp[64]; - tran_low_t *outp = &out[0][0]; - int outstride = n; - - // inverse transform row vectors and transpose - for (i = 0; i < n; ++i) { - IHT_64x32[tx_type].rows(input, outtmp); - for (j = 0; j < n2; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2); - input += n2; - } - - // inverse transform column vectors - for (i = 0; i < n2; ++i) IHT_64x32[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2); -#endif - - // Sum with the destination - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -void av1_iht32x64_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != 
MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d IHT_32x64[] = { - { idct64_col_c, aom_idct32_c }, // DCT_DCT - { ihalfright64_c, aom_idct32_c }, // ADST_DCT - { idct64_col_c, ihalfright32_c }, // DCT_ADST - { ihalfright64_c, ihalfright32_c }, // ADST_ADST -#if CONFIG_EXT_TX - { ihalfright64_c, aom_idct32_c }, // FLIPADST_DCT - { idct64_col_c, ihalfright32_c }, // DCT_FLIPADST - { ihalfright64_c, ihalfright32_c }, // FLIPADST_FLIPADST - { ihalfright64_c, ihalfright32_c }, // ADST_FLIPADST - { ihalfright64_c, ihalfright32_c }, // FLIPADST_ADST - { iidtx64_c, iidtx32_c }, // IDTX - { idct64_col_c, iidtx32_c }, // V_DCT - { iidtx64_c, aom_idct32_c }, // H_DCT - { ihalfright64_c, iidtx32_c }, // V_ADST - { iidtx64_c, ihalfright32_c }, // H_ADST - { ihalfright64_c, iidtx32_c }, // V_FLIPADST - { iidtx64_c, ihalfright32_c }, // H_FLIPADST -#endif - }; - - const int n = 32; - const int n2 = 64; - int i, j; - tran_low_t out[32][64], tmp[32][64], outtmp[32]; - tran_low_t *outp = &out[0][0]; - int outstride = n2; - - // inverse transform row vectors and transpose - for (i = 0; i < n2; ++i) { - IHT_32x64[tx_type].rows(input, outtmp); - for (j = 0; j < n; ++j) - tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2); - input += n; - } - - // inverse transform column vectors - for (i = 0; i < n; ++i) IHT_32x64[tx_type].cols(tmp[i], out[i]); - -#if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n); -#endif - - // Sum with the destination - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) { - int d = i * stride + j; - int s = j * outstride + i; - dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); - } - } -} - -#endif // CONFIG_TX64X64 - -// idct -void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const int eob = txfm_param->eob; - if (eob > 1) - 
av1_iht4x4_16_add(input, dest, stride, txfm_param); - else - aom_idct4x4_1_add(input, dest, stride); -} - -void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const int eob = txfm_param->eob; - if (eob > 1) - aom_iwht4x4_16_add(input, dest, stride); - else - aom_iwht4x4_1_add(input, dest, stride); -} - -#if !CONFIG_DAALA_DCT8 -static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -// If dc is 1, then input[0] is the reconstructed value, do not need -// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. - -// The calculation can be simplified if there are not many non-zero dct -// coefficients. Use eobs to decide what to do. -// TODO(yunqingwang): "eobs = 1" case is also handled in av1_short_idct8x8_c. -// Combine that with code here. -#if CONFIG_ADAPT_SCAN - const int16_t half = txfm_param->eob_threshold[0]; -#else - const int16_t half = 12; -#endif - - const int eob = txfm_param->eob; - if (eob == 1) - // DC only DCT coefficient - aom_idct8x8_1_add(input, dest, stride); - else if (eob <= half) - aom_idct8x8_12_add(input, dest, stride); - else - aom_idct8x8_64_add(input, dest, stride); -} -#endif - -#if !CONFIG_DAALA_DCT16 -static void idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -// The calculation can be simplified if there are not many non-zero dct -// coefficients. Use eobs to separate different cases. -#if CONFIG_ADAPT_SCAN - const int16_t half = txfm_param->eob_threshold[0]; - const int16_t quarter = txfm_param->eob_threshold[1]; -#else - const int16_t half = 38; - const int16_t quarter = 10; -#endif - - const int eob = txfm_param->eob; - if (eob == 1) /* DC only DCT coefficient. 
*/ - aom_idct16x16_1_add(input, dest, stride); - else if (eob <= quarter) - aom_idct16x16_10_add(input, dest, stride); - else if (eob <= half) - aom_idct16x16_38_add(input, dest, stride); - else - aom_idct16x16_256_add(input, dest, stride); -} -#endif - -#if CONFIG_MRC_TX -static void imrc32x32_add_c(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -#if CONFIG_ADAPT_SCAN - const int16_t half = txfm_param->eob_threshold[0]; - const int16_t quarter = txfm_param->eob_threshold[1]; -#else - const int16_t half = 135; - const int16_t quarter = 34; -#endif - - const int eob = txfm_param->eob; - int n_masked_vals = 0; - uint8_t *mask; - uint8_t mask_tmp[32 * 32]; - if (eob == 1) { - aom_idct32x32_1_add_c(input, dest, stride); - } else { - if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) || - (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) { - mask = txfm_param->mask; - } else { - n_masked_vals = - get_mrc_pred_mask(txfm_param->dst, txfm_param->stride, mask_tmp, 32, - 32, 32, txfm_param->is_inter); - if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) - assert(0 && "Invalid MRC mask"); - mask = mask_tmp; - } - if (eob <= quarter) - // non-zero coeff only in upper-left 8x8 - aom_imrc32x32_34_add_c(input, dest, stride, mask); - else if (eob <= half) - // non-zero coeff only in upper-left 16x16 - aom_imrc32x32_135_add_c(input, dest, stride, mask); - else - aom_imrc32x32_1024_add_c(input, dest, stride, mask); - } -} -#endif // CONFIG_MRC_TX - -#if !CONFIG_DAALA_DCT32 -static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -#if CONFIG_ADAPT_SCAN - const int16_t half = txfm_param->eob_threshold[0]; - const int16_t quarter = txfm_param->eob_threshold[1]; -#else - const int16_t half = 135; - const int16_t quarter = 34; -#endif - - const int eob = txfm_param->eob; - if (eob == 1) - aom_idct32x32_1_add(input, dest, stride); - else if (eob <= quarter) - // non-zero coeff only in upper-left 8x8 
- aom_idct32x32_34_add(input, dest, stride); - else if (eob <= half) - // non-zero coeff only in upper-left 16x16 - aom_idct32x32_135_add(input, dest, stride); - else - aom_idct32x32_1024_add(input, dest, stride); -} -#endif - -#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 -static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - (void)txfm_param; - av1_iht64x64_4096_add(input, dest, stride, txfm_param); -} -#endif // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 - -#if CONFIG_CHROMA_2X2 -static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT; - tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT; - tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT; - tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - (void)txfm_param; - - a1 = (a2 + b2) >> 2; - b1 = (a2 - b2) >> 2; - c1 = (c2 + d2) >> 2; - d1 = (c2 - d2) >> 2; - - dest[0] = clip_pixel_add(dest[0], WRAPLOW(a1)); - dest[1] = clip_pixel_add(dest[1], WRAPLOW(b1)); - dest[stride] = clip_pixel_add(dest[stride], WRAPLOW(c1)); - dest[stride + 1] = clip_pixel_add(dest[stride + 1], WRAPLOW(d1)); -} -#endif - -static void inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - if (txfm_param->lossless) { - assert(tx_type == DCT_DCT); - av1_iwht4x4_add(input, dest, stride, txfm_param); - return; - } - - switch (tx_type) { -#if !CONFIG_DAALA_DCT4 - case DCT_DCT: av1_idct4x4_add(input, dest, stride, txfm_param); break; -#else - case DCT_DCT: -#endif - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_LGT || CONFIG_DAALA_DCT4 - // LGT only exists in C verson - av1_iht4x4_16_add_c(input, dest, stride, txfm_param); - break; -#else - av1_iht4x4_16_add(input, dest, stride, 
txfm_param); - break; -#endif -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: -#if CONFIG_LGT || CONFIG_DAALA_DCT4 - av1_iht4x4_16_add_c(input, dest, stride, txfm_param); - break; -#else - av1_iht4x4_16_add(input, dest, stride, txfm_param); - break; -#endif - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // Use C version since DST only exists in C code - av1_iht4x4_16_add_c(input, dest, stride, txfm_param); - break; - case IDTX: inv_idtx_add_c(input, dest, stride, 4, 4, tx_type); break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} - -static void inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht4x8_32_add_c(input, dest, stride, txfm_param); -#else - av1_iht4x8_32_add(input, dest, stride, txfm_param); -#endif -} - -static void inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht8x4_32_add_c(input, dest, stride, txfm_param); -#else - av1_iht8x4_32_add(input, dest, stride, txfm_param); -#endif -} - -// These will be used by the masked-tx experiment in the future. 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) -static void inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht4x16_64_add_c(input, dest, stride, txfm_param); -#else - av1_iht4x16_64_add(input, dest, stride, txfm_param); -#endif -} - -static void inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht16x4_64_add_c(input, dest, stride, txfm_param); -#else - av1_iht16x4_64_add(input, dest, stride, txfm_param); -#endif -} +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -static void inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht8x32_256_add_c(input, dest, stride, txfm_param); -#else - av1_iht8x32_256_add(input, dest, stride, txfm_param); -#endif -} - -static void inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht32x8_256_add_c(input, dest, stride, txfm_param); -#else - av1_iht32x8_256_add(input, dest, stride, txfm_param); -#endif -} -#endif - -static void inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht8x16_128_add_c(input, dest, stride, txfm_param); -#else - av1_iht8x16_128_add(input, dest, stride, txfm_param); -#endif -} - -static void inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_iht16x8_128_add_c(input, dest, stride, txfm_param); -#else - av1_iht16x8_128_add(input, dest, stride, txfm_param); -#endif -} - -static void inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - av1_iht16x32_512_add(input, dest, stride, txfm_param); -} - -static void inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, - int stride, const 
TxfmParam *txfm_param) { - av1_iht32x16_512_add(input, dest, stride, txfm_param); -} - -#if CONFIG_TX64X64 -static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - av1_iht32x64_2048_add(input, dest, stride, txfm_param); -} - -static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - av1_iht64x32_2048_add(input, dest, stride, txfm_param); -} -#endif // CONFIG_TX64X64 - -static void inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - switch (tx_type) { -#if !CONFIG_DAALA_DCT8 - case DCT_DCT: idct8x8_add(input, dest, stride, txfm_param); break; -#else - case DCT_DCT: -#endif - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_LGT || CONFIG_DAALA_DCT8 - av1_iht8x8_64_add_c(input, dest, stride, txfm_param); - break; -#else - av1_iht8x8_64_add(input, dest, stride, txfm_param); - break; -#endif -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: -#if CONFIG_LGT || CONFIG_DAALA_DCT8 - av1_iht8x8_64_add_c(input, dest, stride, txfm_param); - break; -#else - av1_iht8x8_64_add(input, dest, stride, txfm_param); - break; -#endif - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // Use C version since DST only exists in C code - av1_iht8x8_64_add_c(input, dest, stride, txfm_param); - break; - case IDTX: inv_idtx_add_c(input, dest, stride, 8, 8, tx_type); break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} - -static void inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - switch (tx_type) { -#if !CONFIG_DAALA_DCT16 - case DCT_DCT: idct16x16_add(input, dest, stride, txfm_param); break; -#else - case DCT_DCT: 
-#endif - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_DAALA_DCT16 - av1_iht16x16_256_add_c(input, dest, stride, txfm_param); -#else - av1_iht16x16_256_add(input, dest, stride, txfm_param); -#endif // CONFIG_DAALA_DCT16 - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: -#if CONFIG_DAALA_DCT16 - av1_iht16x16_256_add_c(input, dest, stride, txfm_param); -#else - av1_iht16x16_256_add(input, dest, stride, txfm_param); -#endif // CONFIG_DAALA_DCT16 - break; - case IDTX: inv_idtx_add_c(input, dest, stride, 16, 16, tx_type); break; -#endif // CONFIG_EXT_TX -#if CONFIG_MRC_TX - case MRC_DCT: assert(0 && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX - default: assert(0); break; - } -} +#include "aom_ports/mem.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" -static void inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - switch (tx_type) { -#if !CONFIG_DAALA_DCT32 - case DCT_DCT: idct32x32_add(input, dest, stride, txfm_param); break; -#else - case DCT_DCT: - av1_iht32x32_1024_add_c(input, dest, stride, txfm_param); - break; -#endif -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - av1_iht32x32_1024_add_c(input, dest, stride, txfm_param); - break; - case IDTX: inv_idtx_add_c(input, dest, stride, 32, 32, tx_type); break; -#endif // CONFIG_EXT_TX -#if CONFIG_MRC_TX - case MRC_DCT: imrc32x32_add_c(input, dest, stride, 
txfm_param); break; -#endif // CONFIG_MRC_TX - default: assert(0); break; - } +int av1_get_tx_scale(const TX_SIZE tx_size) { + const int pels = tx_size_2d[tx_size]; + // Largest possible pels is 4096 (64x64). + return (pels > 256) + (pels > 1024); } -#if CONFIG_TX64X64 -static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - assert(tx_type == DCT_DCT); - switch (tx_type) { -#if !CONFIG_DAALA_DCT64 - case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break; -#else - case DCT_DCT: -#endif -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - av1_iht64x64_4096_add_c(input, dest, stride, txfm_param); - break; - case IDTX: inv_idtx_add_c(input, dest, stride, 64, 64, tx_type); break; -#endif // CONFIG_EXT_TX -#if CONFIG_MRC_TX - case MRC_DCT: assert(0 && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX - default: assert(0); break; - } -} -#endif // CONFIG_TX64X64 +// NOTE: The implementation of all inverses need to be aware of the fact +// that input and output could be the same buffer. 
-void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd) { +// idct +static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd) { if (eob > 1) - aom_highbd_iwht4x4_16_add(input, dest, stride, bd); + av1_highbd_iwht4x4_16_add(input, dest, stride, bd); else - aom_highbd_iwht4x4_1_add(input, dest, stride, bd); -} - -#if CONFIG_CHROMA_2X2 -static void highbd_inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - int eob = txfm_param->eob; - int bd = txfm_param->bd; - int lossless = txfm_param->lossless; - const TX_TYPE tx_type = txfm_param->tx_type; - tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT; - tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT; - tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT; - tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); - - (void)tx_type; - (void)lossless; - (void)eob; - - a1 = (a2 + b2) >> 2; - b1 = (a2 - b2) >> 2; - c1 = (c2 + d2) >> 2; - d1 = (c2 - d2) >> 2; - - dst[0] = highbd_clip_pixel_add(dst[0], a1, bd); - dst[1] = highbd_clip_pixel_add(dst[1], b1, bd); - dst[stride] = highbd_clip_pixel_add(dst[stride], c1, bd); - dst[stride + 1] = highbd_clip_pixel_add(dst[stride + 1], d1, bd); + av1_highbd_iwht4x4_1_add(input, dest, stride, bd); } -#endif static const int32_t *cast_to_int32(const tran_low_t *input) { assert(sizeof(int32_t) == sizeof(tran_low_t)); @@ -2636,6 +46,7 @@ static const int32_t *cast_to_int32(const tran_low_t *input) { void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; int lossless = txfm_param->lossless; @@ -2643,27 +54,12 @@ void 
av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); - av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - // use the c version for anything including identity for now + // Assembly version doesn't support some transform types, so use C version + // for those. case V_DCT: case H_DCT: case V_ADST: @@ -2674,68 +70,112 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); break; + default: + av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; } } -void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, + int stride, const 
TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam 
*txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -#if CONFIG_TX64X64 static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x64(src, 
CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -#endif // CONFIG_TX64X64 static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { @@ -2743,23 +183,8 @@ static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - // use the c version for anything including identity for now + // Assembly version doesn't support some transform types, so use C version + // for those. 
case V_DCT: case H_DCT: case V_ADST: @@ -2770,8 +195,10 @@ static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; } } @@ -2781,23 +208,8 @@ static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - // use the c version for anything including identity for now + // Assembly version doesn't support some transform types, so use C version + // for those. 
case V_DCT: case H_DCT: case V_ADST: @@ -2808,14 +220,16 @@ static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; } } static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { - int bd = txfm_param->bd; + const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { @@ -2823,26 +237,8 @@ static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; - - // The optimised version only supports DCT_DCT, so force use of - // the C version for all other transform types. - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: + // Assembly version doesn't support IDTX, so use C version for it. 
case IDTX: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: -#endif // CONFIG_EXT_TX av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; @@ -2851,225 +247,34 @@ static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, } } -#if CONFIG_TX64X64 static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { - int bd = txfm_param->bd; + const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, DCT_DCT, - bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. 
- av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, - DCT_DCT, bd); - break; - case IDTX: - highbd_inv_idtx_add_c(input, dest, stride, 64, 64, tx_type, bd); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} -#endif // CONFIG_TX64X64 - -void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, - TxfmParam *txfm_param) { - const TX_SIZE tx_size = txfm_param->tx_size; -#if CONFIG_LGT_FROM_PRED - if (txfm_param->use_lgt) { - assert(is_lgt_allowed(txfm_param->mode, tx_size)); - ilgt2d_from_pred_add(input, dest, stride, txfm_param); - return; - } -#endif // CONFIG_LGT_FROM_PRED - switch (tx_size) { -#if CONFIG_TX64X64 - case TX_64X64: inv_txfm_add_64x64(input, dest, stride, txfm_param); break; -#endif // CONFIG_TX64X64 - case TX_32X32: inv_txfm_add_32x32(input, dest, stride, txfm_param); break; - case TX_16X16: inv_txfm_add_16x16(input, dest, stride, txfm_param); break; - case TX_8X8: inv_txfm_add_8x8(input, dest, stride, txfm_param); break; - case TX_4X8: inv_txfm_add_4x8(input, dest, stride, txfm_param); break; - case TX_8X4: inv_txfm_add_8x4(input, dest, stride, txfm_param); break; - case TX_8X16: inv_txfm_add_8x16(input, dest, stride, txfm_param); break; - case TX_16X8: inv_txfm_add_16x8(input, dest, stride, txfm_param); break; - case TX_16X32: inv_txfm_add_16x32(input, dest, stride, txfm_param); break; - case TX_32X16: inv_txfm_add_32x16(input, dest, stride, txfm_param); break; -#if CONFIG_TX64X64 - case TX_64X32: inv_txfm_add_64x32(input, dest, stride, txfm_param); break; - case TX_32X64: inv_txfm_add_32x64(input, dest, stride, txfm_param); break; -#endif // CONFIG_TX64X64 - case TX_4X4: - // this is like av1_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. 
- inv_txfm_add_4x4(input, dest, stride, txfm_param); - break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: inv_txfm_add_2x2(input, dest, stride, txfm_param); break; -#endif -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_32X8: inv_txfm_add_32x8(input, dest, stride, txfm_param); break; - case TX_8X32: inv_txfm_add_8x32(input, dest, stride, txfm_param); break; - case TX_16X4: inv_txfm_add_16x4(input, dest, stride, txfm_param); break; - case TX_4X16: inv_txfm_add_4x16(input, dest, stride, txfm_param); break; -#endif - default: assert(0 && "Invalid transform size"); break; - } + assert(tx_type == DCT_DCT); + av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -static void init_txfm_param(const MACROBLOCKD *xd, TX_SIZE tx_size, - TX_TYPE tx_type, int eob, TxfmParam *txfm_param) { +static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, int eob, int reduced_tx_set, + TxfmParam *txfm_param) { + (void)plane; txfm_param->tx_type = tx_type; txfm_param->tx_size = tx_size; txfm_param->eob = eob; - txfm_param->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; + txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; txfm_param->bd = xd->bd; -#if CONFIG_LGT - txfm_param->is_inter = is_inter_block(&xd->mi[0]->mbmi); -#endif -#if CONFIG_LGT_FROM_PRED - txfm_param->use_lgt = xd->mi[0]->mbmi.use_lgt; -#endif -#if CONFIG_ADAPT_SCAN - txfm_param->eob_threshold = - (const int16_t *)&xd->eob_threshold_md[tx_size][tx_type][0]; -#endif -} - -#if !CONFIG_TXMG -typedef void (*InvTxfmFunc)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, - TxfmParam *txfm_param); - -static InvTxfmFunc inv_txfm_func[2] = { av1_inv_txfm_add, - av1_highbd_inv_txfm_add }; -#endif - -void av1_inverse_transform_block(const MACROBLOCKD *xd, - const tran_low_t *dqcoeff, -#if CONFIG_LGT_FROM_PRED - PREDICTION_MODE mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask, -#endif // CONFIG_MRC_TX && 
SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, - int stride, int eob) { - if (!eob) return; -#if CONFIG_PVQ - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; - const int txb_width = block_size_wide[tx_bsize]; - const int txb_height = block_size_high[tx_bsize]; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (int r = 0; r < txb_height; r++) - for (int c = 0; c < txb_width; c++) - CONVERT_TO_SHORTPTR(dst)[r * stride + c] = 0; - } else { - for (int r = 0; r < txb_height; r++) - for (int c = 0; c < txb_width; c++) dst[r * stride + c] = 0; - } -#endif // CONFIG_PVQ - TxfmParam txfm_param; - init_txfm_param(xd, tx_size, tx_type, eob, &txfm_param); -#if CONFIG_LGT || CONFIG_MRC_TX - txfm_param.is_inter = is_inter_block(&xd->mi[0]->mbmi); -#endif // CONFIG_LGT || CONFIG_MRC_TX -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - txfm_param.mask = mrc_mask; -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK -#if CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - txfm_param.dst = dst; - txfm_param.stride = stride; -#if CONFIG_LGT_FROM_PRED - txfm_param.mode = mode; -#endif // CONFIG_LGT_FROM_PRED -#endif // CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - - const int is_hbd = get_bitdepth_data_path_index(xd); -#if CONFIG_TXMG - if (is_hbd) { - av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); - } else { - DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]); - int tmp_stride = MAX_TX_SIZE; - int w = tx_size_wide[tx_size]; - int h = tx_size_high[tx_size]; - for (int r = 0; r < h; ++r) { - for (int c = 0; c < w; ++c) { - tmp[r * tmp_stride + c] = dst[r * stride + c]; - } - } - - av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, - &txfm_param); - - for (int r = 0; r < h; ++r) { - for (int c = 0; c < w; ++c) { - dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c]; - } - } - } -#else // CONFIG_TXMG - inv_txfm_func[is_hbd](dqcoeff, dst, stride, &txfm_param); -#endif // CONFIG_TXMG -} - -void av1_inverse_transform_block_facade(MACROBLOCKD *xd, 
int plane, int block, - int blk_row, int blk_col, int eob) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const int dst_stride = pd->dst.stride; - uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, eob); + txfm_param->is_hbd = get_bitdepth_data_path_index(xd); + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } -void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, - TxfmParam *txfm_param) { +static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { -#if CONFIG_TX64X64 - case TX_64X64: - highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param); - break; -#endif // CONFIG_TX64X64 case TX_32X32: highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param); break; @@ -3080,10 +285,10 @@ void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); 
break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); break; case TX_8X16: highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param); @@ -3097,25 +302,81 @@ void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, case TX_32X16: highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); break; -#if CONFIG_TX64X64 - case TX_64X32: - highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + case TX_64X64: + highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param); break; case TX_32X64: highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); break; -#endif // CONFIG_TX64X64 + case TX_64X32: + highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + break; + case TX_16X64: + highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); + break; + case TX_64X16: + highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); + break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param); break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: - highbd_inv_txfm_add_2x2(input, dest, stride, txfm_param); + case TX_16X4: + highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + break; + case TX_4X16: + highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + break; + case TX_8X32: + highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + break; + case TX_32X8: + highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); break; -#endif default: assert(0 && "Invalid transform size"); break; } } + +void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; + DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]); + int tmp_stride = MAX_TX_SIZE; + int w = tx_size_wide[tx_size]; + int h = tx_size_high[tx_size]; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + tmp[r * tmp_stride + c] = dst[r * stride + c]; + } + } + + highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param); + + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c]; + } + } +} + +void av1_inverse_transform_block(const MACROBLOCKD *xd, + const tran_low_t *dqcoeff, int plane, + TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, + int stride, int eob, int reduced_tx_set) { + if (!eob) return; + + assert(eob <= av1_get_max_eob(tx_size)); + + TxfmParam txfm_param; + init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, + &txfm_param); + assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); + + if (txfm_param.is_hbd) { + highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } else { + av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } +} diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h index e4e4ad671..50032a167 100644 --- a/third_party/aom/av1/common/idct.h +++ 
b/third_party/aom/av1/common/idct.h @@ -12,15 +12,12 @@ #ifndef AV1_COMMON_IDCT_H_ #define AV1_COMMON_IDCT_H_ -#include +#include "config/aom_config.h" -#include "./aom_config.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/enums.h" -#include "aom_dsp/inv_txfm.h" #include "aom_dsp/txfm_common.h" -#include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { @@ -32,64 +29,16 @@ typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; -#if CONFIG_LGT -int get_lgt4(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx); -int get_lgt8(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx); -#endif // CONFIG_LGT - -#if CONFIG_LGT_FROM_PRED -void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx); -void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx); -void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col, - const tran_high_t **lgtmtx, int ntx); -#endif // CONFIG_LGT_FROM_PRED - -#if CONFIG_HIGHBITDEPTH -typedef void (*highbd_transform_1d)(const tran_low_t *, tran_low_t *, int bd); - -typedef struct { - highbd_transform_1d cols, rows; // vertical and horizontal -} highbd_transform_2d; -#endif // CONFIG_HIGHBITDEPTH - #define MAX_TX_SCALE 1 int av1_get_tx_scale(const TX_SIZE tx_size); -void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param); -void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param); - -void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, - TxfmParam *txfm_param); void av1_inverse_transform_block(const MACROBLOCKD *xd, - const tran_low_t *dqcoeff, -#if CONFIG_LGT_FROM_PRED - PREDICTION_MODE mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + const tran_low_t 
*dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, - int stride, int eob); -void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block, - int blk_row, int blk_col, int eob); + int stride, int eob, int reduced_tx_set); -void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd); void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *param); -void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); -void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); -void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, - TxfmParam *txfm_param); - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/laplace_tables.c b/third_party/aom/av1/common/laplace_tables.c deleted file mode 100644 index ab8784895..000000000 --- a/third_party/aom/av1/common/laplace_tables.c +++ /dev/null @@ -1,657 +0,0 @@ -/* This file is auto-generated using "gen_laplace_tables 128 7" */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "aom_dsp/prob.h" -#include "pvq.h" - -const uint16_t EXP_CDF_TABLE[128][16] = { - {AOM_ICDF(32753), AOM_ICDF(32754), AOM_ICDF(32755), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(32499), AOM_ICDF(32753), AOM_ICDF(32755), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(32243), AOM_ICDF(32747), AOM_ICDF(32755), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), 
AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(31987), AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(31732), AOM_ICDF(32724), AOM_ICDF(32755), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(31476), AOM_ICDF(32706), AOM_ICDF(32754), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(31220), AOM_ICDF(32684), AOM_ICDF(32753), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(30964), AOM_ICDF(32658), AOM_ICDF(32751), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(30708), AOM_ICDF(32628), AOM_ICDF(32748), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(30452), AOM_ICDF(32594), AOM_ICDF(32745), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), 
AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(30198), AOM_ICDF(32558), AOM_ICDF(32742), AOM_ICDF(32756), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(29941), AOM_ICDF(32515), AOM_ICDF(32736), AOM_ICDF(32755), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(29686), AOM_ICDF(32470), AOM_ICDF(32731), AOM_ICDF(32755), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(29429), AOM_ICDF(32419), AOM_ICDF(32723), AOM_ICDF(32754), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(29174), AOM_ICDF(32366), AOM_ICDF(32715), AOM_ICDF(32753), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(28918), AOM_ICDF(32308), AOM_ICDF(32705), AOM_ICDF(32752), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(28662), AOM_ICDF(32246), AOM_ICDF(32694), AOM_ICDF(32750), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - 
AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(28406), AOM_ICDF(32180), AOM_ICDF(32681), AOM_ICDF(32748), - AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(28150), AOM_ICDF(32110), AOM_ICDF(32667), AOM_ICDF(32745), - AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(27894), AOM_ICDF(32036), AOM_ICDF(32651), AOM_ICDF(32742), - AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(27639), AOM_ICDF(31959), AOM_ICDF(32634), AOM_ICDF(32739), - AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(27383), AOM_ICDF(31877), AOM_ICDF(32614), AOM_ICDF(32735), - AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(27126), AOM_ICDF(31790), AOM_ICDF(32592), AOM_ICDF(32730), - AOM_ICDF(32754), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(26871), AOM_ICDF(31701), AOM_ICDF(32569), AOM_ICDF(32725), - AOM_ICDF(32753), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), 
AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(26615), AOM_ICDF(31607), AOM_ICDF(32543), AOM_ICDF(32719), - AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(26361), AOM_ICDF(31511), AOM_ICDF(32517), AOM_ICDF(32713), - AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(26104), AOM_ICDF(31408), AOM_ICDF(32485), AOM_ICDF(32704), - AOM_ICDF(32748), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(25848), AOM_ICDF(31302), AOM_ICDF(32452), AOM_ICDF(32695), - AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(25591), AOM_ICDF(31191), AOM_ICDF(32416), AOM_ICDF(32684), - AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(25336), AOM_ICDF(31078), AOM_ICDF(32379), AOM_ICDF(32674), - AOM_ICDF(32741), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(25080), AOM_ICDF(30960), AOM_ICDF(32338), AOM_ICDF(32661), - AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), 
AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(24824), AOM_ICDF(30838), AOM_ICDF(32295), AOM_ICDF(32648), - AOM_ICDF(32733), AOM_ICDF(32754), AOM_ICDF(32759), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(24568), AOM_ICDF(30712), AOM_ICDF(32248), AOM_ICDF(32632), - AOM_ICDF(32728), AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(24313), AOM_ICDF(30583), AOM_ICDF(32199), AOM_ICDF(32616), - AOM_ICDF(32723), AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(24057), AOM_ICDF(30449), AOM_ICDF(32147), AOM_ICDF(32598), - AOM_ICDF(32718), AOM_ICDF(32750), AOM_ICDF(32758), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(23801), AOM_ICDF(30311), AOM_ICDF(32091), AOM_ICDF(32578), - AOM_ICDF(32711), AOM_ICDF(32747), AOM_ICDF(32757), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(23546), AOM_ICDF(30170), AOM_ICDF(32033), AOM_ICDF(32557), - AOM_ICDF(32704), AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(23288), AOM_ICDF(30022), AOM_ICDF(31969), AOM_ICDF(32532), - AOM_ICDF(32695), AOM_ICDF(32742), AOM_ICDF(32756), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), 
AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(23033), AOM_ICDF(29873), AOM_ICDF(31904), AOM_ICDF(32507), - AOM_ICDF(32686), AOM_ICDF(32739), AOM_ICDF(32755), AOM_ICDF(32760), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(22778), AOM_ICDF(29720), AOM_ICDF(31835), AOM_ICDF(32479), - AOM_ICDF(32675), AOM_ICDF(32735), AOM_ICDF(32753), AOM_ICDF(32759), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(22521), AOM_ICDF(29561), AOM_ICDF(31761), AOM_ICDF(32449), - AOM_ICDF(32664), AOM_ICDF(32731), AOM_ICDF(32752), AOM_ICDF(32759), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(22267), AOM_ICDF(29401), AOM_ICDF(31686), AOM_ICDF(32418), - AOM_ICDF(32652), AOM_ICDF(32727), AOM_ICDF(32751), AOM_ICDF(32759), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(22011), AOM_ICDF(29235), AOM_ICDF(31605), AOM_ICDF(32383), - AOM_ICDF(32638), AOM_ICDF(32722), AOM_ICDF(32749), AOM_ICDF(32758), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(21754), AOM_ICDF(29064), AOM_ICDF(31520), AOM_ICDF(32345), - AOM_ICDF(32622), AOM_ICDF(32715), AOM_ICDF(32746), AOM_ICDF(32757), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(21501), AOM_ICDF(28893), AOM_ICDF(31434), AOM_ICDF(32307), - AOM_ICDF(32607), AOM_ICDF(32710), AOM_ICDF(32745), AOM_ICDF(32757), - AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - 
AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(21243), AOM_ICDF(28713), AOM_ICDF(31339), AOM_ICDF(32262), - AOM_ICDF(32587), AOM_ICDF(32701), AOM_ICDF(32741), AOM_ICDF(32755), - AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(20988), AOM_ICDF(28532), AOM_ICDF(31243), AOM_ICDF(32217), - AOM_ICDF(32567), AOM_ICDF(32693), AOM_ICDF(32738), AOM_ICDF(32754), - AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(20730), AOM_ICDF(28344), AOM_ICDF(31140), AOM_ICDF(32167), - AOM_ICDF(32544), AOM_ICDF(32682), AOM_ICDF(32733), AOM_ICDF(32752), - AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(20476), AOM_ICDF(28156), AOM_ICDF(31036), AOM_ICDF(32116), - AOM_ICDF(32521), AOM_ICDF(32673), AOM_ICDF(32730), AOM_ICDF(32751), - AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(20220), AOM_ICDF(27962), AOM_ICDF(30926), AOM_ICDF(32061), - AOM_ICDF(32495), AOM_ICDF(32661), AOM_ICDF(32725), AOM_ICDF(32749), - AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(19963), AOM_ICDF(27763), AOM_ICDF(30810), AOM_ICDF(32000), - AOM_ICDF(32465), AOM_ICDF(32647), AOM_ICDF(32718), AOM_ICDF(32746), - AOM_ICDF(32757), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(19708), AOM_ICDF(27562), AOM_ICDF(30691), AOM_ICDF(31938), - AOM_ICDF(32435), AOM_ICDF(32633), AOM_ICDF(32712), AOM_ICDF(32743), - AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), 
AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(19454), AOM_ICDF(27358), AOM_ICDF(30569), AOM_ICDF(31873), - AOM_ICDF(32403), AOM_ICDF(32618), AOM_ICDF(32705), AOM_ICDF(32741), - AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(19196), AOM_ICDF(27146), AOM_ICDF(30438), AOM_ICDF(31801), - AOM_ICDF(32365), AOM_ICDF(32599), AOM_ICDF(32696), AOM_ICDF(32736), - AOM_ICDF(32753), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(18942), AOM_ICDF(26934), AOM_ICDF(30306), AOM_ICDF(31728), - AOM_ICDF(32328), AOM_ICDF(32581), AOM_ICDF(32688), AOM_ICDF(32733), - AOM_ICDF(32752), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(18684), AOM_ICDF(26714), AOM_ICDF(30164), AOM_ICDF(31647), - AOM_ICDF(32284), AOM_ICDF(32558), AOM_ICDF(32676), AOM_ICDF(32727), - AOM_ICDF(32749), AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(18429), AOM_ICDF(26493), AOM_ICDF(30021), AOM_ICDF(31565), - AOM_ICDF(32240), AOM_ICDF(32535), AOM_ICDF(32664), AOM_ICDF(32721), - AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(18174), AOM_ICDF(26268), AOM_ICDF(29872), AOM_ICDF(31477), - AOM_ICDF(32192), AOM_ICDF(32510), AOM_ICDF(32652), AOM_ICDF(32715), - AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32762), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(17920), AOM_ICDF(26040), AOM_ICDF(29719), AOM_ICDF(31386), - AOM_ICDF(32141), AOM_ICDF(32483), AOM_ICDF(32638), AOM_ICDF(32708), - AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761), AOM_ICDF(32764), - AOM_ICDF(32765), AOM_ICDF(32766), 
AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(17661), AOM_ICDF(25803), AOM_ICDF(29556), AOM_ICDF(31286), - AOM_ICDF(32083), AOM_ICDF(32451), AOM_ICDF(32620), AOM_ICDF(32698), - AOM_ICDF(32734), AOM_ICDF(32751), AOM_ICDF(32759), AOM_ICDF(32763), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(17406), AOM_ICDF(25566), AOM_ICDF(29391), AOM_ICDF(31184), - AOM_ICDF(32024), AOM_ICDF(32418), AOM_ICDF(32603), AOM_ICDF(32690), - AOM_ICDF(32731), AOM_ICDF(32750), AOM_ICDF(32759), AOM_ICDF(32763), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(17151), AOM_ICDF(25325), AOM_ICDF(29220), AOM_ICDF(31076), - AOM_ICDF(31961), AOM_ICDF(32383), AOM_ICDF(32584), AOM_ICDF(32680), - AOM_ICDF(32726), AOM_ICDF(32748), AOM_ICDF(32758), AOM_ICDF(32763), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(16896), AOM_ICDF(25080), AOM_ICDF(29044), AOM_ICDF(30964), - AOM_ICDF(31894), AOM_ICDF(32344), AOM_ICDF(32562), AOM_ICDF(32668), - AOM_ICDF(32719), AOM_ICDF(32744), AOM_ICDF(32756), AOM_ICDF(32762), - AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(16639), AOM_ICDF(24829), AOM_ICDF(28860), AOM_ICDF(30844), - AOM_ICDF(31821), AOM_ICDF(32302), AOM_ICDF(32539), AOM_ICDF(32655), - AOM_ICDF(32712), AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761), - AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(30720), - AOM_ICDF(31744), AOM_ICDF(32256), AOM_ICDF(32512), AOM_ICDF(32640), - AOM_ICDF(32704), AOM_ICDF(32736), AOM_ICDF(32752), AOM_ICDF(32760), - AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)}, - {AOM_ICDF(16130), AOM_ICDF(24320), AOM_ICDF(28479), AOM_ICDF(30591), - AOM_ICDF(31663), AOM_ICDF(32208), AOM_ICDF(32485), AOM_ICDF(32625), - AOM_ICDF(32696), AOM_ICDF(32732), AOM_ICDF(32750), AOM_ICDF(32759), - AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), 
AOM_ICDF(32768)}, - {AOM_ICDF(15872), AOM_ICDF(24056), AOM_ICDF(28276), AOM_ICDF(30452), - AOM_ICDF(31574), AOM_ICDF(32152), AOM_ICDF(32450), AOM_ICDF(32604), - AOM_ICDF(32683), AOM_ICDF(32724), AOM_ICDF(32745), AOM_ICDF(32756), - AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32768)}, - {AOM_ICDF(15615), AOM_ICDF(23789), AOM_ICDF(28068), AOM_ICDF(30308), - AOM_ICDF(31480), AOM_ICDF(32094), AOM_ICDF(32415), AOM_ICDF(32583), - AOM_ICDF(32671), AOM_ICDF(32717), AOM_ICDF(32741), AOM_ICDF(32754), - AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)}, - {AOM_ICDF(15361), AOM_ICDF(23521), AOM_ICDF(27856), AOM_ICDF(30159), - AOM_ICDF(31382), AOM_ICDF(32032), AOM_ICDF(32377), AOM_ICDF(32560), - AOM_ICDF(32657), AOM_ICDF(32709), AOM_ICDF(32737), AOM_ICDF(32752), - AOM_ICDF(32760), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)}, - {AOM_ICDF(15103), AOM_ICDF(23245), AOM_ICDF(27634), AOM_ICDF(30000), - AOM_ICDF(31275), AOM_ICDF(31963), AOM_ICDF(32334), AOM_ICDF(32534), - AOM_ICDF(32642), AOM_ICDF(32700), AOM_ICDF(32731), AOM_ICDF(32748), - AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32768)}, - {AOM_ICDF(14848), AOM_ICDF(22968), AOM_ICDF(27409), AOM_ICDF(29837), - AOM_ICDF(31165), AOM_ICDF(31891), AOM_ICDF(32288), AOM_ICDF(32505), - AOM_ICDF(32624), AOM_ICDF(32689), AOM_ICDF(32725), AOM_ICDF(32744), - AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32768)}, - {AOM_ICDF(14592), AOM_ICDF(22686), AOM_ICDF(27176), AOM_ICDF(29666), - AOM_ICDF(31047), AOM_ICDF(31813), AOM_ICDF(32238), AOM_ICDF(32474), - AOM_ICDF(32605), AOM_ICDF(32678), AOM_ICDF(32718), AOM_ICDF(32740), - AOM_ICDF(32752), AOM_ICDF(32759), AOM_ICDF(32763), AOM_ICDF(32768)}, - {AOM_ICDF(14336), AOM_ICDF(22400), AOM_ICDF(26936), AOM_ICDF(29488), - AOM_ICDF(30923), AOM_ICDF(31730), AOM_ICDF(32184), AOM_ICDF(32439), - AOM_ICDF(32583), AOM_ICDF(32664), AOM_ICDF(32709), AOM_ICDF(32735), - AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32768)}, - 
{AOM_ICDF(14079), AOM_ICDF(22109), AOM_ICDF(26689), AOM_ICDF(29301), - AOM_ICDF(30791), AOM_ICDF(31641), AOM_ICDF(32125), AOM_ICDF(32401), - AOM_ICDF(32559), AOM_ICDF(32649), AOM_ICDF(32700), AOM_ICDF(32729), - AOM_ICDF(32746), AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32768)}, - {AOM_ICDF(13825), AOM_ICDF(21817), AOM_ICDF(26437), AOM_ICDF(29108), - AOM_ICDF(30652), AOM_ICDF(31545), AOM_ICDF(32061), AOM_ICDF(32359), - AOM_ICDF(32532), AOM_ICDF(32632), AOM_ICDF(32690), AOM_ICDF(32723), - AOM_ICDF(32742), AOM_ICDF(32753), AOM_ICDF(32759), AOM_ICDF(32768)}, - {AOM_ICDF(13568), AOM_ICDF(21518), AOM_ICDF(26176), AOM_ICDF(28905), - AOM_ICDF(30504), AOM_ICDF(31441), AOM_ICDF(31990), AOM_ICDF(32312), - AOM_ICDF(32501), AOM_ICDF(32611), AOM_ICDF(32676), AOM_ICDF(32714), - AOM_ICDF(32736), AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32768)}, - {AOM_ICDF(13314), AOM_ICDF(21218), AOM_ICDF(25911), AOM_ICDF(28697), - AOM_ICDF(30351), AOM_ICDF(31333), AOM_ICDF(31916), AOM_ICDF(32262), - AOM_ICDF(32468), AOM_ICDF(32590), AOM_ICDF(32662), AOM_ICDF(32705), - AOM_ICDF(32731), AOM_ICDF(32746), AOM_ICDF(32755), AOM_ICDF(32768)}, - {AOM_ICDF(13054), AOM_ICDF(20908), AOM_ICDF(25633), AOM_ICDF(28475), - AOM_ICDF(30185), AOM_ICDF(31214), AOM_ICDF(31833), AOM_ICDF(32205), - AOM_ICDF(32429), AOM_ICDF(32564), AOM_ICDF(32645), AOM_ICDF(32694), - AOM_ICDF(32723), AOM_ICDF(32741), AOM_ICDF(32752), AOM_ICDF(32768)}, - {AOM_ICDF(12803), AOM_ICDF(20603), AOM_ICDF(25356), AOM_ICDF(28252), - AOM_ICDF(30017), AOM_ICDF(31093), AOM_ICDF(31748), AOM_ICDF(32147), - AOM_ICDF(32390), AOM_ICDF(32538), AOM_ICDF(32628), AOM_ICDF(32683), - AOM_ICDF(32717), AOM_ICDF(32737), AOM_ICDF(32749), AOM_ICDF(32768)}, - {AOM_ICDF(12544), AOM_ICDF(20286), AOM_ICDF(25064), AOM_ICDF(28013), - AOM_ICDF(29833), AOM_ICDF(30956), AOM_ICDF(31649), AOM_ICDF(32077), - AOM_ICDF(32341), AOM_ICDF(32504), AOM_ICDF(32605), AOM_ICDF(32667), - AOM_ICDF(32705), AOM_ICDF(32729), AOM_ICDF(32744), AOM_ICDF(32768)}, - {AOM_ICDF(12288), 
AOM_ICDF(19968), AOM_ICDF(24768), AOM_ICDF(27768), - AOM_ICDF(29643), AOM_ICDF(30815), AOM_ICDF(31547), AOM_ICDF(32005), - AOM_ICDF(32291), AOM_ICDF(32470), AOM_ICDF(32582), AOM_ICDF(32652), - AOM_ICDF(32696), AOM_ICDF(32723), AOM_ICDF(32740), AOM_ICDF(32768)}, - {AOM_ICDF(12033), AOM_ICDF(19647), AOM_ICDF(24465), AOM_ICDF(27514), - AOM_ICDF(29443), AOM_ICDF(30664), AOM_ICDF(31437), AOM_ICDF(31926), - AOM_ICDF(32235), AOM_ICDF(32431), AOM_ICDF(32555), AOM_ICDF(32633), - AOM_ICDF(32683), AOM_ICDF(32714), AOM_ICDF(32734), AOM_ICDF(32768)}, - {AOM_ICDF(11777), AOM_ICDF(19321), AOM_ICDF(24154), AOM_ICDF(27250), - AOM_ICDF(29233), AOM_ICDF(30504), AOM_ICDF(31318), AOM_ICDF(31839), - AOM_ICDF(32173), AOM_ICDF(32387), AOM_ICDF(32524), AOM_ICDF(32612), - AOM_ICDF(32668), AOM_ICDF(32704), AOM_ICDF(32727), AOM_ICDF(32768)}, - {AOM_ICDF(11521), AOM_ICDF(18991), AOM_ICDF(23835), AOM_ICDF(26976), - AOM_ICDF(29013), AOM_ICDF(30334), AOM_ICDF(31190), AOM_ICDF(31745), - AOM_ICDF(32105), AOM_ICDF(32338), AOM_ICDF(32489), AOM_ICDF(32587), - AOM_ICDF(32651), AOM_ICDF(32692), AOM_ICDF(32719), AOM_ICDF(32768)}, - {AOM_ICDF(11265), AOM_ICDF(18657), AOM_ICDF(23508), AOM_ICDF(26691), - AOM_ICDF(28780), AOM_ICDF(30151), AOM_ICDF(31051), AOM_ICDF(31641), - AOM_ICDF(32028), AOM_ICDF(32282), AOM_ICDF(32449), AOM_ICDF(32559), - AOM_ICDF(32631), AOM_ICDF(32678), AOM_ICDF(32709), AOM_ICDF(32768)}, - {AOM_ICDF(11006), AOM_ICDF(18316), AOM_ICDF(23170), AOM_ICDF(26394), - AOM_ICDF(28535), AOM_ICDF(29957), AOM_ICDF(30901), AOM_ICDF(31528), - AOM_ICDF(31944), AOM_ICDF(32220), AOM_ICDF(32404), AOM_ICDF(32526), - AOM_ICDF(32607), AOM_ICDF(32661), AOM_ICDF(32697), AOM_ICDF(32768)}, - {AOM_ICDF(10752), AOM_ICDF(17976), AOM_ICDF(22830), AOM_ICDF(26091), - AOM_ICDF(28282), AOM_ICDF(29754), AOM_ICDF(30743), AOM_ICDF(31408), - AOM_ICDF(31854), AOM_ICDF(32154), AOM_ICDF(32356), AOM_ICDF(32491), - AOM_ICDF(32582), AOM_ICDF(32643), AOM_ICDF(32684), AOM_ICDF(32768)}, - {AOM_ICDF(10496), AOM_ICDF(17630), 
AOM_ICDF(22479), AOM_ICDF(25775), - AOM_ICDF(28015), AOM_ICDF(29538), AOM_ICDF(30573), AOM_ICDF(31276), - AOM_ICDF(31754), AOM_ICDF(32079), AOM_ICDF(32300), AOM_ICDF(32450), - AOM_ICDF(32552), AOM_ICDF(32621), AOM_ICDF(32668), AOM_ICDF(32768)}, - {AOM_ICDF(10240), AOM_ICDF(17280), AOM_ICDF(22120), AOM_ICDF(25448), - AOM_ICDF(27736), AOM_ICDF(29309), AOM_ICDF(30390), AOM_ICDF(31133), - AOM_ICDF(31644), AOM_ICDF(31995), AOM_ICDF(32237), AOM_ICDF(32403), - AOM_ICDF(32517), AOM_ICDF(32595), AOM_ICDF(32649), AOM_ICDF(32768)}, - { AOM_ICDF(9984), AOM_ICDF(16926), AOM_ICDF(21753), AOM_ICDF(25109), - AOM_ICDF(27443), AOM_ICDF(29066), AOM_ICDF(30194), AOM_ICDF(30978), - AOM_ICDF(31523), AOM_ICDF(31902), AOM_ICDF(32166), AOM_ICDF(32349), - AOM_ICDF(32476), AOM_ICDF(32565), AOM_ICDF(32627), AOM_ICDF(32768)}, - { AOM_ICDF(9728), AOM_ICDF(16568), AOM_ICDF(21377), AOM_ICDF(24759), - AOM_ICDF(27137), AOM_ICDF(28809), AOM_ICDF(29984), AOM_ICDF(30811), - AOM_ICDF(31392), AOM_ICDF(31801), AOM_ICDF(32088), AOM_ICDF(32290), - AOM_ICDF(32432), AOM_ICDF(32532), AOM_ICDF(32602), AOM_ICDF(32768)}, - { AOM_ICDF(9474), AOM_ICDF(16208), AOM_ICDF(20995), AOM_ICDF(24399), - AOM_ICDF(26819), AOM_ICDF(28539), AOM_ICDF(29762), AOM_ICDF(30631), - AOM_ICDF(31249), AOM_ICDF(31688), AOM_ICDF(32000), AOM_ICDF(32222), - AOM_ICDF(32380), AOM_ICDF(32492), AOM_ICDF(32572), AOM_ICDF(32768)}, - { AOM_ICDF(9216), AOM_ICDF(15840), AOM_ICDF(20601), AOM_ICDF(24023), - AOM_ICDF(26483), AOM_ICDF(28251), AOM_ICDF(29522), AOM_ICDF(30435), - AOM_ICDF(31091), AOM_ICDF(31563), AOM_ICDF(31902), AOM_ICDF(32146), - AOM_ICDF(32321), AOM_ICDF(32447), AOM_ICDF(32537), AOM_ICDF(32768)}, - { AOM_ICDF(8959), AOM_ICDF(15469), AOM_ICDF(20199), AOM_ICDF(23636), - AOM_ICDF(26133), AOM_ICDF(27947), AOM_ICDF(29265), AOM_ICDF(30223), - AOM_ICDF(30919), AOM_ICDF(31425), AOM_ICDF(31792), AOM_ICDF(32059), - AOM_ICDF(32253), AOM_ICDF(32394), AOM_ICDF(32496), AOM_ICDF(32768)}, - { AOM_ICDF(8705), AOM_ICDF(15097), AOM_ICDF(19791), 
AOM_ICDF(23238), - AOM_ICDF(25770), AOM_ICDF(27629), AOM_ICDF(28994), AOM_ICDF(29997), - AOM_ICDF(30733), AOM_ICDF(31274), AOM_ICDF(31671), AOM_ICDF(31963), - AOM_ICDF(32177), AOM_ICDF(32334), AOM_ICDF(32449), AOM_ICDF(32768)}, - { AOM_ICDF(8449), AOM_ICDF(14719), AOM_ICDF(19373), AOM_ICDF(22827), - AOM_ICDF(25390), AOM_ICDF(27292), AOM_ICDF(28704), AOM_ICDF(29752), - AOM_ICDF(30530), AOM_ICDF(31107), AOM_ICDF(31535), AOM_ICDF(31853), - AOM_ICDF(32089), AOM_ICDF(32264), AOM_ICDF(32394), AOM_ICDF(32768)}, - { AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(18944), AOM_ICDF(22400), - AOM_ICDF(24992), AOM_ICDF(26936), AOM_ICDF(28394), AOM_ICDF(29488), - AOM_ICDF(30308), AOM_ICDF(30923), AOM_ICDF(31384), AOM_ICDF(31730), - AOM_ICDF(31989), AOM_ICDF(32184), AOM_ICDF(32330), AOM_ICDF(32768)}, - { AOM_ICDF(7936), AOM_ICDF(13950), AOM_ICDF(18507), AOM_ICDF(21961), - AOM_ICDF(24578), AOM_ICDF(26561), AOM_ICDF(28064), AOM_ICDF(29203), - AOM_ICDF(30066), AOM_ICDF(30720), AOM_ICDF(31216), AOM_ICDF(31592), - AOM_ICDF(31877), AOM_ICDF(32093), AOM_ICDF(32256), AOM_ICDF(32768)}, - { AOM_ICDF(7678), AOM_ICDF(13558), AOM_ICDF(18060), AOM_ICDF(21507), - AOM_ICDF(24146), AOM_ICDF(26166), AOM_ICDF(27713), AOM_ICDF(28897), - AOM_ICDF(29804), AOM_ICDF(30498), AOM_ICDF(31030), AOM_ICDF(31437), - AOM_ICDF(31749), AOM_ICDF(31988), AOM_ICDF(32171), AOM_ICDF(32768)}, - { AOM_ICDF(7423), AOM_ICDF(13165), AOM_ICDF(17606), AOM_ICDF(21041), - AOM_ICDF(23698), AOM_ICDF(25753), AOM_ICDF(27342), AOM_ICDF(28571), - AOM_ICDF(29522), AOM_ICDF(30257), AOM_ICDF(30826), AOM_ICDF(31266), - AOM_ICDF(31606), AOM_ICDF(31869), AOM_ICDF(32073), AOM_ICDF(32768)}, - { AOM_ICDF(7168), AOM_ICDF(12768), AOM_ICDF(17143), AOM_ICDF(20561), - AOM_ICDF(23231), AOM_ICDF(25317), AOM_ICDF(26947), AOM_ICDF(28220), - AOM_ICDF(29215), AOM_ICDF(29992), AOM_ICDF(30599), AOM_ICDF(31073), - AOM_ICDF(31444), AOM_ICDF(31734), AOM_ICDF(31960), AOM_ICDF(32768)}, - { AOM_ICDF(6911), AOM_ICDF(12365), AOM_ICDF(16669), AOM_ICDF(20065), - 
AOM_ICDF(22744), AOM_ICDF(24858), AOM_ICDF(26526), AOM_ICDF(27842), - AOM_ICDF(28881), AOM_ICDF(29701), AOM_ICDF(30348), AOM_ICDF(30858), - AOM_ICDF(31261), AOM_ICDF(31579), AOM_ICDF(31830), AOM_ICDF(32768)}, - { AOM_ICDF(6657), AOM_ICDF(11961), AOM_ICDF(16188), AOM_ICDF(19556), - AOM_ICDF(22240), AOM_ICDF(24379), AOM_ICDF(26083), AOM_ICDF(27441), - AOM_ICDF(28523), AOM_ICDF(29385), AOM_ICDF(30072), AOM_ICDF(30620), - AOM_ICDF(31056), AOM_ICDF(31404), AOM_ICDF(31681), AOM_ICDF(32768)}, - { AOM_ICDF(6400), AOM_ICDF(11550), AOM_ICDF(15694), AOM_ICDF(19029), - AOM_ICDF(21712), AOM_ICDF(23871), AOM_ICDF(25609), AOM_ICDF(27007), - AOM_ICDF(28132), AOM_ICDF(29037), AOM_ICDF(29766), AOM_ICDF(30352), - AOM_ICDF(30824), AOM_ICDF(31204), AOM_ICDF(31509), AOM_ICDF(32768)}, - { AOM_ICDF(6142), AOM_ICDF(11134), AOM_ICDF(15190), AOM_ICDF(18486), - AOM_ICDF(21164), AOM_ICDF(23340), AOM_ICDF(25108), AOM_ICDF(26544), - AOM_ICDF(27711), AOM_ICDF(28659), AOM_ICDF(29429), AOM_ICDF(30055), - AOM_ICDF(30564), AOM_ICDF(30977), AOM_ICDF(31313), AOM_ICDF(32768)}, - { AOM_ICDF(5890), AOM_ICDF(10720), AOM_ICDF(14682), AOM_ICDF(17932), - AOM_ICDF(20598), AOM_ICDF(22785), AOM_ICDF(24579), AOM_ICDF(26051), - AOM_ICDF(27258), AOM_ICDF(28248), AOM_ICDF(29060), AOM_ICDF(29726), - AOM_ICDF(30273), AOM_ICDF(30721), AOM_ICDF(31089), AOM_ICDF(32768)}, - { AOM_ICDF(5631), AOM_ICDF(10295), AOM_ICDF(14157), AOM_ICDF(17356), - AOM_ICDF(20005), AOM_ICDF(22199), AOM_ICDF(24016), AOM_ICDF(25520), - AOM_ICDF(26766), AOM_ICDF(27798), AOM_ICDF(28652), AOM_ICDF(29359), - AOM_ICDF(29945), AOM_ICDF(30430), AOM_ICDF(30832), AOM_ICDF(32768)}, - { AOM_ICDF(5377), AOM_ICDF(9871), AOM_ICDF(13628), AOM_ICDF(16768), - AOM_ICDF(19393), AOM_ICDF(21587), AOM_ICDF(23421), AOM_ICDF(24954), - AOM_ICDF(26236), AOM_ICDF(27308), AOM_ICDF(28204), AOM_ICDF(28953), - AOM_ICDF(29579), AOM_ICDF(30102), AOM_ICDF(30539), AOM_ICDF(32768)}, - { AOM_ICDF(5121), AOM_ICDF(9441), AOM_ICDF(13086), AOM_ICDF(16161), - AOM_ICDF(18756), 
AOM_ICDF(20945), AOM_ICDF(22792), AOM_ICDF(24351), - AOM_ICDF(25666), AOM_ICDF(26776), AOM_ICDF(27712), AOM_ICDF(28502), - AOM_ICDF(29169), AOM_ICDF(29731), AOM_ICDF(30206), AOM_ICDF(32768)}, - { AOM_ICDF(4865), AOM_ICDF(9007), AOM_ICDF(12534), AOM_ICDF(15538), - AOM_ICDF(18096), AOM_ICDF(20274), AOM_ICDF(22129), AOM_ICDF(23708), - AOM_ICDF(25053), AOM_ICDF(26198), AOM_ICDF(27173), AOM_ICDF(28004), - AOM_ICDF(28711), AOM_ICDF(29313), AOM_ICDF(29826), AOM_ICDF(32768)}, - { AOM_ICDF(4608), AOM_ICDF(8568), AOM_ICDF(11971), AOM_ICDF(14896), - AOM_ICDF(17409), AOM_ICDF(19569), AOM_ICDF(21425), AOM_ICDF(23020), - AOM_ICDF(24391), AOM_ICDF(25569), AOM_ICDF(26581), AOM_ICDF(27451), - AOM_ICDF(28199), AOM_ICDF(28842), AOM_ICDF(29394), AOM_ICDF(32768)}, - { AOM_ICDF(4351), AOM_ICDF(8125), AOM_ICDF(11398), AOM_ICDF(14236), - AOM_ICDF(16697), AOM_ICDF(18831), AOM_ICDF(20682), AOM_ICDF(22287), - AOM_ICDF(23679), AOM_ICDF(24886), AOM_ICDF(25933), AOM_ICDF(26841), - AOM_ICDF(27628), AOM_ICDF(28311), AOM_ICDF(28903), AOM_ICDF(32768)}, - { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560), - AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(21508), - AOM_ICDF(22915), AOM_ICDF(24146), AOM_ICDF(25224), AOM_ICDF(26167), - AOM_ICDF(26992), AOM_ICDF(27714), AOM_ICDF(28346), AOM_ICDF(32768)}, - { AOM_ICDF(3840), AOM_ICDF(7230), AOM_ICDF(10223), AOM_ICDF(12865), - AOM_ICDF(15197), AOM_ICDF(17256), AOM_ICDF(19074), AOM_ICDF(20679), - AOM_ICDF(22096), AOM_ICDF(23347), AOM_ICDF(24451), AOM_ICDF(25426), - AOM_ICDF(26287), AOM_ICDF(27047), AOM_ICDF(27718), AOM_ICDF(32768)}, - { AOM_ICDF(3584), AOM_ICDF(6776), AOM_ICDF(9619), AOM_ICDF(12151), - AOM_ICDF(14406), AOM_ICDF(16414), AOM_ICDF(18203), AOM_ICDF(19796), - AOM_ICDF(21215), AOM_ICDF(22479), AOM_ICDF(23604), AOM_ICDF(24606), - AOM_ICDF(25499), AOM_ICDF(26294), AOM_ICDF(27002), AOM_ICDF(32768)}, - { AOM_ICDF(3328), AOM_ICDF(6318), AOM_ICDF(9004), AOM_ICDF(11417), - AOM_ICDF(13585), AOM_ICDF(15533), AOM_ICDF(17283), 
AOM_ICDF(18856), - AOM_ICDF(20269), AOM_ICDF(21538), AOM_ICDF(22678), AOM_ICDF(23703), - AOM_ICDF(24624), AOM_ICDF(25451), AOM_ICDF(26194), AOM_ICDF(32768)}, - { AOM_ICDF(3072), AOM_ICDF(5856), AOM_ICDF(8379), AOM_ICDF(10665), - AOM_ICDF(12737), AOM_ICDF(14615), AOM_ICDF(16317), AOM_ICDF(17859), - AOM_ICDF(19257), AOM_ICDF(20524), AOM_ICDF(21672), AOM_ICDF(22712), - AOM_ICDF(23655), AOM_ICDF(24509), AOM_ICDF(25283), AOM_ICDF(32768)}, - { AOM_ICDF(2816), AOM_ICDF(5390), AOM_ICDF(7743), AOM_ICDF(9894), - AOM_ICDF(11860), AOM_ICDF(13657), AOM_ICDF(15299), AOM_ICDF(16800), - AOM_ICDF(18172), AOM_ICDF(19426), AOM_ICDF(20573), AOM_ICDF(21621), - AOM_ICDF(22579), AOM_ICDF(23455), AOM_ICDF(24255), AOM_ICDF(32768)}, - { AOM_ICDF(2560), AOM_ICDF(4920), AOM_ICDF(7096), AOM_ICDF(9102), - AOM_ICDF(10951), AOM_ICDF(12656), AOM_ICDF(14227), AOM_ICDF(15676), - AOM_ICDF(17011), AOM_ICDF(18242), AOM_ICDF(19377), AOM_ICDF(20423), - AOM_ICDF(21388), AOM_ICDF(22277), AOM_ICDF(23097), AOM_ICDF(32768)}, - { AOM_ICDF(2304), AOM_ICDF(4446), AOM_ICDF(6437), AOM_ICDF(8288), - AOM_ICDF(10009), AOM_ICDF(11609), AOM_ICDF(13097), AOM_ICDF(14480), - AOM_ICDF(15766), AOM_ICDF(16961), AOM_ICDF(18072), AOM_ICDF(19105), - AOM_ICDF(20066), AOM_ICDF(20959), AOM_ICDF(21789), AOM_ICDF(32768)}, - { AOM_ICDF(2048), AOM_ICDF(3968), AOM_ICDF(5768), AOM_ICDF(7456), - AOM_ICDF(9038), AOM_ICDF(10521), AOM_ICDF(11911), AOM_ICDF(13215), - AOM_ICDF(14437), AOM_ICDF(15583), AOM_ICDF(16657), AOM_ICDF(17664), - AOM_ICDF(18608), AOM_ICDF(19493), AOM_ICDF(20323), AOM_ICDF(32768)}, - { AOM_ICDF(1792), AOM_ICDF(3486), AOM_ICDF(5087), AOM_ICDF(6601), - AOM_ICDF(8032), AOM_ICDF(9385), AOM_ICDF(10664), AOM_ICDF(11873), - AOM_ICDF(13016), AOM_ICDF(14096), AOM_ICDF(15117), AOM_ICDF(16082), - AOM_ICDF(16995), AOM_ICDF(17858), AOM_ICDF(18673), AOM_ICDF(32768)}, - { AOM_ICDF(1536), AOM_ICDF(3000), AOM_ICDF(4395), AOM_ICDF(5725), - AOM_ICDF(6993), AOM_ICDF(8201), AOM_ICDF(9353), AOM_ICDF(10451), - AOM_ICDF(11497), 
AOM_ICDF(12494), AOM_ICDF(13444), AOM_ICDF(14350), - AOM_ICDF(15213), AOM_ICDF(16036), AOM_ICDF(16820), AOM_ICDF(32768)}, - { AOM_ICDF(1280), AOM_ICDF(2510), AOM_ICDF(3692), AOM_ICDF(4828), - AOM_ICDF(5919), AOM_ICDF(6968), AOM_ICDF(7976), AOM_ICDF(8944), - AOM_ICDF(9875), AOM_ICDF(10769), AOM_ICDF(11628), AOM_ICDF(12454), - AOM_ICDF(13248), AOM_ICDF(14011), AOM_ICDF(14744), AOM_ICDF(32768)}, - { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(2977), AOM_ICDF(3908), - AOM_ICDF(4810), AOM_ICDF(5684), AOM_ICDF(6530), AOM_ICDF(7350), - AOM_ICDF(8144), AOM_ICDF(8913), AOM_ICDF(9658), AOM_ICDF(10380), - AOM_ICDF(11080), AOM_ICDF(11758), AOM_ICDF(12415), AOM_ICDF(32768)}, - { AOM_ICDF(768), AOM_ICDF(1518), AOM_ICDF(2250), AOM_ICDF(2965), - AOM_ICDF(3663), AOM_ICDF(4345), AOM_ICDF(5011), AOM_ICDF(5662), - AOM_ICDF(6297), AOM_ICDF(6917), AOM_ICDF(7523), AOM_ICDF(8115), - AOM_ICDF(8693), AOM_ICDF(9257), AOM_ICDF(9808), AOM_ICDF(32768)}, - { AOM_ICDF(512), AOM_ICDF(1016), AOM_ICDF(1512), AOM_ICDF(2000), - AOM_ICDF(2481), AOM_ICDF(2954), AOM_ICDF(3420), AOM_ICDF(3879), - AOM_ICDF(4330), AOM_ICDF(4774), AOM_ICDF(5211), AOM_ICDF(5642), - AOM_ICDF(6066), AOM_ICDF(6483), AOM_ICDF(6894), AOM_ICDF(32768)}, - { AOM_ICDF(256), AOM_ICDF(510), AOM_ICDF(762), AOM_ICDF(1012), - AOM_ICDF(1260), AOM_ICDF(1506), AOM_ICDF(1750), AOM_ICDF(1992), - AOM_ICDF(2232), AOM_ICDF(2471), AOM_ICDF(2708), AOM_ICDF(2943), - AOM_ICDF(3176), AOM_ICDF(3407), AOM_ICDF(3636), AOM_ICDF(32768)}, -}; - - -const uint16_t LAPLACE_OFFSET[128] = { - 0, - 29871, - 28672, - 27751, - 26975, - 26291, - 25673, - 25105, - 24576, - 24079, - 23609, - 23162, - 22734, - 22325, - 21931, - 21550, - 21182, - 20826, - 20480, - 20143, - 19815, - 19495, - 19183, - 18877, - 18579, - 18286, - 17999, - 17718, - 17442, - 17170, - 16904, - 16642, - 16384, - 16129, - 15879, - 15633, - 15390, - 15150, - 14913, - 14680, - 14450, - 14222, - 13997, - 13775, - 13556, - 13338, - 13124, - 12911, - 12701, - 12493, - 12288, - 12084, - 11882, - 11682, - 
11484, - 11288, - 11094, - 10901, - 10710, - 10521, - 10333, - 10147, - 9962, - 9779, - 9597, - 9417, - 9238, - 9060, - 8884, - 8709, - 8535, - 8363, - 8192, - 8021, - 7853, - 7685, - 7518, - 7352, - 7188, - 7025, - 6862, - 6701, - 6540, - 6381, - 6222, - 6065, - 5908, - 5753, - 5598, - 5444, - 5291, - 5138, - 4987, - 4837, - 4687, - 4538, - 4390, - 4242, - 4096, - 3950, - 3804, - 3660, - 3516, - 3373, - 3231, - 3089, - 2948, - 2808, - 2668, - 2529, - 2391, - 2253, - 2116, - 1979, - 1843, - 1708, - 1573, - 1439, - 1306, - 1172, - 1040, - 908, - 777, - 646, - 516, - 386, - 257, - 128, -}; diff --git a/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c deleted file mode 100644 index ff461b914..000000000 --- a/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include - -#include "av1/common/enums.h" -#include "aom_dsp/mips/inv_txfm_msa.h" - -void av1_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride, TxfmParam *txfm_param) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - int16_t *out_ptr = &out[0]; - const TX_TYPE tx_type = txfm_param->tx_type; - - switch (tx_type) { - case DCT_DCT: - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } - break; - case ADST_DCT: - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dst + (i << 3)), dst_stride); - } - break; - case DCT_ADST: - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } - break; - case ADST_ADST: - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dst + (i << 3)), dst_stride); - } - break; - default: assert(0); break; - } -} diff --git a/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c deleted file mode 100644 index 37f7fd77b..000000000 --- 
a/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "av1/common/enums.h" -#include "aom_dsp/mips/inv_txfm_msa.h" - -void av1_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride, TxfmParam *txfm_param) { - v8i16 in0, in1, in2, in3; - const TX_TYPE tx_type = txfm_param->tx_type; - - /* load vector elements of 4x4 block */ - LD4x4_SH(input, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - - switch (tx_type) { - case DCT_DCT: - /* DCT in horizontal */ - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* DCT in vertical */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_DCT: - /* DCT in horizontal */ - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* ADST in vertical */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case DCT_ADST: - /* ADST in horizontal */ - AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* DCT in vertical */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_ADST: - /* ADST in horizontal */ - AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* ADST in vertical */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, 
in3); - AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - default: assert(0); break; - } - - /* final rounding (add 2^3, divide by 2^4) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 4); - /* add block and store 4x4 */ - ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); -} diff --git a/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c deleted file mode 100644 index 7410f7b98..000000000 --- a/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include - -#include "av1/common/enums.h" -#include "aom_dsp/mips/inv_txfm_msa.h" - -void av1_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride, TxfmParam *txfm_param) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - const TX_TYPE tx_type = txfm_param->tx_type; - - /* load vector elements of 8x8 block */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - switch (tx_type) { - case DCT_DCT: - /* DCT in horizontal */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* DCT in vertical */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - break; - case ADST_DCT: - /* DCT in horizontal */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* ADST in vertical */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case DCT_ADST: - /* ADST in horizontal */ - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - /* DCT in vertical */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - break; - case ADST_ADST: - /* ADST in horizontal */ - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - /* ADST in vertical */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, 
in3, in4, - in5, in6, in7); - break; - default: assert(0); break; - } - - /* final rounding (add 2^4, divide by 2^5) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 5); - SRARI_H4_SH(in4, in5, in6, in7, 5); - - /* add block and store 8x8 */ - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); -} diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h index 65f0f7eda..a6227f18f 100644 --- a/third_party/aom/av1/common/mv.h +++ b/third_party/aom/av1/common/mv.h @@ -27,6 +27,8 @@ typedef struct mv { int16_t col; } MV; +static const MV kZeroMv = { 0, 0 }; + typedef union int_mv { uint32_t as_int; MV as_mv; @@ -37,11 +39,6 @@ typedef struct mv32 { int32_t col; } MV32; -#if CONFIG_WARPED_MOTION -#define WARPED_MOTION_SORT_SAMPLES 1 -#endif // CONFIG_WARPED_MOTION - -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // Bits of precision used for the model #define WARPEDMODEL_PREC_BITS 16 #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 @@ -54,19 +51,8 @@ typedef struct mv32 { #define WARPEDPIXEL_PREC_BITS 6 #define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) -// Taps for ntap filter -#define WARPEDPIXEL_FILTER_TAPS 6 - -// Precision of filter taps -#define WARPEDPIXEL_FILTER_BITS 7 - #define WARP_PARAM_REDUCE_BITS 6 -// Precision bits reduction after horizontal shear -#define HORSHEAR_REDUCE_PREC_BITS 5 -#define VERSHEAR_REDUCE_PREC_BITS \ - (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS) - #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ @@ -75,10 +61,7 @@ typedef enum { TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter - HORTRAPEZOID = 4, // constrained homography, hor trapezoid, 6-parameter - VERTRAPEZOID = 5, // constrained homography, ver trapezoid, 6-parameter - HOMOGRAPHY = 6, // 
homography, 8-parameter - TRANS_TYPES = 7, + TRANS_TYPES, } TransformationType; /* clang-format on */ @@ -90,24 +73,13 @@ typedef enum { // GLOBAL_TRANS_TYPES 7 - up to full homography #define GLOBAL_TRANS_TYPES 4 -#if GLOBAL_TRANS_TYPES > 4 -// First bit indicates whether using identity or not -// GLOBAL_TYPE_BITS=ceiling(log2(GLOBAL_TRANS_TYPES-1)) is the -// number of bits needed to cover the remaining possibilities -#define GLOBAL_TYPE_BITS (get_msb(2 * GLOBAL_TRANS_TYPES - 3)) -#endif // GLOBAL_TRANS_TYPES > 4 - typedef struct { -#if CONFIG_GLOBAL_MOTION int global_warp_allowed; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION int local_warp_allowed; -#endif // CONFIG_WARPED_MOTION } WarpTypesAllowed; // number of parameters used by each transformation in TransformationTypes -static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6, 6, 6, 8 }; +static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; // The order of values in the wmmat matrix below is best described // by the homography: @@ -118,6 +90,7 @@ typedef struct { TransformationType wmtype; int32_t wmmat[8]; int16_t alpha, beta, gamma, delta; + int8_t invalid; } WarpedMotionParams; /* clang-format off */ @@ -125,12 +98,11 @@ static const WarpedMotionParams default_warp_params = { IDENTITY, { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0 }, - 0, 0, 0, 0 + 0, 0, 0, 0, + 0, }; /* clang-format on */ -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION // The following constants describe the various precisions // of different parameters in the global motion experiment. 
// @@ -187,9 +159,6 @@ static const WarpedMotionParams default_warp_params = { #define GM_ALPHA_MIN -GM_ALPHA_MAX #define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX -// Use global motion parameters for sub8x8 blocks -#define GLOBAL_SUB8X8_USED 0 - static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { const int bw = block_size_wide[bs]; return mi_col * MI_SIZE + bw / 2 - 1; @@ -206,7 +175,6 @@ static INLINE int convert_to_trans_prec(int allow_hp, int coor) { else return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; } -#if CONFIG_AMVR static INLINE void integer_mv_precision(MV *mv) { int mod = (mv->row % 8); if (mod != 0) { @@ -232,7 +200,6 @@ static INLINE void integer_mv_precision(MV *mv) { } } } -#endif // Convert a global motion vector into a motion vector at the centre of the // given block. // @@ -242,14 +209,15 @@ static INLINE void integer_mv_precision(MV *mv) { // represents an integer) static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, int allow_hp, BLOCK_SIZE bsize, - int mi_col, int mi_row, int block_idx -#if CONFIG_AMVR - , - int is_integer -#endif - ) { - const int unify_bsize = CONFIG_CB4X4; + int mi_col, int mi_row, + int is_integer) { int_mv res; + + if (gm->wmtype == IDENTITY) { + res.as_int = 0; + return res; + } + const int32_t *mat = gm->wmmat; int x, y, tx, ty; @@ -265,65 +233,37 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); -#if CONFIG_AMVR if (is_integer) { integer_mv_precision(&res.as_mv); } -#endif return res; } - if (bsize >= BLOCK_8X8 || unify_bsize) { - x = block_center_x(mi_col, bsize); - y = block_center_y(mi_row, bsize); - } else { - x = block_center_x(mi_col, bsize); - y = block_center_y(mi_row, bsize); - x += (block_idx & 1) * MI_SIZE / 2; - y += (block_idx & 2) * MI_SIZE / 4; - } + x = 
block_center_x(mi_col, bsize); + y = block_center_y(mi_row, bsize); if (gm->wmtype == ROTZOOM) { assert(gm->wmmat[5] == gm->wmmat[2]); assert(gm->wmmat[4] == -gm->wmmat[3]); } - if (gm->wmtype > AFFINE) { - int xc = (int)((int64_t)mat[2] * x + (int64_t)mat[3] * y + mat[0]); - int yc = (int)((int64_t)mat[4] * x + (int64_t)mat[5] * y + mat[1]); - const int Z = (int)((int64_t)mat[6] * x + (int64_t)mat[7] * y + - (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS)); - xc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS); - yc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS); - xc = (int)(xc > 0 ? ((int64_t)xc + Z / 2) / Z : ((int64_t)xc - Z / 2) / Z); - yc = (int)(yc > 0 ? ((int64_t)yc + Z / 2) / Z : ((int64_t)yc - Z / 2) / Z); - tx = convert_to_trans_prec(allow_hp, xc) - (x << 3); - ty = convert_to_trans_prec(allow_hp, yc) - (y << 3); - } else { - const int xc = - (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; - const int yc = - mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; - tx = convert_to_trans_prec(allow_hp, xc); - ty = convert_to_trans_prec(allow_hp, yc); - } + + const int xc = + (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; + const int yc = + mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; + tx = convert_to_trans_prec(allow_hp, xc); + ty = convert_to_trans_prec(allow_hp, yc); res.as_mv.row = ty; res.as_mv.col = tx; -#if CONFIG_AMVR if (is_integer) { integer_mv_precision(&res.as_mv); } -#endif return res; } static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { - if (gm->wmmat[6] != 0 || gm->wmmat[7] != 0) { - if (!gm->wmmat[6] && !gm->wmmat[4]) return HORTRAPEZOID; - if (!gm->wmmat[7] && !gm->wmmat[3]) return VERTRAPEZOID; - return HOMOGRAPHY; - } if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); @@ -333,12 +273,10 @@ static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { else return AFFINE; } -#endif // CONFIG_GLOBAL_MOTION typedef struct candidate_mv { int_mv this_mv; int_mv comp_mv; - uint8_t pred_diff[2]; int weight; } CANDIDATE_MV; diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c index 891396e9b..6939df335 100644 --- a/third_party/aom/av1/common/mvref_common.c +++ b/third_party/aom/av1/common/mvref_common.c @@ -9,68 +9,72 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include + #include "av1/common/mvref_common.h" -#if CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION -#define USE_CUR_GM_REFMV 1 -#endif // CONFIG_GLOBAL_MOTION +// Although we assign 32 bit integers, all the values are strictly under 14 +// bits. +static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 }; + +// TODO(jingning): Consider the use of lookup table for (num / den) +// altogether. +static void get_mv_projection(MV *output, MV ref, int num, int den) { + den = AOMMIN(den, MAX_FRAME_DISTANCE); + num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) + : AOMMAX(num, -MAX_FRAME_DISTANCE); + int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int clamp_max = MV_UPP - 1; + const int clamp_min = MV_LOW + 1; + output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); + output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); +} -void av1_copy_frame_mvs(const AV1_COMMON *const cm, MODE_INFO *mi, int mi_row, - int mi_col, int x_mis, int y_mis) { -#if CONFIG_TMV +void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, + int mi_row, int mi_col, int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); - MV_REF *frame_mvs = cm->cur_frame->mvs + - ((mi_row & 0xfffe) >> 1) * frame_mvs_stride + - ((mi_col & 0xfffe) >> 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); y_mis = ROUND_POWER_OF_TWO(y_mis, 1); -#else - const int frame_mvs_stride = cm->mi_cols; - MV_REF *frame_mvs = cm->cur_frame->mvs + - (mi_row & 0xfffe) * frame_mvs_stride + (mi_col & 0xfffe); - x_mis = AOMMAX(x_mis, 2); - y_mis = AOMMAX(y_mis, 2); -#endif // CONFIG_TMV int w, h; for (h = 0; h < y_mis; h++) { - MV_REF *const frame_mv = frame_mvs + h * frame_mvs_stride; + MV_REF *mv = frame_mvs; for (w = 0; w < x_mis; w++) { - MV_REF *const mv = frame_mv + w; - mv->ref_frame[0] = mi->mbmi.ref_frame[0]; - mv->ref_frame[1] = mi->mbmi.ref_frame[1]; - mv->mv[0].as_int = mi->mbmi.mv[0].as_int; - mv->mv[1].as_int = mi->mbmi.mv[1].as_int; - // (TODO:yunqing) The following 2 lines won't be used and can be removed. 
- mv->pred_mv[0].as_int = mi->mbmi.pred_mv[0].as_int; - mv->pred_mv[1].as_int = mi->mbmi.pred_mv[1].as_int; + mv->ref_frame = NONE_FRAME; + mv->mv.as_int = 0; + + for (int idx = 0; idx < 2; ++idx) { + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; + if (ref_frame > INTRA_FRAME) { + int8_t ref_idx = cm->ref_frame_side[ref_frame]; + if (ref_idx) continue; + if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || + (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) + continue; + mv->ref_frame = ref_frame; + mv->mv.as_int = mi->mv[idx].as_int; + } + } + mv++; } + frame_mvs += frame_mvs_stride; } } -static uint8_t add_ref_mv_candidate( - const MODE_INFO *const candidate_mi, const MB_MODE_INFO *const candidate, - const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, - CANDIDATE_MV *ref_mv_stack, const int use_hp, int len, int block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int col, int weight -#if CONFIG_AMVR - , - int is_integer -#endif - ) { +static void add_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], + uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, + CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates, + const WarpedMotionParams *gm_params, int col, int weight) { + if (!is_inter_block(candidate)) return; // for intrabc int index = 0, ref; - int newmv_count = 0; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif assert(weight % 2 == 0); if (rf[1] == NONE_FRAME) { @@ -78,60 +82,24 @@ static uint8_t add_ref_mv_candidate( for (ref = 0; ref < 2; ++ref) { if (candidate->ref_frame[ref] == rf[0]) { int_mv this_refmv; -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - if (is_global_mv_block(candidate_mi, block, gm_params[rf[0]].wmtype)) + if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype)) this_refmv = gm_mv_candidates[0]; else -#endif // 
CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - this_refmv = get_sub_block_mv(candidate_mi, ref, col, block); -#if CONFIG_AMVR - lower_mv_precision(&this_refmv.as_mv, use_hp, is_integer); -#else - lower_mv_precision(&this_refmv.as_mv, use_hp); -#endif // CONFIG_AMVR + this_refmv = get_sub_block_mv(candidate, ref, col); for (index = 0; index < *refmv_count; ++index) if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break; - if (index < *refmv_count) ref_mv_stack[index].weight += weight * len; + if (index < *refmv_count) ref_mv_stack[index].weight += weight; // Add a new item to the list. - if (index == *refmv_count) { + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv; - ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, ref, col, block), this_refmv); - ref_mv_stack[index].weight = weight * len; + ref_mv_stack[index].weight = weight; ++(*refmv_count); - - if (candidate->mode == NEWMV) ++newmv_count; - } - - if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 && - !unify_bsize) { - int alt_block = 3 - block; - this_refmv = get_sub_block_mv(candidate_mi, ref, col, alt_block); -#if CONFIG_AMVR - lower_mv_precision(&this_refmv.as_mv, use_hp, is_integer); -#else - lower_mv_precision(&this_refmv.as_mv, use_hp); -#endif - for (index = 0; index < *refmv_count; ++index) - if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break; - - if (index < *refmv_count) ref_mv_stack[index].weight += len; - - // Add a new item to the list. 
- if (index == *refmv_count) { - ref_mv_stack[index].this_mv = this_refmv; - ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, ref, col, alt_block), - this_refmv); - ref_mv_stack[index].weight = len; - ++(*refmv_count); - - if (candidate->mode == NEWMV) ++newmv_count; - } } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; } } } else { @@ -140,17 +108,10 @@ static uint8_t add_ref_mv_candidate( int_mv this_refmv[2]; for (ref = 0; ref < 2; ++ref) { -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - if (is_global_mv_block(candidate_mi, block, gm_params[rf[ref]].wmtype)) + if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) this_refmv[ref] = gm_mv_candidates[ref]; else -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - this_refmv[ref] = get_sub_block_mv(candidate_mi, ref, col, block); -#if CONFIG_AMVR - lower_mv_precision(&this_refmv[ref].as_mv, use_hp, is_integer); -#else - lower_mv_precision(&this_refmv[ref].as_mv, use_hp); -#endif + this_refmv[ref] = get_sub_block_mv(candidate, ref, col); } for (index = 0; index < *refmv_count; ++index) @@ -158,94 +119,46 @@ static uint8_t add_ref_mv_candidate( (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) break; - if (index < *refmv_count) ref_mv_stack[index].weight += weight * len; + if (index < *refmv_count) ref_mv_stack[index].weight += weight; // Add a new item to the list. 
- if (index == *refmv_count) { + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv[0]; ref_mv_stack[index].comp_mv = this_refmv[1]; - ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, 0, col, block), this_refmv[0]); - ref_mv_stack[index].pred_diff[1] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, 1, col, block), this_refmv[1]); - ref_mv_stack[index].weight = weight * len; + ref_mv_stack[index].weight = weight; ++(*refmv_count); - - if (candidate->mode == NEW_NEWMV) ++newmv_count; - } - - if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 && - !unify_bsize) { - int alt_block = 3 - block; - this_refmv[0] = get_sub_block_mv(candidate_mi, 0, col, alt_block); - this_refmv[1] = get_sub_block_mv(candidate_mi, 1, col, alt_block); - - for (ref = 0; ref < 2; ++ref) { -#if CONFIG_AMVR - lower_mv_precision(&this_refmv[ref].as_mv, use_hp, is_integer); -#else - lower_mv_precision(&this_refmv[ref].as_mv, use_hp); -#endif - } - for (index = 0; index < *refmv_count; ++index) - if (ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int && - ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int) - break; - - if (index < *refmv_count) ref_mv_stack[index].weight += len; - - // Add a new item to the list. 
- if (index == *refmv_count) { - ref_mv_stack[index].this_mv = this_refmv[0]; - ref_mv_stack[index].comp_mv = this_refmv[1]; - ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, 0, col, block), - this_refmv[0]); - ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx( - get_sub_block_pred_mv(candidate_mi, 1, col, block), - this_refmv[1]); - ref_mv_stack[index].weight = len; - ++(*refmv_count); - - if (candidate->mode == NEW_NEWMV) ++newmv_count; - } } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; } } - return newmv_count; } -static uint8_t scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const int mi_col, int block, - const MV_REFERENCE_FRAME rf[2], int row_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int_mv *gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int max_row_offset, int *processed_rows) { - const int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); +static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, + CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); + end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); const int n8_w_8 = mi_size_wide[BLOCK_8X8]; const int n8_w_16 = mi_size_wide[BLOCK_16X16]; int i; - uint8_t newmv_count = 0; int col_offset = 0; -#if CONFIG_CB4X4 const int shift = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. 
if (abs(row_offset) > 1) { col_offset = 1; - if (mi_col & 0x01 && xd->n8_w < n8_w_8) --col_offset; + if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset; } const int use_step_16 = (xd->n8_w >= 16); -#else - const int shift = 1; - const int use_step_16 = (xd->n8_w >= 8); -#endif - MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; - - for (i = 0; i < end_mi && *refmv_count < MAX_REF_MV_STACK_SIZE;) { - const MODE_INFO *const candidate_mi = candidate_mi0[col_offset + i]; - const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; + (void)mi_row; + + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->sb_type; const int n8_w = mi_size_wide[candidate_bsize]; int len = AOMMIN(xd->n8_w, n8_w); @@ -264,60 +177,38 @@ static uint8_t scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, *processed_rows = inc - row_offset - 1; } -#if CONFIG_AMVR - newmv_count += add_ref_mv_candidate( - candidate_mi, candidate, rf, refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - col_offset + i, weight, cm->cur_frame_mv_precision_level); -#else - newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf, - refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - col_offset + i, weight); -#endif + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, gm_mv_candidates, + cm->global_motion, col_offset + i, len * weight); i += len; } - - return newmv_count; } -static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const int mi_row, int 
block, - const MV_REFERENCE_FRAME rf[2], int col_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int_mv *gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int max_col_offset, int *processed_cols) { - const int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); +static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, + const MV_REFERENCE_FRAME rf[2], int col_offset, + CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); + end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; int i; - uint8_t newmv_count = 0; int row_offset = 0; -#if CONFIG_CB4X4 const int shift = 0; if (abs(col_offset) > 1) { row_offset = 1; - if (mi_row & 0x01 && xd->n8_h < n8_h_8) --row_offset; + if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset; } const int use_step_16 = (xd->n8_h >= 16); -#else - const int shift = 1; - const int use_step_16 = (xd->n8_h >= 8); -#endif + (void)mi_col; - for (i = 0; i < end_mi && *refmv_count < MAX_REF_MV_STACK_SIZE;) { - const MODE_INFO *const candidate_mi = + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; - const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; const int candidate_bsize = candidate->sb_type; const int n8_h = mi_size_high[candidate_bsize]; int len = AOMMIN(xd->n8_h, n8_h); @@ -336,79 +227,46 @@ static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, *processed_cols = inc - col_offset - 1; } -#if CONFIG_AMVR - newmv_count += add_ref_mv_candidate( - candidate_mi, candidate, rf, refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION 
&& USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - col_offset, weight, cm->cur_frame_mv_precision_level); -#else - newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf, - refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - col_offset, weight); -#endif + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, gm_mv_candidates, + cm->global_motion, col_offset, len * weight); + i += len; } - - return newmv_count; } -static uint8_t scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const int mi_row, const int mi_col, int block, - const MV_REFERENCE_FRAME rf[2], int row_offset, - int col_offset, CANDIDATE_MV *ref_mv_stack, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int_mv *gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - uint8_t *refmv_count) { +static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + const int mi_row, const int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, + uint8_t refmv_count[MODE_CTX_REF_FRAMES]) { const TileInfo *const tile = &xd->tile; POSITION mi_pos; - uint8_t newmv_count = 0; mi_pos.row = row_offset; mi_pos.col = col_offset; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos) && - *refmv_count < MAX_REF_MV_STACK_SIZE) { - const MODE_INFO *const candidate_mi = + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) { + const MB_MODE_INFO *const candidate = xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; - const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; const int len = mi_size_wide[BLOCK_8X8]; -#if CONFIG_AMVR - newmv_count += add_ref_mv_candidate( - candidate_mi, 
candidate, rf, refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - mi_pos.col, 2, cm->cur_frame_mv_precision_level); -#else - newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf, - refmv_count, ref_mv_stack, - cm->allow_high_precision_mv, len, block, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - mi_pos.col, 2); -#endif + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, gm_mv_candidates, + cm->global_motion, mi_pos.col, 2 * len); } // Analyze a single 8x8 block motion information. - - return newmv_count; } static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, int bs) { - const int sb_mi_size = mi_size_wide[cm->sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; const int mask_row = mi_row & (sb_mi_size - 1); const int mask_col = mi_col & (sb_mi_size - 1); + if (bs > mi_size_wide[BLOCK_64X64]) return 0; + // In a split partition all apart from the bottom right has a top right int has_tr = !((mask_row & bs) && (mask_col & bs)); @@ -440,22 +298,20 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, if (xd->n8_w > xd->n8_h) if (xd->is_sec_rect) has_tr = 0; -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition - if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A) - if ((mask_row & bs) && !(mask_col & bs)) has_tr = 0; -#endif // CONFIG_EXT_PARTITION_TYPES + if (xd->mi[0]->partition == PARTITION_VERT_A) { + if (xd->n8_w == xd->n8_h) + if (mask_row & bs) has_tr = 0; + } return has_tr; } -#if CONFIG_MFMV -static int 
check_sb_border(const AV1_COMMON *cm, const int mi_row, - const int mi_col, const int row_offset, - const int col_offset) { - const int sb_mi_size = mi_size_wide[cm->sb_size]; +static int check_sb_border(const int mi_row, const int mi_col, + const int row_offset, const int col_offset) { + const int sb_mi_size = mi_size_wide[BLOCK_64X64]; const int row = mi_row & (sb_mi_size - 1); const int col = mi_col & (sb_mi_size - 1); @@ -466,513 +322,307 @@ static int check_sb_border(const AV1_COMMON *cm, const int mi_row, return 1; } -static int add_tpl_ref_mv(const AV1_COMMON *cm, - const MV_REF *prev_frame_mvs_base, - const MACROBLOCKD *xd, int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, int blk_row, - int blk_col, uint8_t *refmv_count, - CANDIDATE_MV *ref_mv_stack, int16_t *mode_context) { - (void)prev_frame_mvs_base; +static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, + int blk_row, int blk_col, int_mv *gm_mv_candidates, + uint8_t refmv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE], + int16_t *mode_context) { POSITION mi_pos; int idx; - int coll_blk_count = 0; const int weight_unit = 1; // mi_size_wide[BLOCK_8X8]; -#if CONFIG_MV_COMPRESS mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; -#else - mi_pos.row = blk_row; - mi_pos.col = blk_col; -#endif - if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos)) - return coll_blk_count; + if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0; - const TPL_MV_REF *prev_frame_mvs = cm->cur_frame->tpl_mvs + - (mi_row + mi_pos.row) * cm->mi_stride + - (mi_col + mi_pos.col); + const TPL_MV_REF *prev_frame_mvs = + cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) + + ((mi_col + mi_pos.col) >> 1); MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); if (rf[1] == NONE_FRAME) { - for (int i = 0; i < MFMV_STACK_SIZE; ++i) { - if (prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i].as_int != - INVALID_MV) { - int_mv this_refmv = prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i]; - lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row) >= 16 || - abs(this_refmv.as_mv.col) >= 16) - mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); - - for (idx = 0; idx < *refmv_count; ++idx) - if (abs(this_refmv.as_mv.row - ref_mv_stack[idx].this_mv.as_mv.row) < - 4 && - abs(this_refmv.as_mv.col - ref_mv_stack[idx].this_mv.as_mv.col) < - 4) - break; - - if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit; - - if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - // TODO(jingning): Hard coded context number. Need to make it better - // sense. 
- ref_mv_stack[idx].pred_diff[0] = 1; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(*refmv_count); - } + int cur_frame_index = cm->cur_frame->cur_frame_offset; + int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; + int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; + int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); + CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]]; + + if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { + int_mv this_refmv; + + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, + cm->cur_frame_force_integer_mv); + + if (blk_row == 0 && blk_col == 0) + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (idx = 0; idx < refmv_count[rf[0]]; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; - ++coll_blk_count; + if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit; + + if (idx == refmv_count[rf[0]] && + refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].weight = 2 * weight_unit; + ++(refmv_count[rf[0]]); } + return 1; } } else { // Process compound inter mode - for (int i = 0; i < MFMV_STACK_SIZE; ++i) { - if (prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i].as_int != INVALID_MV && - prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i].as_int != INVALID_MV) { - int_mv this_refmv = prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i]; - int_mv comp_refmv = prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i]; - lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv); - lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row) >= 16 || - 
abs(this_refmv.as_mv.col) >= 16 || - abs(comp_refmv.as_mv.row) >= 16 || - abs(comp_refmv.as_mv.col) >= 16) - mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); - - for (idx = 0; idx < *refmv_count; ++idx) - if (abs(this_refmv.as_mv.row - ref_mv_stack[idx].this_mv.as_mv.row) < - 4 && - abs(this_refmv.as_mv.col - ref_mv_stack[idx].this_mv.as_mv.col) < - 4 && - abs(comp_refmv.as_mv.row - ref_mv_stack[idx].comp_mv.as_mv.row) < - 4 && - abs(comp_refmv.as_mv.col - ref_mv_stack[idx].comp_mv.as_mv.col) < - 4) - break; - - if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit; - - if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; - // TODO(jingning): Hard coded context number. Need to make it better - // sense. - ref_mv_stack[idx].pred_diff[0] = 1; - ref_mv_stack[idx].pred_diff[1] = 1; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(*refmv_count); - } + int cur_frame_index = cm->cur_frame->cur_frame_offset; + int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; + int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; + + int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); + int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx; + int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset; + int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index); + CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame]; + + if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { + int_mv this_refmv; + int_mv comp_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); - ++coll_blk_count; - } - } - } - - return coll_blk_count; -} -#else -static int add_col_ref_mv(const AV1_COMMON *cm, - 
const MV_REF *prev_frame_mvs_base, - int prev_frame_mvs_stride, const MACROBLOCKD *xd, - int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, - int blk_row, int blk_col, uint8_t *refmv_count, - CANDIDATE_MV *ref_mv_stack, int16_t *mode_context) { -#if CONFIG_TMV - const MV_REF *prev_frame_mvs = prev_frame_mvs_base + - ((blk_row + 1) >> 1) * prev_frame_mvs_stride + - ((blk_col + 1) >> 1); -#else - const MV_REF *prev_frame_mvs = - prev_frame_mvs_base + blk_row * prev_frame_mvs_stride + blk_col; -#endif - POSITION mi_pos; - int ref, idx; - int coll_blk_count = 0; - const int weight_unit = mi_size_wide[BLOCK_8X8]; - -#if CONFIG_TMV - mi_pos.row = blk_row; - mi_pos.col = blk_col; -#else -#if CONFIG_MV_COMPRESS - mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; - mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1; -#else - mi_pos.row = blk_row; - mi_pos.col = blk_col; -#endif -#endif // CONFIG_TMV - - if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos)) - return coll_blk_count; - for (ref = 0; ref < 2; ++ref) { - if (prev_frame_mvs->ref_frame[ref] == ref_frame) { - int_mv this_refmv = prev_frame_mvs->mv[ref]; -#if CONFIG_AMVR lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv); -#endif + cm->cur_frame_force_integer_mv); + lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv, + cm->cur_frame_force_integer_mv); -#if CONFIG_OPT_REF_MV if (blk_row == 0 && blk_col == 0) -#endif - { - if (abs(this_refmv.as_mv.row) >= 16 || abs(this_refmv.as_mv.col) >= 16) - mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); - } - - for (idx = 0; idx < *refmv_count; ++idx) - if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - 
gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (idx = 0; idx < refmv_count[ref_frame]; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; - if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit; + if (idx < refmv_count[ref_frame]) + ref_mv_stack[idx].weight += 2 * weight_unit; - if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + if (idx == refmv_count[ref_frame] && + refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].pred_diff[0] = - av1_get_pred_diff_ctx(prev_frame_mvs->pred_mv[ref], this_refmv); + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; ref_mv_stack[idx].weight = 2 * weight_unit; - ++(*refmv_count); + ++(refmv_count[ref_frame]); } - - ++coll_blk_count; + return 1; } } - - return coll_blk_count; + return 0; } -#endif - -static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd, - MV_REFERENCE_FRAME ref_frame, - uint8_t *refmv_count, CANDIDATE_MV *ref_mv_stack, - int_mv *mv_ref_list, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int_mv *gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - int block, int mi_row, int mi_col, - int16_t *mode_context) { - int idx, nearest_refmv_count = 0; - uint8_t newmv_count = 0; - CANDIDATE_MV tmp_mv; - int len, nr_len; - -#if CONFIG_TMV - const int prev_frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); - const int tmi_row = mi_row & 0xfffe; - const int tmi_col = mi_col & 0xfffe; - const MV_REF *const prev_frame_mvs_base = - cm->use_prev_frame_mvs - ? 
cm->prev_frame->mvs + (tmi_row >> 1) * prev_frame_mvs_stride + - (tmi_col >> 1) - : NULL; -#else - const int prev_frame_mvs_stride = cm->mi_cols; -#if CONFIG_MV_COMPRESS - const MV_REF *const prev_frame_mvs_base = - cm->use_prev_frame_mvs - ? cm->prev_frame->mvs + - (((mi_row >> 1) << 1) + 1) * prev_frame_mvs_stride + - ((mi_col >> 1) << 1) + 1 - : NULL; -#else - const MV_REF *const prev_frame_mvs_base = - cm->use_prev_frame_mvs - ? cm->prev_frame->mvs + mi_row * prev_frame_mvs_stride + mi_col - : NULL; -#endif -#endif // CONFIG_TMV +static void setup_ref_mv_list( + const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, + uint8_t refmv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + int mi_row, int mi_col, int16_t *mode_context) { const int bs = AOMMAX(xd->n8_w, xd->n8_h); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; -#if CONFIG_CB4X4 const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); -#endif int processed_rows = 0; int processed_cols = 0; - int row_offset, col_offset; av1_set_ref_frame(rf, ref_frame); mode_context[ref_frame] = 0; - *refmv_count = 0; + refmv_count[ref_frame] = 0; // Find valid maximum row/col offset. 
if (xd->up_available) { -#if CONFIG_CB4X4 - max_row_offset = -(MVREF_ROWS << 1) + row_adj; -#else - max_row_offset = -MVREF_ROWS; -#endif + max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; + + if (xd->n8_h < mi_size_high[BLOCK_8X8]) + max_row_offset = -(2 << 1) + row_adj; + max_row_offset = - find_valid_row_offset(tile, mi_row, cm->mi_rows, cm, max_row_offset); + find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset); } if (xd->left_available) { -#if CONFIG_CB4X4 - max_col_offset = -(MVREF_COLS << 1) + col_adj; -#else - max_col_offset = -MVREF_COLS; -#endif + max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; + + if (xd->n8_w < mi_size_wide[BLOCK_8X8]) + max_col_offset = -(2 << 1) + col_adj; + max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); } + uint8_t col_match_count = 0; + uint8_t row_match_count = 0; + uint8_t newmv_count = 0; + // Scan the first above row mode info. row_offset = -1; if (abs(max_row_offset) >= 1) - newmv_count += - scan_row_mbmi(cm, xd, mi_col, block, rf, -1, ref_mv_stack, refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - max_row_offset, &processed_rows); + scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], + &refmv_count[ref_frame], &row_match_count, &newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); // Scan the first left column mode info. 
col_offset = -1; if (abs(max_col_offset) >= 1) - newmv_count += - scan_col_mbmi(cm, xd, mi_row, block, rf, -1, ref_mv_stack, refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - max_col_offset, &processed_cols); + scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], + &refmv_count[ref_frame], &col_match_count, &newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) - newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, - xd->n8_w, ref_mv_stack, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - refmv_count); - - nearest_refmv_count = *refmv_count; - - for (idx = 0; idx < nearest_refmv_count; ++idx) - ref_mv_stack[idx].weight += REF_CAT_LEVEL; - -#if CONFIG_MFMV - int blk_row, blk_col; - int coll_blk_count = 0; - int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h); - int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w); - - int tpl_sample_pos[9][2] = { - { -2, hoffset }, { 0, hoffset }, { voffset, hoffset }, - { voffset, 0 }, { voffset, -2 }, { voffset, -4 }, - { -4, hoffset }, { voffset, 4 }, { 2, hoffset + 4 }, - }; - int i; - - for (blk_row = 0; blk_row < xd->n8_h; blk_row += mi_size_high[BLOCK_8X8]) { - for (blk_col = 0; blk_col < xd->n8_w; blk_col += mi_size_wide[BLOCK_8X8]) { - // (TODO: yunqing) prev_frame_mvs_base is not used here, tpl_mvs is used. - // Can be modified the same way. 
- int is_available = add_tpl_ref_mv( - cm, prev_frame_mvs_base, xd, mi_row, mi_col, ref_frame, blk_row, - blk_col, refmv_count, ref_mv_stack, mode_context); - if (blk_row == 0 && blk_col == 0) coll_blk_count = is_available; + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w, + ref_mv_stack[ref_frame], &row_match_count, &newmv_count, + gm_mv_candidates, &refmv_count[ref_frame]); + + uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + uint8_t nearest_refmv_count = refmv_count[ref_frame]; + + // TODO(yunqing): for comp_search, do it for all 3 cases. + for (int idx = 0; idx < nearest_refmv_count; ++idx) + ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL; + + if (cm->allow_ref_frame_mvs) { + int is_available = 0; + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w); + const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]); + + const int tpl_sample_pos[3][2] = { + { voffset, -2 }, + { voffset, hoffset }, + { voffset - 2, hoffset }, + }; + const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) && + (xd->n8_h < mi_size_high[BLOCK_64X64]) && + (xd->n8_w >= mi_size_wide[BLOCK_8X8]) && + (xd->n8_w < mi_size_wide[BLOCK_64X64]); + + int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64]) + ? 
mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; + + for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { + for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { + int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, + blk_col, gm_mv_candidates, refmv_count, + ref_mv_stack, mode_context); + if (blk_row == 0 && blk_col == 0) is_available = ret; + } } - } - if (coll_blk_count == 0) mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); + if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); - for (i = 0; i < 9; ++i) { - blk_row = tpl_sample_pos[i][0]; - blk_col = tpl_sample_pos[i][1]; + for (int i = 0; i < 3 && allow_extension; ++i) { + const int blk_row = tpl_sample_pos[i][0]; + const int blk_col = tpl_sample_pos[i][1]; - if (!check_sb_border(cm, mi_row, mi_col, blk_row, blk_col)) continue; - // (TODO: yunqing) prev_frame_mvs_base is not used here, tpl_mvs is used. - // Can be modified the same way. - coll_blk_count += add_tpl_ref_mv(cm, prev_frame_mvs_base, xd, mi_row, - mi_col, ref_frame, blk_row, blk_col, - refmv_count, ref_mv_stack, mode_context); - } -#else -#if CONFIG_TEMPMV_SIGNALING - if (cm->use_prev_frame_mvs && rf[1] == NONE_FRAME) -#else - if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame && - rf[1] == NONE_FRAME) -#endif - { - int blk_row, blk_col; - int coll_blk_count = 0; -#if CONFIG_CB4X4 - const int mi_step = (xd->n8_w == 1 || xd->n8_h == 1) - ? mi_size_wide[BLOCK_8X8] - : mi_size_wide[BLOCK_16X16]; -#else - const int mi_step = mi_size_wide[BLOCK_16X16]; -#endif - -#if CONFIG_TPL_MV - // Modified sample positions to be consistent with frame_mvs - // spatial resolution. 
- int tpl_sample_pos[5][2] = { { -1, xd->n8_w }, - { 0, xd->n8_w }, - { xd->n8_h, xd->n8_w }, - { xd->n8_h, 0 }, - { xd->n8_h, -1 } }; - int i; -#endif - - for (blk_row = 0; blk_row < xd->n8_h; blk_row += mi_step) { - for (blk_col = 0; blk_col < xd->n8_w; blk_col += mi_step) { -#if CONFIG_TMV - int is_available = - add_col_ref_mv(cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, - tmi_row, tmi_col, ref_frame, blk_row, blk_col, - refmv_count, ref_mv_stack, mode_context); -#else - int is_available = - add_col_ref_mv(cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, - mi_row, mi_col, ref_frame, blk_row, blk_col, - refmv_count, ref_mv_stack, mode_context); -#endif // CONFIG_TMV -#if CONFIG_OPT_REF_MV - if (blk_row == 0 && blk_col == 0) coll_blk_count = is_available; -#else - coll_blk_count += is_available; -#endif - } - } - -#if CONFIG_TPL_MV - for (i = 0; i < 5; ++i) { - blk_row = tpl_sample_pos[i][0]; - blk_col = tpl_sample_pos[i][1]; -#if CONFIG_TMV - coll_blk_count += add_col_ref_mv( - cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, tmi_row, tmi_col, - ref_frame, blk_row, blk_col, refmv_count, ref_mv_stack, mode_context); -#else - coll_blk_count += add_col_ref_mv( - cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, mi_row, mi_col, - ref_frame, blk_row, blk_col, refmv_count, ref_mv_stack, mode_context); -#endif // CONFIG_TMV + if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; + add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, + gm_mv_candidates, refmv_count, ref_mv_stack, mode_context); } -#endif - - if (coll_blk_count == 0) mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); - } else { - mode_context[ref_frame] |= (1 << ZEROMV_OFFSET); } -#endif + + uint8_t dummy_newmv_count = 0; // Scan the second outer area. 
- scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, -1, ref_mv_stack, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - refmv_count); - for (idx = 2; idx <= MVREF_ROWS; ++idx) { -#if CONFIG_CB4X4 - row_offset = -(idx << 1) + 1 + row_adj; - col_offset = -(idx << 1) + 1 + col_adj; -#else - row_offset = -idx; - col_offset = -idx; -#endif + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame], + &row_match_count, &dummy_newmv_count, gm_mv_candidates, + &refmv_count[ref_frame]); + + for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { + const int row_offset = -(idx << 1) + 1 + row_adj; + const int col_offset = -(idx << 1) + 1 + col_adj; if (abs(row_offset) <= abs(max_row_offset) && abs(row_offset) > processed_rows) - scan_row_mbmi(cm, xd, mi_col, block, rf, row_offset, ref_mv_stack, - refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV + scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset, + ref_mv_stack[ref_frame], &refmv_count[ref_frame], + &row_match_count, &dummy_newmv_count, gm_mv_candidates, max_row_offset, &processed_rows); if (abs(col_offset) <= abs(max_col_offset) && abs(col_offset) > processed_cols) - scan_col_mbmi(cm, xd, mi_row, block, rf, col_offset, ref_mv_stack, - refmv_count, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV + scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset, + ref_mv_stack[ref_frame], &refmv_count[ref_frame], + &col_match_count, &dummy_newmv_count, gm_mv_candidates, max_col_offset, &processed_cols); } -#if CONFIG_CB4X4 - col_offset = -(MVREF_COLS << 1) + 1 + col_adj; -#else - col_offset = -MVREF_COLS; -#endif - if (abs(col_offset) <= abs(max_col_offset) && - abs(col_offset) > processed_cols) - scan_col_mbmi(cm, xd, mi_row, block, rf, col_offset, ref_mv_stack, - refmv_count, -#if 
CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - gm_mv_candidates, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - max_col_offset, &processed_cols); - - switch (nearest_refmv_count) { - case 0: mode_context[ref_frame] |= 0; -#if !CONFIG_OPT_REF_MV - if (*refmv_count >= 1) mode_context[ref_frame] |= 1; - if (*refmv_count == 1) + uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + + switch (nearest_match) { + case 0: + mode_context[ref_frame] |= 0; + if (ref_match_count >= 1) mode_context[ref_frame] |= 1; + if (ref_match_count == 1) mode_context[ref_frame] |= (1 << REFMV_OFFSET); - else if (*refmv_count >= 2) + else if (ref_match_count >= 2) mode_context[ref_frame] |= (2 << REFMV_OFFSET); -#endif break; - case 1: mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; -#if CONFIG_OPT_REF_MV - mode_context[ref_frame] |= (3 << REFMV_OFFSET); -#else - if (*refmv_count == 1) + case 1: + mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; + if (ref_match_count == 1) mode_context[ref_frame] |= (3 << REFMV_OFFSET); - else if (*refmv_count >= 2) + else if (ref_match_count >= 2) mode_context[ref_frame] |= (4 << REFMV_OFFSET); -#endif break; - case 2: default: - if (newmv_count >= 2) + if (newmv_count >= 1) mode_context[ref_frame] |= 4; - else if (newmv_count == 1) - mode_context[ref_frame] |= 5; else - mode_context[ref_frame] |= 6; + mode_context[ref_frame] |= 5; mode_context[ref_frame] |= (5 << REFMV_OFFSET); break; } // Rank the likelihood and assign nearest and near mvs. 
- len = nearest_refmv_count; + int len = nearest_refmv_count; while (len > 0) { - nr_len = 0; - for (idx = 1; idx < len; ++idx) { - if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) { - tmp_mv = ref_mv_stack[idx - 1]; - ref_mv_stack[idx - 1] = ref_mv_stack[idx]; - ref_mv_stack[idx] = tmp_mv; + int nr_len = 0; + for (int idx = 1; idx < len; ++idx) { + if (ref_mv_stack[ref_frame][idx - 1].weight < + ref_mv_stack[ref_frame][idx].weight) { + CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; + ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; + ref_mv_stack[ref_frame][idx] = tmp_mv; nr_len = idx; } } len = nr_len; } - len = *refmv_count; + len = refmv_count[ref_frame]; while (len > nearest_refmv_count) { - nr_len = nearest_refmv_count; - for (idx = nearest_refmv_count + 1; idx < len; ++idx) { - if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) { - tmp_mv = ref_mv_stack[idx - 1]; - ref_mv_stack[idx - 1] = ref_mv_stack[idx]; - ref_mv_stack[idx] = tmp_mv; + int nr_len = nearest_refmv_count; + for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { + if (ref_mv_stack[ref_frame][idx - 1].weight < + ref_mv_stack[ref_frame][idx].weight) { + CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; + ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; + ref_mv_stack[ref_frame][idx] = tmp_mv; nr_len = idx; } } @@ -980,595 +630,324 @@ static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd, } if (rf[1] > NONE_FRAME) { - for (idx = 0; idx < *refmv_count; ++idx) { - clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - } - } else { - for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) { - mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; - clamp_mv_ref(&mv_ref_list[idx].as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << 
MI_SIZE_LOG2, xd); - } - } -} + // TODO(jingning, yunqing): Refactor and consolidate the compound and + // single reference frame modes. Reduce unnecessary redundancy. + if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) { + int_mv ref_id[2][2], ref_diff[2][2]; + int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; + + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); + int mi_size = AOMMIN(mi_width, mi_height); + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + const int candidate_bsize = candidate->sb_type; + + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } + idx += mi_size_wide[candidate_bsize]; + } -// This function searches the neighbourhood of a given MB/SB -// to try and find candidate reference vectors. 
-static void find_mv_refs_idx(const AV1_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int block, int mi_row, - int mi_col, find_mv_refs_sync sync, - void *const data, int16_t *mode_context, - int_mv zeromv) { - const int *ref_sign_bias = cm->ref_frame_sign_bias; - const int sb_mi_size = mi_size_wide[cm->sb_size]; - int i, refmv_count = 0; - int different_ref_found = 0; - int context_counter = 0; - -#if CONFIG_TMV - int tmi_row = mi_row & 0xfffe; - int tmi_col = mi_col & 0xfffe; - POSITION mi_pos = { 0, 0 }; - int inside = is_inside(&xd->tile, tmi_col, tmi_row, cm->mi_rows, cm, &mi_pos); - const MV_REF *const prev_frame_mvs = - cm->use_prev_frame_mvs && inside - ? cm->prev_frame->mvs + (tmi_row >> 1) * ((cm->mi_cols + 1) >> 1) + - (tmi_col >> 1) - : NULL; -#else -#if CONFIG_MV_COMPRESS - const TileInfo *const tile_ = &xd->tile; - int mi_row_end = tile_->mi_row_end; - int mi_col_end = tile_->mi_col_end; - const MV_REF *const prev_frame_mvs = - cm->use_prev_frame_mvs - ? cm->prev_frame->mvs + - AOMMIN(((mi_row >> 1) << 1) + 1 + (((xd->n8_h - 1) >> 1) << 1), - mi_row_end - 1) * - cm->mi_cols + - AOMMIN(((mi_col >> 1) << 1) + 1 + (((xd->n8_w - 1) >> 1) << 1), - mi_col_end - 1) - : NULL; -#else - const MV_REF *const prev_frame_mvs = - cm->use_prev_frame_mvs - ? 
cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col - : NULL; -#endif -#endif // CONFIG_TMV - -#if CONFIG_INTRABC - assert(IMPLIES(ref_frame == INTRA_FRAME, cm->use_prev_frame_mvs == 0)); -#endif - const TileInfo *const tile = &xd->tile; - const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int bw = block_size_wide[AOMMAX(bsize, BLOCK_8X8)]; - const int bh = block_size_high[AOMMAX(bsize, BLOCK_8X8)]; - POSITION mv_ref_search[MVREF_NEIGHBOURS]; - const int num_8x8_blocks_wide = num_8x8_blocks_wide_lookup[bsize]; - const int num_8x8_blocks_high = num_8x8_blocks_high_lookup[bsize]; - mv_ref_search[0].row = num_8x8_blocks_high - 1; - mv_ref_search[0].col = -1; - mv_ref_search[1].row = -1; - mv_ref_search[1].col = num_8x8_blocks_wide - 1; - mv_ref_search[2].row = -1; - mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1; - mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1; - mv_ref_search[3].col = -1; - mv_ref_search[4].row = -1; - mv_ref_search[4].col = -1; -#if CONFIG_EXT_PARTITION_TYPES - if (num_8x8_blocks_wide == num_8x8_blocks_high) { - mv_ref_search[5].row = -1; - mv_ref_search[5].col = 0; - mv_ref_search[6].row = 0; - mv_ref_search[6].col = -1; - } else { - mv_ref_search[5].row = -1; - mv_ref_search[5].col = num_8x8_blocks_wide; - mv_ref_search[6].row = num_8x8_blocks_high; - mv_ref_search[6].col = -1; - } -#else - mv_ref_search[5].row = -1; - mv_ref_search[5].col = num_8x8_blocks_wide; - mv_ref_search[6].row = num_8x8_blocks_high; - mv_ref_search[6].col = -1; -#endif // CONFIG_EXT_PARTITION_TYPES - mv_ref_search[7].row = -1; - mv_ref_search[7].col = -3; - mv_ref_search[8].row = num_8x8_blocks_high - 1; - mv_ref_search[8].col = -3; - -#if CONFIG_CB4X4 - for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - mv_ref_search[i].row *= 2; - mv_ref_search[i].col *= 2; - } -#endif // CONFIG_CB4X4 - - // The nearest 2 blocks are treated differently - // if the size < 8x8 we get the mv from the bmi substructure, - // and we also need to keep a mode count. 
- for (i = 0; i < 2; ++i) { - const POSITION *const mv_ref = &mv_ref_search[i]; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) { - const MODE_INFO *const candidate_mi = - xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; - const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; - // Keep counts for entropy encoding. - context_counter += mode_2_counter[candidate->mode]; - different_ref_found = 1; - - if (candidate->ref_frame[0] == ref_frame) - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), - refmv_count, mv_ref_list, bw, bh, xd, Done); - else if (candidate->ref_frame[1] == ref_frame) - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), - refmv_count, mv_ref_list, bw, bh, xd, Done); - } - } + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + const int candidate_bsize = candidate->sb_type; + + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } + idx += mi_size_high[candidate_bsize]; + } - // Check the rest of the neighbors in much the same way - // as before except we don't need to keep track of sub blocks or - // mode counts. 
- for (; i < MVREF_NEIGHBOURS; ++i) { - const POSITION *const mv_ref = &mv_ref_search[i]; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) { - const MB_MODE_INFO *const candidate = - !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride] - ? NULL - : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; - if (candidate == NULL) continue; - if ((mi_row & (sb_mi_size - 1)) + mv_ref->row >= sb_mi_size || - (mi_col & (sb_mi_size - 1)) + mv_ref->col >= sb_mi_size) - continue; - different_ref_found = 1; - - if (candidate->ref_frame[0] == ref_frame) - ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, bw, bh, xd, - Done); - else if (candidate->ref_frame[1] == ref_frame) - ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, bw, bh, xd, - Done); + // Build up the compound mv predictor + int_mv comp_list[3][2]; + + for (int idx = 0; idx < 2; ++idx) { + int comp_idx = 0; + for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_id[idx][list_idx]; + for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; + for (; comp_idx < 3; ++comp_idx) + comp_list[comp_idx][idx] = gm_mv_candidates[idx]; + } + + if (refmv_count[ref_frame]) { + assert(refmv_count[ref_frame] == 1); + if (comp_list[0][0].as_int == + ref_mv_stack[ref_frame][0].this_mv.as_int && + comp_list[0][1].as_int == + ref_mv_stack[ref_frame][0].comp_mv.as_int) { + ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = + comp_list[1][0]; + ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = + comp_list[1][1]; + } else { + ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = + comp_list[0][0]; + ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = + comp_list[0][1]; + } + ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; + ++refmv_count[ref_frame]; + } else { + for (int idx = 0; idx < 
MAX_MV_REF_CANDIDATES; ++idx) { + ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = + comp_list[idx][0]; + ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = + comp_list[idx][1]; + ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; + ++refmv_count[ref_frame]; + } + } } - } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. The sync here is unncessary if use_perv_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif + assert(refmv_count[ref_frame] >= 2); - // Check the last frame's mode and mv info. - if (cm->use_prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); + for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { + clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, + xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv, + xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); } + } else { + // Handle single reference frame extension + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); + int mi_size = AOMMIN(mi_width, mi_height); + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && + refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + const int candidate_bsize = candidate->sb_type; + + // TODO(jingning): Refactor the following code. 
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { + int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, bw, bh, - xd, Done); - } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, bw, bh, - xd, Done); - } - } + if (stack_idx == refmv_count[ref_frame]) { + ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; - // Since we couldn't find 2 mvs from the same reference frame - // go back through the neighbors and find motion vectors from - // different reference frames. - if (different_ref_found) { - for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - const POSITION *mv_ref = &mv_ref_search[i]; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) { - const MB_MODE_INFO *const candidate = - !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride] - ? NULL - : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; - if (candidate == NULL) continue; - if ((mi_row & (sb_mi_size - 1)) + mv_ref->row >= sb_mi_size || - (mi_col & (sb_mi_size - 1)) + mv_ref->col >= sb_mi_size) - continue; - - // If the candidate is INTRA we don't want to consider its mv. - IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias, - refmv_count, mv_ref_list, bw, bh, xd, Done); + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. 
+ ref_mv_stack[ref_frame][stack_idx].weight = 2; + ++refmv_count[ref_frame]; + } + } } + idx += mi_size_wide[candidate_bsize]; } - } - // Since we still don't have a candidate we'll try the last frame. - if (cm->use_prev_frame_mvs) { - if (prev_frame_mvs->ref_frame[0] != ref_frame && - prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { - int_mv mv = prev_frame_mvs->mv[0]; - if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != - ref_sign_bias[ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && + refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + const int candidate_bsize = candidate->sb_type; + + // TODO(jingning): Refactor the following code. + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { + int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == refmv_count[ref_frame]) { + ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. 
+ ref_mv_stack[ref_frame][stack_idx].weight = 2; + ++refmv_count[ref_frame]; + } + } } - ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done); + idx += mi_size_high[candidate_bsize]; } - if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && - prev_frame_mvs->ref_frame[1] != ref_frame) { - int_mv mv = prev_frame_mvs->mv[1]; - if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != - ref_sign_bias[ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done); + for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { + clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, + xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); } - } -Done: - if (mode_context) - mode_context[ref_frame] = counter_to_context[context_counter]; - for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i) - mv_ref_list[i].as_int = zeromv.as_int; -} + if (mv_ref_list != NULL) { + for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int; -// This function keeps a mode count for a given MB/SB -void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int block, int mi_row, - int mi_col, int16_t *mode_context) { - int i, refmv_count = 0; - int context_counter = 0; - const int bw = block_size_wide[mi->mbmi.sb_type]; - const int bh = block_size_high[mi->mbmi.sb_type]; - const TileInfo *const tile = &xd->tile; - POSITION mv_ref_search[2]; - const int num_8x8_blocks_wide = mi_size_wide[mi->mbmi.sb_type]; - const int num_8x8_blocks_high = mi_size_high[mi->mbmi.sb_type]; - - mv_ref_search[0].row = num_8x8_blocks_high - 1; - mv_ref_search[0].col = -1; - mv_ref_search[1].row = -1; - mv_ref_search[1].col = num_8x8_blocks_wide - 1; - - // Blank the reference vector list - memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); - - // The nearest 2 blocks are 
examined only. - // If the size < 8x8, we get the mv from the bmi substructure; - for (i = 0; i < 2; ++i) { - const POSITION *const mv_ref = &mv_ref_search[i]; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) { - const MODE_INFO *const candidate_mi = - xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; - const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; - - // Keep counts for entropy encoding. - context_counter += mode_2_counter[candidate->mode]; - - if (candidate->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), - refmv_count, mv_ref_list, bw, bh, xd, Done); - } else if (candidate->ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), - refmv_count, mv_ref_list, bw, bh, xd, Done); + for (int idx = 0; + idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) { + mv_ref_list[rf[0]][idx].as_int = + ref_mv_stack[ref_frame][idx].this_mv.as_int; } } } - -Done: - - if (mode_context) - mode_context[ref_frame] = counter_to_context[context_counter]; } void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack, - int16_t *compound_mode_context, int_mv *mv_ref_list, - int mi_row, int mi_col, find_mv_refs_sync sync, - void *const data, int16_t *mode_context) { + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int mi_row, int mi_col, + int16_t *mode_context) { int_mv zeromv[2]; -#if CONFIG_GLOBAL_MOTION - BLOCK_SIZE bsize = mi->mbmi.sb_type; -#endif // CONFIG_GLOBAL_MOTION - int idx, all_zero = 1; -#if CONFIG_GLOBAL_MOTION + BLOCK_SIZE bsize = mi->sb_type; MV_REFERENCE_FRAME rf[2]; -#endif // CONFIG_GLOBAL_MOTION - - av1_update_mv_context(cm, xd, mi, ref_frame, mv_ref_list, 
-1, mi_row, mi_col, - compound_mode_context); - -#if CONFIG_GLOBAL_MOTION - if (!CONFIG_INTRABC || ref_frame != INTRA_FRAME) { - av1_set_ref_frame(rf, ref_frame); - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; + av1_set_ref_frame(rf, ref_frame); + + if (ref_frame < REF_FRAMES) { + if (ref_frame != INTRA_FRAME) { + global_mvs[ref_frame] = gm_get_motion_vector( + &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, + mi_col, mi_row, cm->cur_frame_force_integer_mv); + } else { + global_mvs[ref_frame].as_int = INVALID_MV; + } + } + + if (ref_frame != INTRA_FRAME) { + zeromv[0].as_int = + gm_get_motion_vector(&cm->global_motion[rf[0]], + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + cm->cur_frame_force_integer_mv) + .as_int; zeromv[1].as_int = (rf[1] != NONE_FRAME) ? gm_get_motion_vector(&cm->global_motion[rf[1]], cm->allow_high_precision_mv, bsize, mi_col, - mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) + mi_row, cm->cur_frame_force_integer_mv) .as_int : 0; } else { zeromv[0].as_int = zeromv[1].as_int = 0; } -#else - zeromv[0].as_int = zeromv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - - if (ref_frame <= ALTREF_FRAME) - find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col, - sync, data, mode_context, zeromv[0]); setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - zeromv, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - -1, mi_row, mi_col, mode_context); - /* Note: If global motion is enabled, then we want to set the ALL_ZERO flag - iff all of the MVs we could generate with NEARMV/NEARESTMV are equivalent - to the global motion vector. 
- Note: For the following to work properly, the encoder can't throw away - any global motion models after calling this function, even if they are - unused. Instead we rely on the recode loop: If any non-IDENTITY model - is unused, the whole frame will be re-encoded without it. - The problem is that, otherwise, we can end up in the following situation: - * Encoder has a global motion model with nonzero translational part, - and all candidate MVs are zero. So the ALL_ZERO flag is unset. - * Encoder throws away global motion because it is never used. - * Decoder sees that there is no global motion and all candidate MVs are - zero, so sets the ALL_ZERO flag. - * This leads to an encode/decode mismatch. - */ - for (idx = 0; idx < AOMMIN(3, *ref_mv_count); ++idx) { - if (ref_mv_stack[idx].this_mv.as_int != zeromv[0].as_int) all_zero = 0; - if (ref_frame > ALTREF_FRAME) - if (ref_mv_stack[idx].comp_mv.as_int != zeromv[1].as_int) all_zero = 0; - } - if (*ref_mv_count < 2 && ref_frame <= ALTREF_FRAME) { - for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) - if (mv_ref_list[idx].as_int != zeromv[0].as_int) all_zero = 0; - } - -#if !CONFIG_OPT_REF_MV - if (all_zero) mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET); -#else - (void)all_zero; -#endif + zeromv, mi_row, mi_col, mode_context); } void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, - int_mv *near_mv -#if CONFIG_AMVR - , - int is_integer -#endif - ) { + int_mv *near_mv, int is_integer) { int i; // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { -#if CONFIG_AMVR lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); -#else - lower_mv_precision(&mvlist[i].as_mv, allow_hp); -#endif } *nearest_mv = mvlist[0]; *near_mv = mvlist[1]; } -void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd, - int block, int ref, int mi_row, int mi_col, - CANDIDATE_MV *ref_mv_stack, - uint8_t *ref_mv_count, int_mv *mv_list, - int_mv 
*nearest_mv, int_mv *near_mv) { - MODE_INFO *const mi = xd->mi[0]; - b_mode_info *bmi = mi->bmi; - int n; - int_mv zeromv; - CANDIDATE_MV tmp_mv; - uint8_t idx; - uint8_t above_count = 0, left_count = 0; - MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE_FRAME }; - *ref_mv_count = 0; - - assert(MAX_MV_REF_CANDIDATES == 2); - -#if CONFIG_GLOBAL_MOTION - zeromv.as_int = gm_get_motion_vector(&cm->global_motion[rf[0]], - cm->allow_high_precision_mv, - mi->mbmi.sb_type, mi_col, mi_row, block -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - zeromv.as_int = 0; -#endif - find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block, mi_row, - mi_col, NULL, NULL, NULL, zeromv); - - scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 0, ref_mv_stack, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - &zeromv, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - ref_mv_count); - above_count = *ref_mv_count; - - scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, 0, -1, ref_mv_stack, -#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - &zeromv, -#endif // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV - ref_mv_count); - left_count = *ref_mv_count - above_count; - - if (above_count > 1 && left_count > 0) { - tmp_mv = ref_mv_stack[1]; - ref_mv_stack[1] = ref_mv_stack[above_count]; - ref_mv_stack[above_count] = tmp_mv; - } - - for (idx = 0; idx < *ref_mv_count; ++idx) - clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - - for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx) - mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; - - near_mv->as_int = 0; - switch (block) { - case 0: - nearest_mv->as_int = mv_list[0].as_int; - near_mv->as_int = mv_list[1].as_int; - break; - case 1: - case 2: - nearest_mv->as_int = bmi[0].as_mv[ref].as_int; - for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) - if (nearest_mv->as_int != mv_list[n].as_int) { - near_mv->as_int = 
mv_list[n].as_int; - break; - } - break; - case 3: { - int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; - candidates[0] = bmi[1].as_mv[ref]; - candidates[1] = bmi[0].as_mv[ref]; - candidates[2] = mv_list[0]; - candidates[3] = mv_list[1]; - - nearest_mv->as_int = bmi[2].as_mv[ref].as_int; - for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) - if (nearest_mv->as_int != candidates[n].as_int) { - near_mv->as_int = candidates[n].as_int; - break; - } - break; - } - default: assert(0 && "Invalid block index."); - } -} - -#if CONFIG_FRAME_MARKER void av1_setup_frame_buf_refs(AV1_COMMON *cm) { cm->cur_frame->cur_frame_offset = cm->frame_offset; - int alt_buf_idx = cm->frame_refs[ALTREF_FRAME - LAST_FRAME].idx; - int lst_buf_idx = cm->frame_refs[LAST_FRAME - LAST_FRAME].idx; - int gld_buf_idx = cm->frame_refs[GOLDEN_FRAME - LAST_FRAME].idx; - -#if CONFIG_EXT_REFS - int lst2_buf_idx = cm->frame_refs[LAST2_FRAME - LAST_FRAME].idx; - int lst3_buf_idx = cm->frame_refs[LAST3_FRAME - LAST_FRAME].idx; - int bwd_buf_idx = cm->frame_refs[BWDREF_FRAME - LAST_FRAME].idx; - int alt2_buf_idx = cm->frame_refs[ALTREF2_FRAME - LAST_FRAME].idx; -#endif - - if (alt_buf_idx >= 0) - cm->cur_frame->alt_frame_offset = - cm->buffer_pool->frame_bufs[alt_buf_idx].cur_frame_offset; - - if (lst_buf_idx >= 0) - cm->cur_frame->lst_frame_offset = - cm->buffer_pool->frame_bufs[lst_buf_idx].cur_frame_offset; - - if (gld_buf_idx >= 0) - cm->cur_frame->gld_frame_offset = - cm->buffer_pool->frame_bufs[gld_buf_idx].cur_frame_offset; - -#if CONFIG_EXT_REFS - if (lst2_buf_idx >= 0) - cm->cur_frame->lst2_frame_offset = - cm->buffer_pool->frame_bufs[lst2_buf_idx].cur_frame_offset; - - if (lst3_buf_idx >= 0) - cm->cur_frame->lst3_frame_offset = - cm->buffer_pool->frame_bufs[lst3_buf_idx].cur_frame_offset; - - if (bwd_buf_idx >= 0) - cm->cur_frame->bwd_frame_offset = - cm->buffer_pool->frame_bufs[bwd_buf_idx].cur_frame_offset; - - if (alt2_buf_idx >= 0) - cm->cur_frame->alt2_frame_offset = - 
cm->buffer_pool->frame_bufs[alt2_buf_idx].cur_frame_offset; -#endif + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; + if (buf_idx >= 0) + cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] = + cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + } } -#if CONFIG_FRAME_SIGN_BIAS void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; - if (buf_idx != INVALID_IDX) { + if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) { const int ref_frame_offset = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; cm->ref_frame_sign_bias[ref_frame] = - (ref_frame_offset <= (int)cm->frame_offset) ? 0 : 1; + (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0) + ? 0 + : 1; } else { cm->ref_frame_sign_bias[ref_frame] = 0; } } } -#endif // CONFIG_FRAME_SIGN_BIAS -#endif // CONFIG_FRAME_MARKER - -#if CONFIG_MFMV -// Although we assign 32 bit integers, all the values are strictly under 14 -// bits. -static int div_mult[32] = { - 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, - 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, - 744, 712, 682, 655, 630, 606, 585, 564, 546, 528, -}; - -// TODO(jingning): Consider the use of lookup table for (num / den) -// altogether. 
-static void get_mv_projection(MV *output, MV ref, int num, int den) { - output->row = - (int16_t)(ROUND_POWER_OF_TWO(ref.row * num * div_mult[den], 14)); - output->col = - (int16_t)(ROUND_POWER_OF_TWO(ref.col * num * div_mult[den], 14)); -} #define MAX_OFFSET_WIDTH 64 -#define MAX_OFFSET_HEIGHT 32 +#define MAX_OFFSET_HEIGHT 0 static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, int blk_col, MV mv, int sign_bias) { - if ((abs(mv.row) >> 3) > MAX_OFFSET_HEIGHT || - (abs(mv.col) >> 3) > MAX_OFFSET_WIDTH) - return 0; + const int base_blk_row = (blk_row >> 3) << 3; + const int base_blk_col = (blk_col >> 3) << 3; + + const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2)) + : -((-mv.row) >> (4 + MI_SIZE_LOG2)); + + const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2)) + : -((-mv.col) >> (4 + MI_SIZE_LOG2)); - int row = (sign_bias == 1) ? blk_row - (mv.row >> (3 + MI_SIZE_LOG2)) - : blk_row + (mv.row >> (3 + MI_SIZE_LOG2)); - int col = (sign_bias == 1) ? blk_col - (mv.col >> (3 + MI_SIZE_LOG2)) - : blk_col + (mv.col >> (3 + MI_SIZE_LOG2)); + int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; + int col = (sign_bias == 1) ? 
blk_col - col_offset : blk_col + col_offset; - if (row < 0 || row >= cm->mi_rows || col < 0 || col >= cm->mi_cols) return 0; + if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 || + col >= (cm->mi_cols >> 1)) + return 0; + + if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || + row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) || + col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) || + col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3)) + return 0; *mi_r = row; *mi_c = col; @@ -1576,504 +955,209 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, return 1; } -static uint32_t mv_sign_reverse(int_mv ref) { - int_mv this_mv; - this_mv.as_mv.row = -ref.as_mv.row; - this_mv.as_mv.col = -ref.as_mv.col; +static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame, + int dir) { + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int ref_offset[REF_FRAMES] = { 0 }; - return this_mv.as_int; -} + (void)dir; -void av1_setup_motion_field(AV1_COMMON *cm) { + int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx; + if (ref_frame_idx < 0) return 0; + + if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0; + + if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows || + cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols) + return 0; + + int ref_frame_index = + cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset; + unsigned int *ref_rf_idx = + &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0]; int cur_frame_index = cm->cur_frame->cur_frame_offset; - int lst_frame_index = 0, alt_frame_index = 0, gld_frame_index = 0; -#if CONFIG_EXT_REFS - int lst2_frame_index = 0, lst3_frame_index = 0; - int bwd_frame_index = 0, alt2_frame_index = 0; -#endif - TPL_MV_REF *tpl_mvs_base = cm->cur_frame->tpl_mvs; - - for (int ref_frame = 0; ref_frame < INTER_REFS_PER_FRAME; ++ref_frame) { - int size = (cm->mi_rows + 16) * cm->mi_stride; - for (int idx = 0; idx < size; ++idx) { - for 
(int i = 0; i < MFMV_STACK_SIZE; ++i) - tpl_mvs_base[idx].mfmv[ref_frame][i].as_int = INVALID_MV; - } + int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index); + + for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { + ref_offset[rf] = + get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]); } - int alt_buf_idx = cm->frame_refs[ALTREF_FRAME - LAST_FRAME].idx; - int lst_buf_idx = cm->frame_refs[LAST_FRAME - LAST_FRAME].idx; - int gld_buf_idx = cm->frame_refs[GOLDEN_FRAME - LAST_FRAME].idx; -#if CONFIG_EXT_REFS - int lst2_buf_idx = cm->frame_refs[LAST2_FRAME - LAST_FRAME].idx; - int lst3_buf_idx = cm->frame_refs[LAST3_FRAME - LAST_FRAME].idx; - int bwd_buf_idx = cm->frame_refs[BWDREF_FRAME - LAST_FRAME].idx; - int alt2_buf_idx = cm->frame_refs[ALTREF2_FRAME - LAST_FRAME].idx; -#endif - - if (alt_buf_idx >= 0) - alt_frame_index = cm->buffer_pool->frame_bufs[alt_buf_idx].cur_frame_offset; - - if (lst_buf_idx >= 0) - lst_frame_index = cm->buffer_pool->frame_bufs[lst_buf_idx].cur_frame_offset; - - if (gld_buf_idx >= 0) - gld_frame_index = cm->buffer_pool->frame_bufs[gld_buf_idx].cur_frame_offset; - -#if CONFIG_EXT_REFS - if (lst2_buf_idx >= 0) - lst2_frame_index = - cm->buffer_pool->frame_bufs[lst2_buf_idx].cur_frame_offset; - - if (lst3_buf_idx >= 0) - lst3_frame_index = - cm->buffer_pool->frame_bufs[lst3_buf_idx].cur_frame_offset; - - if (bwd_buf_idx >= 0) - bwd_frame_index = cm->buffer_pool->frame_bufs[bwd_buf_idx].cur_frame_offset; - - if (alt2_buf_idx >= 0) - alt2_frame_index = - cm->buffer_pool->frame_bufs[alt2_buf_idx].cur_frame_offset; -#endif - - if (alt_frame_index < cur_frame_index) return; - - // ====================== - // Process last frame - // ====================== - if (lst_buf_idx >= 0) { - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[lst_buf_idx].mvs; - const int lst_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].lst_frame_offset; - const int alt_frame_idx = - 
cm->buffer_pool->frame_bufs[lst_buf_idx].alt_frame_offset; - const int gld_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].gld_frame_offset; -#if CONFIG_EXT_REFS - const int lst2_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].lst2_frame_offset; - const int lst3_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].lst3_frame_offset; - const int bwd_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].bwd_frame_offset; - const int alt2_frame_idx = - cm->buffer_pool->frame_bufs[lst_buf_idx].alt2_frame_offset; -#endif - - int alt_offset = AOMMAX(1, alt_frame_idx - lst_frame_index); - int lst_offset = AOMMAX(1, lst_frame_index - lst_frame_idx); - int gld_offset = AOMMAX(1, lst_frame_index - gld_frame_idx); - int cur_to_lst = cur_frame_index - lst_frame_index; - int cur_to_alt = alt_frame_index - cur_frame_index; - int cur_to_gld = cur_frame_index - gld_frame_index; - -#if CONFIG_EXT_REFS - int bwd_offset = AOMMAX(1, bwd_frame_idx - lst_frame_index); - int alt2_offset = AOMMAX(1, alt2_frame_idx - lst_frame_index); - int lst2_offset = AOMMAX(1, lst_frame_index - lst2_frame_idx); - int lst3_offset = AOMMAX(1, lst_frame_index - lst3_frame_idx); - int cur_to_lst2 = cur_frame_index - lst2_frame_index; - int cur_to_lst3 = cur_frame_index - lst3_frame_index; - int cur_to_bwd = bwd_frame_index - cur_frame_index; - int cur_to_alt2 = alt2_frame_index - cur_frame_index; -#endif - - const int is_lst_overlay = (alt_frame_idx == gld_frame_index); - // clang-format off - const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = { -#if CONFIG_EXT_REFS - 0, lst_offset, lst2_offset, lst3_offset, gld_offset, - bwd_offset, alt2_offset, alt_offset -#else - 0, lst_offset, gld_offset, alt_offset -#endif - }; - // clang-format on + if (dir == 2) ref_to_cur = -ref_to_cur; + + MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs; + const int mvs_rows = (cm->mi_rows + 1) >> 1; + const int mvs_cols = (cm->mi_cols + 1) >> 1; - for (int blk_row = 0; blk_row < 
cm->mi_rows && !is_lst_overlay; ++blk_row) { - for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) { - MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col]; - MV fwd_mv = mv_ref->mv[0].as_mv; - MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0], - mv_ref->ref_frame[1] }; + for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { + for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { + MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col]; + MV fwd_mv = mv_ref->mv.as_mv; - // Derive motion vectors toward last reference frame. - if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) { - int_mv this_mv; - int mi_r, mi_c; + if (mv_ref->ref_frame > INTRA_FRAME) { + int_mv this_mv; + int mi_r, mi_c; + const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; - const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]]; + int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && + ref_frame_offset > 0 && + abs(ref_to_cur) <= MAX_FRAME_DISTANCE; - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst, + if (pos_valid) { + get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur, ref_frame_offset); - int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, - this_mv.as_mv, 1); - - if (pos_valid) { - int mi_offset = mi_r * cm->mi_stride + mi_c; - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST_FRAME)][0].as_int = - this_mv.as_int; - -#if CONFIG_EXT_REFS - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST2_FRAME)][0].as_int = - this_mv.as_int; - - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST3_FRAME)][0].as_int = - this_mv.as_int; -#endif - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)] - [0].as_int = this_mv.as_int; - } + pos_valid = get_block_position(cm, &mi_r, &mi_c, 
blk_row, blk_col, + this_mv.as_mv, dir >> 1); } - for (int idx = 0; idx < 2; ++idx) { - if (ref_frame[idx] <= GOLDEN_FRAME) continue; - - int_mv this_mv; - int mi_r, mi_c; - fwd_mv = mv_ref->mv[idx].as_mv; + if (pos_valid) { + int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; - const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[idx]]; - - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst, - ref_frame_offset); - int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, - this_mv.as_mv, 0); - - if (pos_valid) { - int mi_offset = mi_r * cm->mi_stride + mi_c; - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(ALTREF_FRAME)] - [0].as_int = this_mv.as_int; - -#if CONFIG_EXT_REFS - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(BWDREF_FRAME)] - [0].as_int = this_mv.as_int; - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt2, - ref_frame_offset); - tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(ALTREF2_FRAME)] - [0].as_int = this_mv.as_int; -#endif - } + tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; + tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; + tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset; } } } } - // ======================= - // Process ARF frame - // ======================= - if (alt_buf_idx >= 0) { - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[alt_buf_idx].mvs; - const int lst_frame_idx = - cm->buffer_pool->frame_bufs[alt_buf_idx].lst_frame_offset; - const int gld_frame_idx = - cm->buffer_pool->frame_bufs[alt_buf_idx].gld_frame_offset; -#if CONFIG_EXT_REFS - const int lst2_frame_idx = - cm->buffer_pool->frame_bufs[alt_buf_idx].lst2_frame_offset; - const int lst3_frame_idx = - cm->buffer_pool->frame_bufs[alt_buf_idx].lst3_frame_offset; - const int bwd_frame_idx = - cm->buffer_pool->frame_bufs[alt_buf_idx].bwd_frame_offset; - const int alt2_frame_idx = - 
cm->buffer_pool->frame_bufs[alt_buf_idx].alt2_frame_offset; -#endif - - int lst_offset = AOMMAX(1, alt_frame_index - lst_frame_idx); - int gld_offset = AOMMAX(1, alt_frame_index - gld_frame_idx); - int cur_to_alt = alt_frame_index - cur_frame_index; - int cur_to_lst = cur_frame_index - lst_frame_index; - int cur_to_gld = cur_frame_index - gld_frame_index; -#if CONFIG_EXT_REFS - int bwd_offset = AOMMAX(1, alt_frame_index - bwd_frame_idx); - int alt2_offset = AOMMAX(1, alt_frame_index - alt2_frame_idx); - int lst2_offset = AOMMAX(1, alt_frame_index - lst2_frame_idx); - int lst3_offset = AOMMAX(1, alt_frame_index - lst3_frame_idx); - int cur_to_lst2 = cur_frame_index - lst2_frame_index; - int cur_to_lst3 = cur_frame_index - lst3_frame_index; - int cur_to_bwd = bwd_frame_index - cur_frame_index; - int cur_to_alt2 = alt2_frame_index - cur_frame_index; -#endif - const int ref_stamp = FWD_RF_OFFSET(ALTREF_FRAME); - // clang-format off - const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = { -#if CONFIG_EXT_REFS - 0, lst_offset, lst2_offset, lst3_offset, gld_offset, - bwd_offset, alt2_offset, 0, -#else - 0, lst_offset, gld_offset, 0, -#endif - }; - // clang-format on + return 1; +} - for (int blk_row = 0; blk_row < cm->mi_rows; ++blk_row) { - for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) { - MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col]; - MV fwd_mv = mv_ref->mv[0].as_mv; - MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0], - mv_ref->ref_frame[1] }; +void av1_setup_motion_field(AV1_COMMON *cm) { + memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); + if (!cm->seq_params.enable_order_hint) return; + + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); + for (int idx = 0; idx < size; ++idx) { + tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; + tpl_mvs_base[idx].ref_frame_offset = 0; + } - const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]]; + const int 
cur_order_hint = cm->cur_frame->cur_frame_offset; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) { - int_mv this_mv; - int mi_r, mi_c; + int ref_buf_idx[INTER_REFS_PER_FRAME]; + int ref_order_hint[INTER_REFS_PER_FRAME]; - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt, - ref_frame_offset); - int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, - this_mv.as_mv, 0); - - if (pos_valid) { - int mi_offset = mi_r * cm->mi_stride + mi_c; - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(ALTREF_FRAME)][ref_stamp] - .as_int = mv_sign_reverse(this_mv); - - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - -#if CONFIG_EXT_REFS - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(BWDREF_FRAME)][ref_stamp] - .as_int = mv_sign_reverse(this_mv); - - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt2, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(ALTREF2_FRAME)][ref_stamp] - .as_int = mv_sign_reverse(this_mv); - - if (ref_frame[0] >= LAST2_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST2_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - } + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const int ref_idx = ref_frame - LAST_FRAME; + const int buf_idx = cm->frame_refs[ref_idx].idx; + int order_hint = 0; - if (ref_frame[0] >= LAST3_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST3_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - } -#endif - if (ref_frame[0] >= GOLDEN_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld, - ref_frame_offset); 
- tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - } - } - } - } - } - } + if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset; -// ========================================== -// Process BWD reference frame -// ========================================== -#if CONFIG_EXT_REFS - if (bwd_buf_idx >= 0) { - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[bwd_buf_idx].mvs; - const int lst_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].lst_frame_offset; - const int gld_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].gld_frame_offset; - const int lst2_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].lst2_frame_offset; - const int lst3_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].lst3_frame_offset; - const int bwd_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].bwd_frame_offset; - const int alt2_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].alt2_frame_offset; - const int alt_frame_idx = - cm->buffer_pool->frame_bufs[bwd_buf_idx].alt_frame_offset; - - int lst_offset = AOMMAX(1, bwd_frame_index - lst_frame_idx); - int gld_offset = AOMMAX(1, bwd_frame_index - gld_frame_idx); - int cur_to_lst = cur_frame_index - lst_frame_index; - - int lst2_offset = AOMMAX(1, bwd_frame_index - lst2_frame_idx); - int lst3_offset = AOMMAX(1, bwd_frame_index - lst3_frame_idx); - int bwd_offset = AOMMAX(1, bwd_frame_idx - bwd_frame_index); - int alt2_offset = AOMMAX(1, alt2_frame_idx - bwd_frame_index); - int alt_offset = AOMMAX(1, alt_frame_idx - bwd_frame_index); - int cur_to_lst2 = cur_frame_index - lst2_frame_index; - int cur_to_lst3 = cur_frame_index - lst3_frame_index; - int cur_to_gld = cur_frame_index - gld_frame_index; - int cur_to_bwd = bwd_frame_index - cur_frame_index; - - const int ref_stamp = FWD_RF_OFFSET(BWDREF_FRAME); - const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = { - 0, lst_offset, lst2_offset, lst3_offset, - gld_offset, bwd_offset, alt2_offset, 
alt_offset, - }; + ref_buf_idx[ref_idx] = buf_idx; + ref_order_hint[ref_idx] = order_hint; - for (int blk_row = 0; blk_row < cm->mi_rows; ++blk_row) { - for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) { - MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col]; - MV fwd_mv = mv_ref->mv[0].as_mv; - MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0], - mv_ref->ref_frame[1] }; + if (get_relative_dist(cm, order_hint, cur_order_hint) > 0) + cm->ref_frame_side[ref_frame] = 1; + else if (order_hint == cur_order_hint) + cm->ref_frame_side[ref_frame] = -1; + } - if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) { - const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]]; - int_mv this_mv; - int mi_r, mi_c; + int ref_stamp = MFMV_STACK_SIZE - 1; - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd, - ref_frame_offset); - int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, - this_mv.as_mv, 0); - - if (pos_valid) { - int mi_offset = mi_r * cm->mi_stride + mi_c; - - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(BWDREF_FRAME)][ref_stamp] - .as_int = mv_sign_reverse(this_mv); - - // Project the motion vector onto last reference frame - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - - if (ref_frame[0] >= LAST2_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST2_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - } + if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) { + const int alt_of_lst_order_hint = + frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]] + .ref_frame_offset[ALTREF_FRAME - LAST_FRAME]; - if (ref_frame[0] >= LAST3_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(LAST3_FRAME)][ref_stamp] - .as_int = 
this_mv.as_int; - } + const int is_lst_overlay = + (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); + if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); + --ref_stamp; + } - if (ref_frame[0] >= GOLDEN_FRAME) { - get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld, - ref_frame_offset); - tpl_mvs_base[mi_offset] - .mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)][ref_stamp] - .as_int = this_mv.as_int; - } - } - } - } - } + if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; } -#endif + + if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME], + cur_order_hint) > 0 && + ref_stamp >= 0) + if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; + + if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0) + if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp; } -#endif // CONFIG_MFMV -#if CONFIG_WARPED_MOTION -#if WARPED_MOTION_SORT_SAMPLES static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref, - int *pts_mv, int global_offset_r, - int global_offset_c, int row_offset, - int sign_r, int col_offset, int sign_c) { + int row_offset, int sign_r, int col_offset, + int sign_c) { int bw = block_size_wide[mbmi->sb_type]; int bh = block_size_high[mbmi->sb_type]; - int cr_offset = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; - int cc_offset = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; - int x = cc_offset + global_offset_c; - int y = cr_offset + global_offset_r; + int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; + int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; pts[0] = (x * 8); pts[1] = (y * 8); pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col; pts_inref[1] 
= (y * 8) + mbmi->mv[0].as_mv.row; - pts_mv[0] = mbmi->mv[0].as_mv.col; - pts_mv[1] = mbmi->mv[0].as_mv.row; } -// Only sort pts and pts_inref, and pts_mv is not sorted. -#define TRIM_THR 16 -int sortSamples(int *pts_mv, MV *mv, int *pts, int *pts_inref, int len) { +// Select samples according to the motion vector difference. +int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int thresh = clamp(AOMMAX(bw, bh), 16, 112); int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 }; - int i, j, k; - int ret = len; - - for (i = 0; i < len; ++i) - pts_mvd[i] = - abs(pts_mv[2 * i] - mv->col) + abs(pts_mv[2 * i + 1] - mv->row); - - for (i = 1; i <= len - 1; ++i) { - for (j = 0; j < i; ++j) { - if (pts_mvd[j] > pts_mvd[i]) { - int temp, tempi, tempj, ptempi, ptempj; - - temp = pts_mvd[i]; - tempi = pts[2 * i]; - tempj = pts[2 * i + 1]; - ptempi = pts_inref[2 * i]; - ptempj = pts_inref[2 * i + 1]; - - for (k = i; k > j; k--) { - pts_mvd[k] = pts_mvd[k - 1]; - pts[2 * k] = pts[2 * (k - 1)]; - pts[2 * k + 1] = pts[2 * (k - 1) + 1]; - pts_inref[2 * k] = pts_inref[2 * (k - 1)]; - pts_inref[2 * k + 1] = pts_inref[2 * (k - 1) + 1]; - } - - pts_mvd[j] = temp; - pts[2 * j] = tempi; - pts[2 * j + 1] = tempj; - pts_inref[2 * j] = ptempi; - pts_inref[2 * j + 1] = ptempj; - break; - } - } + int i, j, k, l = len; + int ret = 0; + assert(len <= LEAST_SQUARES_SAMPLES_MAX); + + // Obtain the motion vector difference. + for (i = 0; i < len; ++i) { + pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) + + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row); + + if (pts_mvd[i] > thresh) + pts_mvd[i] = -1; + else + ret++; } - for (i = len - 1; i >= 1; i--) { - int low = (i == 1) ? 1 : AOMMAX((pts_mvd[i - 1] - pts_mvd[0]) / (i - 1), 1); - - if ((pts_mvd[i] - pts_mvd[i - 1]) >= TRIM_THR * low) ret = i; + // Keep at least 1 sample. 
+ if (!ret) return 1; + + i = 0; + j = l - 1; + for (k = 0; k < l - ret; k++) { + while (pts_mvd[i] != -1) i++; + while (pts_mvd[j] == -1) j--; + assert(i != j); + if (i > j) break; + + // Replace the discarded samples; + pts_mvd[i] = pts_mvd[j]; + pts[2 * i] = pts[2 * j]; + pts[2 * i + 1] = pts[2 * j + 1]; + pts_inref[2 * i] = pts_inref[2 * j]; + pts_inref[2 * i + 1] = pts_inref[2 * j + 1]; + i++; + j--; } - if (ret > LEAST_SQUARES_SAMPLES_MAX) ret = LEAST_SQUARES_SAMPLES_MAX; return ret; } // Note: Samples returned are at 1/8-pel precision +// Sample are the neighbor block center point's coordinates relative to the +// left-top pixel of current block. int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref, int *pts_mv) { - MB_MODE_INFO *const mbmi0 = &(xd->mi[0]->mbmi); + int *pts, int *pts_inref) { + MB_MODE_INFO *const mbmi0 = xd->mi[0]; int ref_frame = mbmi0->ref_frame[0]; int up_available = xd->up_available; int left_available = xd->left_available; int i, mi_step = 1, np = 0; - int global_offset_c = mi_col * MI_SIZE; - int global_offset_r = mi_row * MI_SIZE; const TileInfo *const tile = &xd->tile; int do_tl = 1; @@ -2082,8 +1166,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // scan the nearest above rows if (up_available) { int mi_row_offset = -1; - MODE_INFO *mi = xd->mi[mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride]; uint8_t n8_w = mi_size_wide[mbmi->sb_type]; if (xd->n8_w <= n8_w) { @@ -2094,42 +1177,38 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, if (col_offset + n8_w > xd->n8_w) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, 0, -1, col_offset, 1); + record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); pts += 2; pts_inref += 
2; - pts_mv += 2; np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } else { // Handle "current block width > above block width" case. for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) { int mi_col_offset = i; - mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - mbmi = &mi->mbmi; + mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; n8_w = mi_size_wide[mbmi->sb_type]; mi_step = AOMMIN(xd->n8_w, n8_w); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, 0, -1, i, 1); + record_samples(mbmi, pts, pts_inref, 0, -1, i, 1); pts += 2; pts_inref += 2; - pts_mv += 2; np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); + assert(np <= LEAST_SQUARES_SAMPLES_MAX); // scan the nearest left columns if (left_available) { int mi_col_offset = -1; - MODE_INFO *mi = xd->mi[mi_col_offset]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; uint8_t n8_h = mi_size_high[mbmi->sb_type]; if (xd->n8_h <= n8_h) { @@ -2139,182 +1218,329 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, if (row_offset < 0) do_tl = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, row_offset, 1, 0, -1); + record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1); pts += 2; pts_inref += 2; - pts_mv += 2; np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } else { // Handle "current block height > above block height" case. 
for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) { int mi_row_offset = i; - mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - mbmi = &mi->mbmi; + mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; n8_h = mi_size_high[mbmi->sb_type]; mi_step = AOMMIN(xd->n8_h, n8_h); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, i, 1, 0, -1); + record_samples(mbmi, pts, pts_inref, i, 1, 0, -1); pts += 2; pts_inref += 2; - pts_mv += 2; np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); + assert(np <= LEAST_SQUARES_SAMPLES_MAX); // Top-left block if (do_tl && left_available && up_available) { int mi_row_offset = -1; int mi_col_offset = -1; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, 0, -1, 0, -1); + record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); pts += 2; pts_inref += 2; - pts_mv += 2; np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); + assert(np <= LEAST_SQUARES_SAMPLES_MAX); // Top-right block if (do_tr && has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) { POSITION trb_pos = { -1, xd->n8_w }; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &trb_pos)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) { int mi_row_offset = -1; int mi_col_offset = xd->n8_w; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; if 
(mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r, - global_offset_c, 0, -1, xd->n8_w, 1); + record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1); np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); + assert(np <= LEAST_SQUARES_SAMPLES_MAX); return np; } -#else -void calc_projection_samples(MB_MODE_INFO *const mbmi, int x, int y, - int *pts_inref) { - pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col; - pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row; + +void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { + cm->is_skip_mode_allowed = 0; + cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX; + + if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) || + cm->reference_mode == SINGLE_REFERENCE) + return; + + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int cur_frame_offset = cm->frame_offset; + int ref_frame_offset[2] = { -1, INT_MAX }; + int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; + + // Identify the nearest forward and backward references. 
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const int buf_idx = cm->frame_refs[i].idx; + if (buf_idx == INVALID_IDX) continue; + + const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; + if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) { + // Forward reference + if (ref_frame_offset[0] == -1 || + get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) { + ref_frame_offset[0] = ref_offset; + ref_idx[0] = i; + } + } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) { + // Backward reference + if (ref_frame_offset[1] == INT_MAX || + get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) { + ref_frame_offset[1] = ref_offset; + ref_idx[1] = i; + } + } + } + + if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { + // == Bi-directional prediction == + cm->is_skip_mode_allowed = 1; + cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { + // == Forward prediction only == + // Identify the second nearest forward reference. 
+ ref_frame_offset[1] = -1; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const int buf_idx = cm->frame_refs[i].idx; + if (buf_idx == INVALID_IDX) continue; + + const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; + if ((ref_frame_offset[0] != -1 && + get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) && + (ref_frame_offset[1] == -1 || + get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) { + // Second closest forward reference + ref_frame_offset[1] = ref_offset; + ref_idx[1] = i; + } + } + if (ref_frame_offset[1] != -1) { + cm->is_skip_mode_allowed = 1; + cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } + } } -// Note: Samples returned are at 1/8-pel precision -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref) { - MB_MODE_INFO *const mbmi0 = &(xd->mi[0]->mbmi); - int ref_frame = mbmi0->ref_frame[0]; - int up_available = xd->up_available; - int left_available = xd->left_available; - int i, mi_step, np = 0; - int global_offset_c = mi_col * MI_SIZE; - int global_offset_r = mi_row * MI_SIZE; +typedef struct { + int map_idx; // frame map index + int buf_idx; // frame buffer index + int sort_idx; // index based on the offset to be used for sorting +} REF_FRAME_INFO; + +static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { + const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; + const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; + + if (info_a->sort_idx < info_b->sort_idx) return -1; + if (info_a->sort_idx > info_b->sort_idx) return 1; + return (info_a->map_idx < info_b->map_idx) + ? -1 + : ((info_a->map_idx > info_b->map_idx) ? 
1 : 0); +} - // scan the above row - if (up_available) { - for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) { - int mi_row_offset = -1; - int mi_col_offset = i; +static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, + REF_FRAME_INFO *ref_info) { + assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME); - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + const int buf_idx = ref_info->buf_idx; - mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]); + cm->frame_refs[frame_idx].idx = buf_idx; + cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; + cm->frame_refs[frame_idx].map_idx = ref_info->map_idx; +} - if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - int bw = block_size_wide[mbmi->sb_type]; - int bh = block_size_high[mbmi->sb_type]; - int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1; - int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1; - int x = cc_offset + global_offset_c; - int y = cr_offset + global_offset_r; - - pts[0] = (x * 8); - pts[1] = (y * 8); - calc_projection_samples(mbmi, x, y, pts_inref); - pts += 2; - pts_inref += 2; - np++; - if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; - } +void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, + int gld_map_idx) { + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + + int lst_frame_sort_idx = -1; + int gld_frame_sort_idx = -1; + + assert(cm->seq_params.enable_order_hint); + assert(cm->seq_params.order_hint_bits_minus_1 >= 0); + const int cur_frame_offset = (int)cm->frame_offset; + const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1; + + REF_FRAME_INFO ref_frame_info[REF_FRAMES]; + int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; + + for (int i = 0; i < REF_FRAMES; ++i) { + const int map_idx = i; + + ref_frame_info[i].map_idx = map_idx; + 
ref_frame_info[i].sort_idx = -1; + + const int buf_idx = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf_idx = buf_idx; + + if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue; + // TODO(zoeliu@google.com): To verify the checking on ref_count. + if (frame_bufs[buf_idx].ref_count <= 0) continue; + + const int offset = (int)frame_bufs[buf_idx].cur_frame_offset; + ref_frame_info[i].sort_idx = + (offset == -1) ? -1 + : cur_frame_sort_idx + + get_relative_dist(cm, offset, cur_frame_offset); + assert(ref_frame_info[i].sort_idx >= -1); + + if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; + if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx; + } + + // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference + // frames. + if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as LAST"); + } + if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as GOLDEN"); + } + + // Sort ref frames based on their frame_offset values. + qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO), + compare_ref_frame_info); + + // Identify forward and backward reference frames. 
+ // Forward reference: offset < cur_frame_offset + // Backward reference: offset >= cur_frame_offset + int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; + + for (int i = 0; i < REF_FRAMES; i++) { + if (ref_frame_info[i].sort_idx == -1) { + fwd_start_idx++; + continue; + } + + if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) { + fwd_end_idx = i - 1; + break; } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); - // scan the left column - if (left_available) { - for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) { - int mi_row_offset = i; - int mi_col_offset = -1; + int bwd_start_idx = fwd_end_idx + 1; + int bwd_end_idx = REF_FRAMES - 1; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + // === Backward Reference Frames === - mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]); + // == ALTREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_end_idx]); + ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; + bwd_end_idx--; + } - if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - int bw = block_size_wide[mbmi->sb_type]; - int bh = block_size_high[mbmi->sb_type]; - int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1; - int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1; - int x = cc_offset + global_offset_c; - int y = cr_offset + global_offset_r; - - pts[0] = (x * 8); - pts[1] = (y * 8); - calc_projection_samples(mbmi, x, y, pts_inref); - pts += 2; - pts_inref += 2; - np++; - if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; - } + // == BWDREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; + bwd_start_idx++; + } + + // == ALTREF2_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME, + 
&ref_frame_info[bwd_start_idx]); + ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; + } + + // === Forward Reference Frames === + + for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { + // == LAST_FRAME == + if (ref_frame_info[i].map_idx == lst_map_idx) { + set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]); + ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; + } + + // == GOLDEN_FRAME == + if (ref_frame_info[i].map_idx == gld_map_idx) { + set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]); + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; } } - assert(2 * np <= SAMPLES_ARRAY_SIZE); - if (left_available && up_available) { - int mi_row_offset = -1; - int mi_col_offset = -1; + assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 && + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1); - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; + // == LAST2_FRAME == + // == LAST3_FRAME == + // == BWDREF_FRAME == + // == ALTREF2_FRAME == + // == ALTREF_FRAME == - if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - int bw = block_size_wide[mbmi->sb_type]; - int bh = block_size_high[mbmi->sb_type]; - int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1; - int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1; - int x = cc_offset + global_offset_c; - int y = cr_offset + global_offset_r; - - pts[0] = (x * 8); - pts[1] = (y * 8); - calc_projection_samples(mbmi, x, y, pts_inref); - np++; + // Set up the reference frames in the anti-chronological order. 
+ static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = { + LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME + }; + + int ref_idx; + for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + + while (fwd_start_idx <= fwd_end_idx && + (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx || + ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) { + fwd_end_idx--; } + if (fwd_start_idx > fwd_end_idx) break; + + set_ref_frame_info(cm, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_end_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + + fwd_end_idx--; } - assert(2 * np <= SAMPLES_ARRAY_SIZE); - return np; + // Assign all the remaining frame(s), if any, to the earliest reference frame. + for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + set_ref_frame_info(cm, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_start_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + assert(ref_flag_list[i] == 1); + } } -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h index 348887e43..716b4a247 100644 --- a/third_party/aom/av1/common/mvref_common.h +++ b/third_party/aom/av1/common/mvref_common.h @@ -18,103 +18,36 @@ extern "C" { #endif -#define MVREF_NEIGHBOURS 9 -#define MVREF_ROWS 3 -#define MVREF_COLS 4 +#define MVREF_ROW_COLS 3 + +// Set the upper limit of the motion vector component magnitude. +// This would make a motion vector fit in 26 bits. Plus 3 bits for the +// reference frame index. A tuple of motion vector can hence be stored within +// 32 bit range for efficient load/store operations. 
+#define REFMVS_LIMIT ((1 << 12) - 1) typedef struct position { int row; int col; } POSITION; -typedef enum { - BOTH_ZERO = 0, - ZERO_PLUS_PREDICTED = 1, - BOTH_PREDICTED = 2, - NEW_PLUS_NON_INTRA = 3, - BOTH_NEW = 4, - INTRA_PLUS_NON_INTRA = 5, - BOTH_INTRA = 6, - INVALID_CASE = 9 -} motion_vector_context; - -// This is used to figure out a context for the ref blocks. The code flattens -// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by -// adding 9 for each intra block, 3 for each zero mv and 1 for each new -// motion vector. This single number is then converted into a context -// with a single lookup ( counter_to_context ). -static const int mode_2_counter[] = { - 9, // DC_PRED - 9, // V_PRED - 9, // H_PRED - 9, // D45_PRED - 9, // D135_PRED - 9, // D117_PRED - 9, // D153_PRED - 9, // D207_PRED - 9, // D63_PRED - 9, // SMOOTH_PRED -#if CONFIG_SMOOTH_HV - 9, // SMOOTH_V_PRED - 9, // SMOOTH_H_PRED -#endif // CONFIG_SMOOTH_HV - 9, // TM_PRED - 0, // NEARESTMV - 0, // NEARMV - 3, // ZEROMV - 1, // NEWMV -#if CONFIG_COMPOUND_SINGLEREF - 0, // SR_NEAREST_NEARMV - // 1, // SR_NEAREST_NEWMV - 1, // SR_NEAR_NEWMV - 3, // SR_ZERO_NEWMV - 1, // SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - 0, // NEAREST_NEARESTMV - 0, // NEAR_NEARMV - 1, // NEAREST_NEWMV - 1, // NEW_NEARESTMV - 1, // NEAR_NEWMV - 1, // NEW_NEARMV - 3, // ZERO_ZEROMV - 1, // NEW_NEWMV -}; +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -// There are 3^3 different combinations of 3 counts that can be either 0,1 or -// 2. However the actual count can never be greater than 2 so the highest -// counter we need is 18. 9 is an invalid counter that's never used. 
-static const int counter_to_context[19] = { - BOTH_PREDICTED, // 0 - NEW_PLUS_NON_INTRA, // 1 - BOTH_NEW, // 2 - ZERO_PLUS_PREDICTED, // 3 - NEW_PLUS_NON_INTRA, // 4 - INVALID_CASE, // 5 - BOTH_ZERO, // 6 - INVALID_CASE, // 7 - INVALID_CASE, // 8 - INTRA_PLUS_NON_INTRA, // 9 - INTRA_PLUS_NON_INTRA, // 10 - INVALID_CASE, // 11 - INTRA_PLUS_NON_INTRA, // 12 - INVALID_CASE, // 13 - INVALID_CASE, // 14 - INVALID_CASE, // 15 - INVALID_CASE, // 16 - INVALID_CASE, // 17 - BOTH_INTRA // 18 -}; +static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) { + if (!cm->seq_params.enable_order_hint) return 0; -static const int idx_n_column_to_subblock[4][2] = { - { 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 } -}; + const int bits = cm->seq_params.order_hint_bits_minus_1 + 1; -// clamp_mv_ref -#if CONFIG_EXT_PARTITION -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -#else -#define MV_BORDER (8 << 3) // Allow 8 pels in 1/8th pel units -#endif // CONFIG_EXT_PARTITION + assert(bits >= 1); + assert(a >= 0 && a < (1 << bits)); + assert(b >= 0 && b < (1 << bits)); + + int diff = a - b; + int m = 1 << (bits - 1); + diff = (diff & (m - 1)) - (diff & m); + return diff; +} static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER, @@ -125,19 +58,16 @@ static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { // This function returns either the appropriate sub block or block's mv // on whether the block_size < 8x8 and we have check_sub_blocks set. 
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, - int search_col, int block_idx) { +static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate, + int which_mv, int search_col) { (void)search_col; - (void)block_idx; - return candidate->mbmi.mv[which_mv]; + return candidate->mv[which_mv]; } -static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate, - int which_mv, int search_col, - int block_idx) { +static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate, + int which_mv, int search_col) { (void)search_col; - (void)block_idx; - return candidate->mbmi.mv[which_mv]; + return candidate->mv[which_mv]; } // Performs mv sign inversion if indicated by the reference frame combination. @@ -152,48 +82,11 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, return mv; } -#define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd) - -// This macro is used to add a motion vector mv_ref list if it isn't -// already in the list. If it's the second motion vector it will also -// skip all additional processing and jump to done! -#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done) \ - do { \ - (mv_ref_list)[(refmv_count)] = (mv); \ - CLIP_IN_ADD(&(mv_ref_list)[(refmv_count)].as_mv, (bw), (bh), (xd)); \ - if (refmv_count && (mv_ref_list)[1].as_int != (mv_ref_list)[0].as_int) { \ - (refmv_count) = 2; \ - goto Done; \ - } \ - (refmv_count) = 1; \ - } while (0) - -// If either reference frame is different, not INTRA, and they -// are different from each other scale and add the mv to our list. 
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \ - mv_ref_list, bw, bh, xd, Done) \ - do { \ - if (is_inter_block(mbmi)) { \ - if ((mbmi)->ref_frame[0] != ref_frame) \ - ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ - refmv_count, mv_ref_list, bw, bh, xd, Done); \ - if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame) \ - ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ - refmv_count, mv_ref_list, bw, bh, xd, Done); \ - } \ - } while (0) - // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, - int mi_rows, const AV1_COMMON *cm, - const POSITION *mi_pos) { -#if CONFIG_DEPENDENT_HORZTILES - const int dependent_horz_tile_flag = cm->dependent_horz_tiles; -#else + int mi_rows, const POSITION *mi_pos) { const int dependent_horz_tile_flag = 0; - (void)cm; -#endif if (dependent_horz_tile_flag && !tile->tg_horz_boundary) { return !(mi_row + mi_pos->row < 0 || mi_col + mi_pos->col < tile->mi_col_start || @@ -208,14 +101,8 @@ static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, } static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, - int mi_rows, const AV1_COMMON *cm, - int row_offset) { -#if CONFIG_DEPENDENT_HORZTILES - const int dependent_horz_tile_flag = cm->dependent_horz_tiles; -#else + int mi_rows, int row_offset) { const int dependent_horz_tile_flag = 0; - (void)cm; -#endif if (dependent_horz_tile_flag && !tile->tg_horz_boundary) return clamp(row_offset, -mi_row, mi_rows - mi_row - 1); else @@ -229,87 +116,49 @@ static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, tile->mi_col_end - mi_col - 1); } -static INLINE void lower_mv_precision(MV *mv, int allow_hp -#if CONFIG_AMVR - , - int is_integer -#endif - ) { -#if CONFIG_AMVR +static INLINE void lower_mv_precision(MV *mv, int allow_hp, int 
is_integer) { if (is_integer) { integer_mv_precision(mv); } else { -#endif if (!allow_hp) { if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1); } -#if CONFIG_AMVR } -#endif -} - -static INLINE uint8_t av1_get_pred_diff_ctx(const int_mv pred_mv, - const int_mv this_mv) { - if (abs(this_mv.as_mv.row - pred_mv.as_mv.row) <= 4 && - abs(this_mv.as_mv.col - pred_mv.as_mv.col) <= 4) - return 2; - else - return 1; -} - -static INLINE int av1_nmv_ctx(const uint8_t ref_mv_count, - const CANDIDATE_MV *ref_mv_stack, int ref, - int ref_mv_idx) { - if (ref_mv_stack[ref_mv_idx].weight >= REF_CAT_LEVEL && ref_mv_count > 0) - return ref_mv_stack[ref_mv_idx].pred_diff[ref]; - - return 0; } -#if CONFIG_EXT_COMP_REFS -static INLINE int8_t av1_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { +static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { // Single ref pred if (rf[1] <= INTRA_FRAME) return -1; // Bi-directional comp ref pred if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1; - for (int8_t ref_idx = 0; ref_idx < UNIDIR_COMP_REFS; ++ref_idx) { + for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) { if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) return ref_idx; } return -1; } -#endif // CONFIG_EXT_COMP_REFS static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { if (rf[1] > INTRA_FRAME) { -#if CONFIG_EXT_COMP_REFS - int8_t uni_comp_ref_idx = av1_uni_comp_ref_idx(rf); -#if !USE_UNI_COMP_REFS - // NOTE: uni-directional comp refs disabled - assert(uni_comp_ref_idx < 0); -#endif // !USE_UNI_COMP_REFS + const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); if (uni_comp_ref_idx >= 0) { - assert((TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < + assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < MODE_CTX_REF_FRAMES); - return TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx; + return 
REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx; } else { -#endif // CONFIG_EXT_COMP_REFS - return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) + + return REF_FRAMES + FWD_RF_OFFSET(rf[0]) + BWD_RF_OFFSET(rf[1]) * FWD_REFS; -#if CONFIG_EXT_COMP_REFS } -#endif // CONFIG_EXT_COMP_REFS } return rf[0]; } // clang-format off -static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = { -#if CONFIG_EXT_REFS +static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME }, { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, @@ -317,58 +166,51 @@ static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = { { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME }, { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME }, - { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME } - - // TODO(zoeliu): Temporarily disable uni-directional comp refs -#if CONFIG_EXT_COMP_REFS - , { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, - { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME } - // TODO(zoeliu): When ALTREF2 is enabled, we may add: - // {BWDREF_FRAME, ALTREF2_FRAME} -#endif // CONFIG_EXT_COMP_REFS -#else // !CONFIG_EXT_REFS - { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME } -#endif // CONFIG_EXT_REFS + { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + + { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + + // NOTE: Following reference frame pairs are not supported to be explicitly + // signalled, but they are possibly chosen by the use of skip_mode, + // which may use the most recent one-sided reference frame pair. 
+ { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME }, + { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME}, + { ALTREF2_FRAME, ALTREF_FRAME } }; // clang-format on static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, int8_t ref_frame_type) { - if (ref_frame_type >= TOTAL_REFS_PER_FRAME) { - rf[0] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][0]; - rf[1] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][1]; + if (ref_frame_type >= REF_FRAMES) { + rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; + rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; } else { rf[0] = ref_frame_type; rf[1] = NONE_FRAME; -#if CONFIG_INTRABC assert(ref_frame_type > NONE_FRAME); -#else - assert(ref_frame_type > INTRA_FRAME); -#endif - assert(ref_frame_type < TOTAL_REFS_PER_FRAME); } } +static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { + { 0, 1, 1, 1, 1 }, + { 1, 2, 3, 4, 4 }, + { 4, 4, 5, 6, 7 }, +}; + static INLINE int16_t av1_mode_context_analyzer( - const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf, - BLOCK_SIZE bsize, int block) { - int16_t mode_ctx = 0; - int8_t ref_frame_type = av1_ref_frame_type(rf); - - if (block >= 0) { - mode_ctx = mode_context[rf[0]] & 0x00ff; -#if !CONFIG_CB4X4 - if (block > 0 && bsize < BLOCK_8X8 && bsize > BLOCK_4X4) - mode_ctx |= (1 << SKIP_NEARESTMV_SUB8X8_OFFSET); -#else - (void)block; - (void)bsize; -#endif + const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { + const int8_t ref_frame = av1_ref_frame_type(rf); - return mode_ctx; - } + if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame]; + + const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK; + const int16_t refmv_ctx = + (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK; - return mode_context[ref_frame_type]; + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; } static INLINE uint8_t 
av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack, @@ -379,92 +221,99 @@ static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack, if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL && ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) - return 2; + return 1; if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL && ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) - return 3; + return 2; return 0; } -#if CONFIG_FRAME_MARKER void av1_setup_frame_buf_refs(AV1_COMMON *cm); -#if CONFIG_FRAME_SIGN_BIAS void av1_setup_frame_sign_bias(AV1_COMMON *cm); -#endif // CONFIG_FRAME_SIGN_BIAS -#if CONFIG_MFMV +void av1_setup_skip_mode_allowed(AV1_COMMON *cm); void av1_setup_motion_field(AV1_COMMON *cm); -#endif // CONFIG_MFMV -#endif // CONFIG_FRAME_MARKER +void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx); + +static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { + av1_zero(xd->neighbors_ref_counts); + + uint8_t *const ref_counts = xd->neighbors_ref_counts; + + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; + + // Above neighbor + if (above_in_image && is_inter_block(above_mbmi)) { + ref_counts[above_mbmi->ref_frame[0]]++; + if (has_second_ref(above_mbmi)) { + ref_counts[above_mbmi->ref_frame[1]]++; + } + } + + // Left neighbor + if (left_in_image && is_inter_block(left_mbmi)) { + ref_counts[left_mbmi->ref_frame[0]]++; + if (has_second_ref(left_mbmi)) { + ref_counts[left_mbmi->ref_frame[1]]++; + } + } +} -void av1_copy_frame_mvs(const AV1_COMMON *const cm, MODE_INFO *mi, int mi_row, - int mi_col, int x_mis, int y_mis); +void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, + int mi_row, int mi_col, int x_mis, int y_mis); -typedef void (*find_mv_refs_sync)(void *const data, int mi_row); void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, - 
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack, - int16_t *compound_mode_context, int_mv *mv_ref_list, - int mi_row, int mi_col, find_mv_refs_sync sync, - void *const data, int16_t *mode_context); + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int mi_row, int mi_col, + int16_t *mode_context); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best // score to use as ref motion vector -#if CONFIG_AMVR void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv, int is_integer); -#else -void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, - int_mv *near_mv); -#endif -void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd, - int block, int ref, int mi_row, int mi_col, - CANDIDATE_MV *ref_mv_stack, - uint8_t *ref_mv_count, int_mv *mv_list, - int_mv *nearest_mv, int_mv *near_mv); - -// This function keeps a mode count for a given MB/SB -void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int block, int mi_row, - int mi_col, int16_t *mode_context); - -#if CONFIG_WARPED_MOTION -#if WARPED_MOTION_SORT_SAMPLES -int sortSamples(int *pts_mv, MV *mv, int *pts, int *pts_inref, int len); -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref, int *pts_mv); -#else +int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize); int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, int *pts, int *pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION -#if CONFIG_INTRABC -static INLINE 
void av1_find_ref_dv(int_mv *ref_dv, int mi_row, int mi_col) { - // TODO(aconverse@google.com): Handle tiles and such +#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels +#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) +#define USE_WAVE_FRONT 1 // Use only top left area of frame for reference. + +static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, + int mib_size, int mi_row, int mi_col) { (void)mi_col; - if (mi_row < MAX_MIB_SIZE) { + if (mi_row - mib_size < tile->mi_row_start) { ref_dv->as_mv.row = 0; - ref_dv->as_mv.col = -MI_SIZE * MAX_MIB_SIZE; + ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS; } else { - ref_dv->as_mv.row = -MI_SIZE * MAX_MIB_SIZE; + ref_dv->as_mv.row = -MI_SIZE * mib_size; ref_dv->as_mv.col = 0; } + ref_dv->as_mv.row *= 8; + ref_dv->as_mv.col *= 8; } -static INLINE int is_dv_valid(const MV dv, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize) { +static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, + const MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int mib_size_log2) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int SCALE_PX_TO_MV = 8; // Disallow subpixel for now // SUBPEL_MASK is not the correct scale - if ((dv.row & (SCALE_PX_TO_MV - 1) || dv.col & (SCALE_PX_TO_MV - 1))) + if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1)))) return 0; + + const TileInfo *const tile = &xd->tile; // Is the source top-left inside the current tile? 
const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row; const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV; @@ -479,20 +328,44 @@ static INLINE int is_dv_valid(const MV dv, const TileInfo *const tile, const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col; const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV; if (src_right_edge > tile_right_edge) return 0; - // Is the bottom right within an already coded SB? - const int active_sb_top_edge = - (mi_row & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV; - const int active_sb_bottom_edge = - ((mi_row & ~MAX_MIB_MASK) + MAX_MIB_SIZE) * MI_SIZE * SCALE_PX_TO_MV; - const int active_sb_left_edge = - (mi_col & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV; - if (src_bottom_edge > active_sb_bottom_edge) return 0; - if (src_bottom_edge > active_sb_top_edge && - src_right_edge > active_sb_left_edge) + + // Special case for sub 8x8 chroma cases, to prevent referring to chroma + // pixels outside current tile. + for (int plane = 1; plane < av1_num_planes(cm); ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) { + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; + } + } + + // Is the bottom right within an already coded SB? Also consider additional + // constraints to facilitate HW decoder. 
+ const int max_mib_size = 1 << mib_size_log2; + const int active_sb_row = mi_row >> mib_size_log2; + const int active_sb64_col = (mi_col * MI_SIZE) >> 6; + const int sb_size = max_mib_size * MI_SIZE; + const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size; + const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6; + const int total_sb64_per_row = + ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1; + const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col; + const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; + if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; + +#if USE_WAVE_FRONT + const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); + const int wf_offset = gradient * (active_sb_row - src_sb_row); + if (src_sb_row > active_sb_row || + src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) return 0; +#endif + return 1; } -#endif // CONFIG_INTRABC #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/ncobmc_kernels.c b/third_party/aom/av1/common/ncobmc_kernels.c deleted file mode 100644 index af951398b..000000000 --- a/third_party/aom/av1/common/ncobmc_kernels.c +++ /dev/null @@ -1,1181 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "av1/common/ncobmc_kernels.h" - -// The kernels are only used in the experiment "ncobmc-adapt-weight", which -// blends four predictions to form a final prediction for an inter-block -// The indices of the default kernels correspond to -// 1. the index of the size of the kernels (ADAPT_OVERLAP_BLOCKS ) -// 2. the interpolation modes (NCOBMC_MODE) -// 3. the prediction the kernels applies to - -static int16_t default_ncobmc_krnl_0_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5684, 3601, 1367, 364, 1509, 2313, 4007, 5080 }, - { 3728, 2486, 827, 196, 1434, 2034, 2868, 3000 }, - { 1643, 1465, 726, 208, 890, 1386, 1242, 1293 }, - { 794, 723, 277, -237, 206, 487, 749, 896 }, - { 1176, 730, 286, 136, 281, 262, 724, 953 }, - { 2086, 1958, 783, 539, 751, 984, 1143, 1491 }, - { 2665, 2520, 1402, 1037, 939, 1223, 1593, 1937 }, - { 3451, 3172, 2350, 1291, 1069, 1916, 2672, 3223 } -}; -static int16_t default_ncobmc_krnl_0_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5541, 8123, 10470, 11908, 11291, 10382, 8800, 6446 }, - { 3338, 5536, 7249, 8080, 7671, 6428, 5280, 3900 }, - { 1732, 3087, 3842, 4325, 4034, 2929, 2318, 1800 }, - { 744, 1217, 1559, 2215, 1957, 1352, 707, 322 }, - { 685, 1082, 1792, 2300, 1975, 1350, 738, 671 }, - { 1168, 2336, 3303, 3965, 3790, 3098, 2909, 2141 }, - { 3005, 4370, 5806, 6716, 6282, 5553, 4782, 3453 }, - { 4748, 6650, 7779, 9010, 9208, 8184, 6987, 5197 } -}; -static int16_t default_ncobmc_krnl_0_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 6026, 4784, 2400, 1250, 1002, 2371, 3320, 5285 }, - { 8638, 6094, 3257, 1498, 1297, 3145, 5252, 7625 }, - { 10859, 7249, 3868, 1871, 1813, 3569, 6577, 8858 }, - { 11432, 8123, 4216, 1786, 2477, 4370, 6669, 9366 }, - { 11894, 8466, 4870, 1917, 2479, 4656, 7057, 9383 }, - { 11109, 7432, 3924, 1288, 2018, 3946, 6660, 9877 }, - { 10138, 6548, 2830, 461, 2087, 3810, 6170, 9255 }, - { 8613, 5163, 1658, 279, 1694, 3082, 4807, 7897 } -}; -static int16_t default_ncobmc_krnl_0_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { -833, -80, 
2193, 2907, 2623, 1359, 298, -383 }, - { 705, 2300, 5090, 6649, 6024, 4820, 3020, 1892 }, - { 2189, 4625, 7990, 10015, 9679, 8539, 6284, 4464 }, - { 3445, 6356, 10371, 12660, 11773, 10205, 8287, 5828 }, - { 2664, 6149, 9483, 12064, 11681, 10156, 7908, 5409 }, - { 2040, 4690, 8405, 10631, 9862, 8396, 5711, 2909 }, - { 626, 2993, 6387, 8212, 7123, 5840, 3877, 1788 }, - { -402, 1431, 4636, 5850, 4461, 3246, 1964, 122 } -}; -static int16_t default_ncobmc_krnl_0_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 1465, 553, -76, 10, 635, 756, 1843, 3144 }, - { 687, 117, -404, -300, 238, 280, 696, 1415 }, - { 49, -38, -224, -241, -135, -209, -237, 382 }, - { 48, 37, -266, -273, -235, -137, -208, -94 }, - { 555, -3, -132, -172, -98, 93, 347, 313 }, - { 887, 256, -266, -307, 304, 222, -98, 82 }, - { 1701, 816, 454, 501, 119, 230, 450, 551 }, - { 2732, 1502, 1174, 540, 323, 709, 1002, 936 } -}; -static int16_t default_ncobmc_krnl_0_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 7707, 10467, 11036, 10942, 10165, 9420, 8728, 5835 }, - { 3167, 5146, 5854, 5771, 4914, 4684, 4357, 3154 }, - { 900, 1646, 2412, 2014, 1974, 1986, 1776, 1005 }, - { -198, -179, 488, 737, 866, 784, 828, 236 }, - { -469, 32, 402, 574, 738, 495, 242, -187 }, - { 186, 1078, 1378, 1480, 1226, 1506, 1656, 745 }, - { 1531, 2525, 3139, 3367, 3535, 3519, 3095, 2171 }, - { 3152, 5453, 6176, 7089, 7310, 6879, 6483, 4916 } -}; -static int16_t default_ncobmc_krnl_0_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 7378, 3775, 1232, 453, 133, 936, 1688, 4950 }, - { 10336, 5944, 2400, 1175, 168, 954, 2894, 6436 }, - { 11176, 6145, 2051, 829, 543, 1193, 3403, 6517 }, - { 10387, 6062, 2036, 646, 507, 1077, 2998, 6029 }, - { 10768, 6277, 2226, 677, 321, 982, 2845, 6378 }, - { 10072, 5808, 1937, 873, 372, 1396, 3498, 7298 }, - { 8951, 4724, 1216, 104, 656, 1830, 3722, 7558 }, - { 7447, 3372, 468, -135, 99, 1177, 2980, 7260 } -}; -static int16_t default_ncobmc_krnl_0_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { -229, 1545, 4135, 4921, 5405, 5226, 4081, 2418 }, 
- { 2120, 5121, 8485, 9692, 11018, 10406, 8380, 5338 }, - { 4205, 8593, 12099, 13717, 13936, 13366, 11402, 8436 }, - { 6068, 10382, 14047, 15190, 15155, 14577, 12684, 10145 }, - { 5458, 10012, 13836, 15243, 15361, 14752, 12876, 9818 }, - { 5153, 9162, 13256, 14256, 14385, 13170, 11245, 8186 }, - { 4140, 8257, 11521, 12362, 12028, 10762, 9062, 6053 }, - { 2966, 5975, 8490, 8807, 8561, 7529, 5836, 3204 } -}; -static int16_t default_ncobmc_krnl_1_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 4414, 2642, 2518, 1763, 1089, 644, 355, 254, -234, 454, 399, 228, 525, 785, - 558, 919 }, - { 2989, 3035, 2685, 1476, 1872, 768, 820, -309, -107, 273, 87, 286, 499, 638, - 929, 1025 }, - { 1779, 1672, 1713, 645, 953, 151, 617, 79, -91, 185, 18, 307, 794, 681, 484, - 521 }, - { 1429, 1571, 1893, 1493, 949, 288, -232, -248, -152, 179, -50, 74, 107, 329, - 539, 822 }, - { 1444, 852, 1022, 688, 850, 205, 135, -629, 334, 96, 106, 337, 259, 300, 150, - 680 }, - { 962, 367, 329, 921, 591, -79, 146, 201, 296, 179, -190, 143, 46, -107, 215, - 853 }, - { 915, 865, 463, 169, 498, -390, 12, 202, 225, 490, 410, 483, 52, 99, 293, - 569 }, - { 728, -135, 241, 383, 288, -69, 33, 421, 523, 506, 376, 58, 143, -4, 151, - 218 }, - { 337, 65, 255, 282, 173, 267, 237, 15, 38, 114, 253, 110, 32, 227, 92, -48 }, - { 317, 115, 295, 231, 380, 435, 331, -97, 392, 393, 51, 59, 276, 41, -33, - 46 }, - { 31, -14, 86, 250, -36, -214, 210, -79, -117, 401, 193, 440, 171, 200, 8, - 112 }, - { 46, 19, 165, -6, 75, 180, 266, -98, 76, 276, 6, 29, 230, -49, 177, 168 }, - { 104, -243, -121, 295, -8, 180, 16, -44, 232, 315, 176, 10, 0, -95, -154, - 141 }, - { 248, 201, 361, 430, -20, -45, 209, -44, 222, 540, 527, 297, 240, 625, 531, - 409 }, - { 91, 37, 193, 177, 233, 210, -299, 120, 327, 214, 293, 189, 86, 123, 206, - 356 }, - { 501, 779, 295, 199, 148, 81, -31, 70, 211, 309, 300, 110, 227, 30, 242, - 261 } -}; -static int16_t default_ncobmc_krnl_1_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 6603, 7905, 7762, 8159, 8426, 10334, 
10792, 10984, 12097, 10534, 11216, - 10624, 9514, 8715, 8672, 8846 }, - { 5897, 6238, 6272, 7323, 7162, 8091, 9465, 9845, 9929, 9747, 10562, 10737, - 9059, 7651, 7330, 7314 }, - { 5572, 6017, 5568, 7112, 6617, 6501, 7334, 8494, 8900, 8826, 9852, 8034, - 6956, 7691, 7513, 6106 }, - { 4564, 3877, 4682, 4586, 5135, 5795, 7968, 7859, 7720, 6548, 6306, 5639, - 6357, 6514, 6493, 5609 }, - { 4142, 4154, 3332, 4193, 3873, 4977, 4685, 5787, 5707, 5300, 5854, 4720, - 5452, 5642, 4810, 4250 }, - { 2993, 3176, 3012, 2637, 2664, 4336, 4207, 3687, 4627, 4487, 4847, 4120, - 4079, 3931, 3730, 3205 }, - { 2479, 2268, 1858, 1737, 2266, 2806, 2919, 3017, 3231, 2964, 3181, 3423, - 3096, 3025, 2684, 2353 }, - { 1969, 2001, 1997, 1959, 1323, 1565, 1963, 1351, 1957, 1711, 2093, 2057, - 1762, 1926, 1118, 1367 }, - { 1473, 816, 655, 1628, 1252, 1764, 1723, 1675, 2559, 3029, 1951, 2160, 2305, - 2299, 1688, 1048 }, - { 3073, 1667, 1324, 1360, 1562, 1774, 2154, 2740, 3281, 3434, 3258, 4095, - 2823, 2443, 2894, 2449 }, - { 3813, 2830, 3352, 2125, 2627, 2974, 3847, 3720, 4592, 4846, 4787, 5066, - 4598, 4229, 4032, 3478 }, - { 3415, 2733, 3827, 3637, 3381, 3743, 3768, 4732, 5055, 5445, 5870, 5937, - 5734, 5980, 5010, 4954 }, - { 4878, 3604, 5532, 4558, 4210, 4880, 4847, 5771, 5136, 6486, 7096, 6426, - 5765, 6824, 6094, 5753 }, - { 6076, 5817, 5318, 5268, 5784, 5482, 6453, 6582, 6803, 7077, 8113, 8173, - 8329, 7653, 6448, 6476 }, - { 7549, 5758, 5554, 6383, 7113, 7664, 7123, 6712, 8539, 8997, 9047, 8794, - 8700, 8760, 7600, 7995 }, - { 7698, 7133, 7048, 7498, 7821, 8401, 9152, 8647, 8934, 8874, 8595, 8789, - 8828, 8766, 9019, 8783 } -}; -static int16_t default_ncobmc_krnl_1_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5573, 5972, 5705, 5033, 5738, 3189, 2009, 1476, 2057, 2178, 1869, 2927, - 3305, 4036, 4017, 5328 }, - { 7539, 7568, 7302, 5564, 4410, 3954, 2153, 2693, 622, 1831, 1753, 1636, 3552, - 4374, 4319, 6015 }, - { 8753, 7544, 6620, 5710, 6142, 5819, 2731, 2898, 1702, 1487, 2249, 1688, - 4110, 
4483, 5108, 5621 }, - { 9273, 7922, 6245, 6310, 6442, 5272, 3068, 2649, 1599, 2693, 3219, 4501, - 4588, 4310, 5647, 6894 }, - { 9697, 8245, 7267, 6551, 5199, 4626, 3466, 3256, 2099, 3125, 3608, 4297, - 3944, 5468, 6056, 7545 }, - { 8831, 8583, 7466, 6937, 6056, 5482, 3407, 3324, 1802, 3128, 3078, 4560, - 4560, 5901, 6131, 7612 }, - { 9556, 7457, 6602, 7342, 5370, 4431, 3573, 3339, 1668, 3172, 3779, 4564, - 5744, 7244, 8522, 8407 }, - { 10238, 8809, 7064, 6643, 4885, 4246, 2737, 2684, 2501, 3443, 3761, 6174, - 5891, 6882, 7585, 8821 }, - { 10151, 10001, 8289, 6859, 6054, 4903, 3809, 3540, 2644, 3424, 3542, 4649, - 5389, 5384, 6733, 8360 }, - { 9635, 9516, 7609, 7438, 6181, 4529, 4140, 3439, 2568, 3338, 3789, 5195, - 5510, 6181, 7566, 8512 }, - { 9988, 8848, 6807, 6731, 6139, 5355, 3797, 4097, 3364, 3319, 4230, 5136, - 5581, 6125, 7748, 8229 }, - { 10252, 9244, 7204, 7260, 6478, 6040, 4659, 3920, 2869, 3263, 4068, 5475, - 5714, 7183, 7153, 8318 }, - { 9682, 9366, 7096, 6059, 6036, 4463, 3898, 3477, 2065, 2704, 4434, 5167, - 5502, 6743, 8002, 7443 }, - { 9252, 8211, 6399, 6747, 6498, 5626, 4016, 3880, 2172, 2557, 3576, 4270, - 4968, 5115, 6840, 7550 }, - { 8753, 8157, 7097, 6500, 5779, 5174, 4190, 2645, 2380, 3239, 4155, 5263, - 5437, 5337, 5663, 6667 }, - { 9680, 7710, 6921, 5657, 4863, 3990, 3485, 2172, 2620, 3003, 3328, 4112, - 4806, 6020, 6833, 7212 } -}; -static int16_t default_ncobmc_krnl_1_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { -219, -121, 416, 1445, 1150, 2238, 3251, 3695, 2496, 3247, 2925, 2639, 3064, - 2869, 3167, 1320 }, - { -68, -450, 130, 2039, 2952, 3584, 3966, 4171, 5961, 4555, 3995, 3732, 3281, - 3731, 3827, 2052 }, - { 262, 1161, 2497, 2934, 2690, 3939, 5735, 4938, 5906, 5924, 4296, 6388, 4553, - 3551, 3297, 4164 }, - { 1091, 3025, 3566, 4005, 3874, 5040, 5600, 6151, 7241, 6990, 6924, 6186, - 5356, 5256, 3726, 3083 }, - { 1079, 3140, 4769, 4958, 6480, 6589, 8111, 7988, 8255, 7879, 6838, 7052, - 6751, 5005, 5393, 3931 }, - { 3566, 4255, 5572, 5909, 
7098, 6653, 8641, 9199, 9689, 8617, 8673, 7591, - 7733, 6676, 6324, 4737 }, - { 3411, 5802, 7481, 7149, 8259, 9553, 9900, 9854, 11285, 9779, 9040, 7939, - 7515, 6037, 4902, 5075 }, - { 3417, 5718, 7095, 7425, 9913, 10666, 11679, 11951, 11429, 10749, 10173, - 8116, 8610, 7605, 7548, 5992 }, - { 4408, 5515, 7201, 7627, 8922, 9470, 10636, 11166, 11159, 9844, 10673, 9502, - 8693, 8503, 7905, 7046 }, - { 3340, 5097, 7171, 7366, 8273, 9660, 9784, 10332, 10155, 9232, 9301, 7056, - 7798, 7746, 5981, 5402 }, - { 2531, 4732, 6148, 7284, 7672, 8287, 8551, 8672, 8567, 7846, 7199, 5757, - 6057, 5863, 4613, 4578 }, - { 2646, 4394, 5195, 5511, 6471, 6443, 7713, 7854, 8408, 7427, 6461, 4968, - 4731, 3294, 4066, 2960 }, - { 1692, 3664, 3881, 5480, 6162, 6871, 7635, 7198, 8963, 6891, 4694, 4801, - 5141, 2932, 2459, 3060 }, - { 769, 2144, 4310, 3945, 4125, 5329, 5712, 5975, 7200, 6220, 4179, 3662, 2868, - 3007, 2579, 1958 }, - { -45, 2434, 3549, 3335, 3273, 3357, 5394, 6931, 5159, 3956, 2912, 2164, 2187, - 2187, 2935, 1388 }, - { -1514, 786, 2135, 3045, 3561, 3922, 3800, 5515, 4650, 4225, 4169, 3387, - 2539, 1590, 317, 161 } -}; -static int16_t default_ncobmc_krnl_1_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 2375, 1912, 1469, 213, 933, -507, -173, -369, -333, 187, -128, 427, 999, - 1166, 1515, 2728 }, - { 1857, 1085, 817, 454, 598, 479, 53, -218, -611, 240, 76, 31, 284, 1347, - 1738, 1317 }, - { 1911, 531, 453, 89, 639, -361, -331, -605, -162, 63, -154, 259, 446, 390, - 708, 1113 }, - { 818, 1304, 871, 665, 1006, -114, -405, -407, 331, 203, 304, 506, 476, 1053, - 1155, 879 }, - { 1054, 874, 714, -162, 624, -144, -306, -541, 30, -281, 296, 812, 418, 858, - 755, 252 }, - { 967, 388, 354, 878, 31, -691, -244, -307, 425, 281, 0, -50, 110, -107, 279, - 255 }, - { 152, -53, 156, 266, 192, -864, -236, -110, 397, 484, -129, 14, 22, 44, -90, - 278 }, - { 203, -54, 103, -142, -598, -741, -546, -26, 545, 253, -43, -234, -391, -504, - -158, -143 }, - { 387, 275, 136, 69, -289, -9, -210, -364, 39, 3, 
4, 61, -66, -102, -94, - -215 }, - { 195, 213, 433, 158, 128, -131, -203, -266, -132, -285, -301, -367, -315, - -249, -144, -9 }, - { 600, 145, 418, 277, 156, -118, 85, -20, 119, 260, 41, 72, -85, 316, -97, - -41 }, - { 682, 610, 356, 880, 527, 272, 90, 92, -124, 314, -204, -339, -590, -384, - -248, -192 }, - { 999, 423, 208, 752, 623, 409, 91, -57, -3, -124, 148, 255, -7, 112, -128, - -144 }, - { 1007, 710, 609, 766, 264, -163, 324, 291, 219, -61, 24, 507, 74, 109, 127, - 629 }, - { 2211, 878, 853, 462, 86, 203, -71, 122, -36, 131, 308, 267, 210, 369, 50, - -96 }, - { 1810, 1630, 1123, 645, 610, 217, -93, -37, -220, -341, -250, -110, 135, 0, - 112, 93 } -}; -static int16_t default_ncobmc_krnl_1_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5824, 7106, 8063, 8929, 8632, 9731, 10164, 11047, 11088, 10239, 10606, 8987, - 8411, 7117, 6115, 5322 }, - { 4980, 6239, 7135, 7245, 7418, 8526, 9440, 9438, 8119, 8336, 7993, 8057, - 6686, 5210, 4193, 4841 }, - { 2436, 4500, 5019, 5908, 5578, 7270, 7391, 7974, 7281, 6871, 6705, 6327, - 4867, 4521, 4286, 3605 }, - { 2298, 3501, 4714, 4692, 4835, 5278, 5830, 4968, 4435, 4824, 4373, 4085, - 3825, 2657, 2539, 2557 }, - { 1643, 2741, 2604, 2664, 1877, 3334, 2995, 3162, 3367, 3104, 3356, 2827, - 3577, 2359, 1755, 2140 }, - { 742, 1397, 1315, 1332, 1864, 3032, 2472, 2253, 1692, 2071, 2260, 2426, 1951, - 1610, 1189, 1275 }, - { 482, 869, 598, 288, 769, 1490, 1284, 1692, 883, 1061, 1259, 1239, 1118, 585, - 219, 571 }, - { 178, 278, 580, 915, 717, 873, 1012, 721, 52, 348, 624, 540, 691, 102, -108, - 383 }, - { -718, -648, -223, -520, -1000, -754, -688, -639, -528, -414, -440, -365, - -268, -546, -672, -332 }, - { -256, -226, -395, -158, -311, -325, -66, 87, 533, 705, 261, 344, 484, 692, - 155, 507 }, - { 204, 448, 131, -571, 889, 712, 626, 349, 261, 578, 240, 1012, 849, 900, 889, - 977 }, - { 132, 1395, 1847, 1181, 845, 1203, 1920, 2068, 2141, 2071, 1834, 2191, 2130, - 2522, 1537, 1326 }, - { 140, 1278, 2440, 2063, 1581, 2204, 2781, 2532, 1677, 
2426, 2538, 2210, 1568, - 2564, 2394, 1945 }, - { 2943, 3776, 3833, 3310, 3900, 4118, 4161, 3571, 4059, 4143, 4145, 4273, - 4034, 3940, 3720, 2418 }, - { 3437, 3906, 4106, 4294, 5303, 5257, 4956, 4027, 5935, 5373, 4102, 4853, - 5331, 5251, 3964, 4748 }, - { 5493, 5799, 5966, 6535, 7015, 7397, 8011, 6526, 5832, 6257, 6247, 7097, - 6499, 6272, 5963, 5593 } -}; -static int16_t default_ncobmc_krnl_1_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 6049, 4906, 3617, 2960, 2187, 1950, 556, 497, 688, 355, 503, 1054, 1170, - 1641, 2343, 4226 }, - { 7459, 6408, 4326, 3635, 2042, 1565, 492, 572, 746, 338, 719, 797, 2540, - 2283, 2301, 4089 }, - { 8025, 6914, 5072, 4249, 2793, 1910, 430, 1137, -150, 451, 1061, 872, 1515, - 2805, 3823, 4550 }, - { 9615, 6936, 5226, 3388, 2611, 2061, 801, 1003, -537, 72, 736, 1347, 2215, - 3509, 4262, 5097 }, - { 9677, 6521, 5633, 5223, 2996, 2449, 1300, 1136, 160, 918, 488, 801, 2306, - 3781, 4818, 6441 }, - { 9988, 7509, 6019, 4950, 3376, 2777, 1427, 1395, -118, 310, 393, 1626, 3387, - 3649, 4737, 7431 }, - { 10542, 7745, 5192, 4494, 1637, 1960, 1212, 1056, -309, 383, 1166, 2107, - 4048, 4030, 7206, 7851 }, - { 9350, 7480, 4343, 3589, 1748, 1687, 1057, 898, 592, 776, 680, 1960, 3804, - 4598, 5688, 7834 }, - { 8769, 7236, 5518, 4182, 2776, 2412, 915, 1370, 789, 561, 520, 1146, 3139, - 4730, 5542, 7514 }, - { 9580, 7116, 5910, 4623, 3085, 2450, 1703, 745, 419, 600, 1077, 1208, 3256, - 4261, 5611, 6709 }, - { 9725, 7053, 5594, 4217, 2573, 1834, 562, 512, 496, 356, 883, 1360, 3323, - 4866, 5632, 7594 }, - { 10110, 7367, 5622, 3858, 3720, 2398, 1075, 1687, 616, 461, 1082, 1786, 2570, - 4242, 5731, 8319 }, - { 9416, 7582, 6054, 3915, 3283, 2035, 1335, 1138, 317, 92, 253, 483, 1715, - 3597, 5613, 8103 }, - { 8693, 6881, 4626, 3505, 2663, 1949, 751, 792, -343, 55, 303, 460, 2027, - 3584, 6230, 8704 }, - { 7368, 6609, 5087, 3861, 2790, 1746, 1487, 518, 497, -165, 439, 904, 2514, - 3735, 6082, 6914 }, - { 7004, 5321, 3472, 2621, 1221, 999, 1172, 377, 850, 864, 
866, 647, 2574, - 3977, 6416, 7777 } -}; -static int16_t default_ncobmc_krnl_1_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 2085, 2421, 3201, 4245, 4593, 5179, 5800, 5172, 4904, 5558, 5357, 5889, - 5769, 6415, 6377, 4080 }, - { 2031, 2607, 4062, 5018, 6279, 5766, 6373, 6562, 8085, 7434, 7557, 7449, - 6834, 7509, 8119, 6106 }, - { 3960, 4394, 5800, 6108, 7339, 7531, 8876, 7849, 9371, 8973, 8753, 8896, - 9525, 8636, 7540, 7092 }, - { 3599, 4610, 5527, 7597, 7898, 9121, 10115, 10783, 12123, 11248, 10928, - 10406, 9827, 9129, 8401, 7814 }, - { 3953, 6203, 7382, 8619, 10852, 10722, 12369, 12580, 12777, 12605, 12198, - 11899, 10047, 9350, 9018, 7521 }, - { 4615, 7038, 8644, 9190, 11073, 11216, 12685, 13003, 14345, 13679, 13689, - 12344, 10902, 11188, 10148, 7399 }, - { 5141, 7775, 10402, 11309, 13751, 13759, 14094, 13720, 15371, 14418, 14061, - 12988, 11166, 11692, 9019, 7665 }, - { 6591, 8644, 11320, 11985, 14476, 14526, 14816, 14745, 15159, 14966, 15071, - 14071, 12238, 12154, 10931, 8266 }, - { 7897, 9483, 10910, 12615, 14865, 14701, 16336, 15966, 16036, 16200, 16266, - 15506, 13546, 12270, 11580, 9377 }, - { 6808, 9239, 10394, 11719, 13438, 14348, 14923, 15789, 15519, 15341, 15316, - 15166, 12927, 11656, 10736, 9138 }, - { 5796, 8696, 10198, 12417, 12722, 13926, 15077, 15506, 15468, 15155, 15184, - 13906, 12262, 10270, 9924, 7815 }, - { 5386, 6960, 8500, 10429, 11262, 12474, 13263, 12505, 13713, 13502, 13632, - 12702, 12233, 9964, 9329, 6889 }, - { 5768, 7049, 7630, 9626, 10868, 11697, 12128, 12718, 14351, 13953, 13402, - 13389, 13063, 10072, 8470, 6445 }, - { 3665, 4962, 7272, 8760, 9507, 10431, 11095, 11676, 12400, 12216, 11874, - 11099, 10214, 8725, 6279, 4598 }, - { 3293, 4948, 6288, 7711, 8156, 9140, 9976, 11683, 9946, 11003, 11496, 10325, - 8287, 6988, 6251, 4796 }, - { 2010, 3599, 5789, 6548, 7490, 7725, 7264, 9488, 9893, 9573, 9487, 8725, - 7145, 6110, 3858, 2891 } -}; -static int16_t default_ncobmc_krnl_2_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 3437, 3490, 4578, 
2066, 1672, 1354, 1502, 2345, 2273, -600, 52, - 272, 484, 2214, -1553, -197, 165, 278, 306, 384, 73, 415, - -213, 357, 497, 288, 714, 6, -82, -118, 170, 181 }, - { 2505, 3488, 306, 3011, 2631, 181, 636, 2608, 1663, -964, 594, - -1455, 1057, -1198, -1647, 187, 404, 412, 177, -32, 269, -24, - 148, 233, -290, -359, -178, -164, -362, -19, -408, 106 }, - { 2588, 3528, 3391, 3134, 1812, 2387, -34, -298, -13, -955, 40, - -475, 1243, 283, -247, -484, 200, -46, 36, -642, -386, -438, - 34, 295, 93, -528, -13, 412, -8, 41, -457, 28 }, - { 796, 3353, 435, 3473, 458, 1851, 519, 1061, 259, 942, 416, - 195, 390, -151, -1141, -710, 716, -401, 33, -771, -759, -336, - 88, -124, -139, -372, -223, -505, -164, -100, -512, -465 }, - { 3233, 3990, 2698, -107, -448, 297, 331, -13, -530, -383, -464, - -1530, 715, -899, -1978, -879, 43, 93, -77, -138, -425, -97, - -167, -348, -460, -95, 280, -45, 235, 172, -357, -200 }, - { 868, 4162, 1417, 487, -1446, -355, 392, -159, 202, 704, -814, - -3095, -1052, -1482, -745, -1403, -199, -27, -38, -387, -208, 20, - -64, -130, -265, 81, -20, 238, 49, 121, -137, 495 }, - { 2774, 3478, 2072, 1229, 819, 1359, 106, -222, -1445, -1559, 924, - -98, 44, -347, 455, -862, -318, -288, -31, 281, -144, -107, - 148, 103, -171, -239, -134, 25, 125, 108, -142, -129 }, - { 610, 990, -703, 1003, 437, -275, -179, -233, -2041, -445, -1145, - -488, 335, -2684, -1339, -294, -176, -195, -36, -65, -276, 10, - -111, -277, -134, -222, -51, 31, -369, -279, -105, 69 }, - { 420, 2773, 375, -372, 489, 989, -900, 1075, 182, 119, -529, - -470, -504, -2225, 225, 101, -264, -417, -253, -459, -317, -205, - -528, -7, -43, -268, -116, -857, -608, -208, -216, 220 }, - { 2969, 1927, -314, -476, 402, -637, -838, 835, 1229, 1200, 135, - -299, -324, -2136, 340, -1563, -309, -98, -408, -137, -154, 668, - 101, -90, 245, 112, -51, -37, -525, -254, -244, -126 }, - { 1404, -258, 2333, 2019, 309, -29, -2468, 18, -494, 70, -260, - 245, 515, -1984, -1759, -1003, -504, 104, 472, 197, -38, 265, - 378, 
6, 50, -183, -204, -17, -383, -318, -396, 142 }, - { 807, 637, 712, 1237, -971, -176, -1160, -210, -2072, -782, -959, - -372, -590, -1159, -1017, -889, -750, -399, -98, -15, 2, -172, - -48, -488, -628, -12, -25, 136, 229, -200, -212, -472 }, - { -1464, 333, -1978, -1394, -281, -1820, -124, 385, 97, -297, -1497, - -3, -916, -660, -949, -504, 117, 11, 86, 88, 2, 219, - 333, -120, -224, 71, 237, -507, 13, -381, -207, -113 }, - { 1100, -717, -1827, -1908, -1030, -1562, 404, 794, 4, -682, -1306, - -612, -1197, 8, -131, 525, 159, -345, -91, 9, -222, -482, - -69, 482, 593, -32, -239, -408, -522, -692, -126, 712 }, - { -798, -735, -174, -1695, 819, -737, -15, -426, -750, 876, 34, - -622, 448, -71, -950, -2094, 74, 170, 18, 57, 156, 443, - -85, -374, -416, -537, -348, -126, 62, -381, 399, -53 }, - { -552, -1352, 536, -1, -322, -1094, -428, 309, -142, -752, 354, - 900, 473, -137, -1263, -370, -731, -864, -30, -101, 354, -321, - -523, 377, 9, -415, -87, -145, -154, -286, 100, 23 }, - { 44, 607, 316, -268, -246, -497, 267, 154, 160, 717, 324, - 240, -130, -218, -107, -252, -64, 4, 113, -57, -162, 123, - -5, 143, -312, -66, -230, -33, -57, 60, 153, 85 }, - { 158, 14, -307, -240, -85, -416, 304, -402, -461, -221, 193, - -123, 384, -142, 48, -77, -378, 36, -56, 20, 2, -240, - -88, -1, -185, 87, 6, 94, -22, 82, 191, 194 }, - { 417, 259, -85, -170, -45, -151, -402, 136, 28, -40, 101, - 224, -337, 97, 98, 51, -401, 95, -77, -153, 357, -99, - -473, -142, -289, -80, -349, -76, -87, 97, 40, 198 }, - { -236, 62, -104, -8, 98, 68, 128, 116, 47, 54, -121, - -150, -20, -120, 196, -80, 37, 290, 231, 247, 131, -113, - -126, -87, 65, 250, 260, 102, -68, 234, 76, -87 }, - { 245, 486, 38, -10, -135, 106, 217, -187, -200, 96, 20, - 117, -40, -97, 68, -139, 276, 8, -55, -53, -187, -20, - -41, 1, -145, -246, -106, -45, -145, -353, 185, -35 }, - { 448, -172, -496, -63, -84, -106, 151, 9, -143, -180, -38, - -276, -223, 269, 100, 38, -236, -66, 124, -59, 475, -78, - -407, -20, -119, -19, 162, -4, 
-226, 101, 247, 78 }, - { -348, -156, -324, -260, -173, 0, -41, 63, 235, -114, 109, - -362, -96, 279, -277, 36, 394, 394, 240, 30, -88, 209, - 29, 176, 59, -20, -244, 50, -104, 192, -157, 48 }, - { -376, -176, 269, -426, -159, -108, -18, -163, 93, 130, -222, - -40, 539, 176, 164, -62, -709, -354, 502, 664, 243, -414, - -51, 192, 33, 54, -10, -57, -141, -3, 144, 71 }, - { -137, -636, 627, 6, -129, -159, -45, -150, -15, 402, 207, - 20, 202, 1, -203, 88, 183, 62, -76, 120, 418, -196, - -104, -154, -433, -338, -73, 1, -79, -14, -200, 84 }, - { 184, -334, 175, 114, -274, -60, -429, 176, 36, 373, 468, - 134, 110, -11, -201, -94, 352, 109, 115, 91, 187, -83, - 21, 0, -154, -180, 288, 0, -61, -197, -246, 42 }, - { -143, 26, 190, -110, -335, -385, -357, 27, 103, -66, -96, - -189, -337, -150, 129, -104, -176, -418, -216, -118, 28, 126, - -112, -130, 110, 17, 141, 111, -82, 238, 22, -50 }, - { 104, -95, 48, -239, -40, -148, -327, 244, 323, -102, 244, - 151, 113, -150, -74, 223, -81, -328, -178, 140, -233, -165, - 182, 514, 216, -129, -8, 141, -81, 451, -110, -71 }, - { -116, 84, -228, 177, 318, 62, 134, -3, 239, 14, 338, - 278, -439, -254, 3, -82, -210, -62, -236, -124, 5, -60, - 112, -18, -115, -31, 5, -65, 278, 4, -19, -130 }, - { 236, -64, -147, -519, 147, -27, 71, -567, -133, 24, -199, - 229, -107, 126, -141, -148, -35, -34, 68, 230, 8, 72, - 40, -148, 203, 97, 84, 107, 32, 17, -58, -18 }, - { -43, -408, -101, 120, 118, 168, -170, -233, -323, -120, -339, - 80, -294, -151, 85, 52, -420, 79, -162, -233, -237, -47, - -131, -53, -199, 14, 85, -80, 93, -150, -15, 318 }, - { -106, 107, -6, 189, 53, -109, 22, -474, -335, -102, -279, - -321, -66, 186, -65, -13, 61, 167, 43, -159, -57, -13, - 37, -125, -137, 132, 161, -156, -27, -276, -89, 15 } -}; -static int16_t default_ncobmc_krnl_2_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5401, 5987, 4279, 6550, 4858, 4986, 5733, 7172, 8194, 7631, 7549, - 6971, 9288, 7485, 8583, 9244, 12058, 11530, 10461, 8453, 8304, 11724, - 8999, 9457, 5018, 
6922, 8375, 7860, 7915, 6921, 7703, 8963 }, - { 2308, 2670, 5018, 5298, 3883, 6449, 4267, 4119, 9252, 10082, 7844, - 7414, 9050, 9261, 8739, 7808, 10974, 10279, 8627, 8840, 9203, 9406, - 9360, 10574, 10156, 7673, 6238, 8876, 6800, 6423, 6931, 8589 }, - { 6608, 4325, 3372, 5227, 6182, 3670, 5595, 5758, 8575, 8025, 8251, - 10711, 5449, 6965, 5443, 7178, 9099, 8842, 7132, 7830, 5795, 9882, - 8939, 8323, 7507, 7248, 8750, 6786, 6940, 4942, 7125, 6399 }, - { 3977, 3060, 4962, 7094, 7211, 6388, 6256, 3960, 7672, 7814, 7711, - 7237, 7088, 7232, 5716, 6040, 9565, 6643, 8113, 7841, 9849, 10144, - 8297, 7676, 6792, 8447, 7805, 5475, 5499, 4728, 5379, 7645 }, - { 4598, 4391, 3660, 6284, 6694, 8302, 5610, 5341, 7466, 6298, 6406, - 7734, 5743, 5155, 5257, 6958, 9035, 11566, 9636, 7825, 8147, 9427, - 6612, 5526, 7635, 7259, 7696, 7853, 5505, 6744, 9265, 5394 }, - { 5980, 2356, 2746, 5955, 4045, 4283, 5117, 3799, 5386, 5594, 7671, - 6984, 6232, 6028, 3101, 3391, 5757, 9530, 7408, 6206, 5512, 7867, - 5144, 8011, 6690, 6994, 4877, 5063, 6175, 5205, 1965, 859 }, - { 2619, 4096, 4225, 4712, 5637, 6418, 6649, 3904, 5463, 5102, 4785, - 4100, 5127, 3858, 3419, 5301, 6002, 7649, 8260, 6241, 4168, 4551, - 6153, 5016, 7113, 7845, 5201, 5455, 5069, 2335, 3311, 5194 }, - { 1278, 4942, 4441, 3456, 3791, 5620, 5275, 2243, 5080, 4619, 5834, - 4859, 4320, 5092, 1481, 846, 4969, 4835, 3646, 5940, 5736, 5862, - 3628, 5918, 5865, 4945, 4385, 4699, 4342, 5415, 8383, 4711 }, - { 3855, 1678, 2560, 4631, 2765, 1444, 1449, 1895, 4494, 5706, 4813, - 4882, 3532, 2264, 3222, 5444, 4097, 5236, 5036, 3713, 6547, 4371, - 5311, 2363, 5113, 6290, 3743, 5343, 5369, 2813, 2486, 1647 }, - { -651, 1098, 2116, 3495, 2289, 1836, 4507, 4057, 5225, 4553, 2631, - 2791, 2984, 3605, 3416, 3611, 4358, 4719, 3450, 4146, 3973, 3263, - 3826, 5881, 6402, 4584, 4396, 3689, 2020, 1960, 2100, 4304 }, - { -622, 1848, 379, 112, -1474, 1013, 6023, 260, 1035, 1984, 3811, - 2362, 1394, 2546, 3347, 2472, 1865, 755, 2251, 1139, 1933, 
2252, - 1163, 3003, 4091, 4792, 3801, 3517, 4247, 3798, 5216, 4543 }, - { 1342, 2229, 1014, 1212, 260, 432, 1975, 99, 2798, 818, 2455, - 3858, 2231, 3773, 136, 857, 2171, 815, 1966, 1825, 1711, 964, - 2142, 2514, 5367, 3539, 3241, 3116, 3982, 3839, 3553, 3535 }, - { 1800, 27, 321, 111, 1003, 528, 254, 979, 2444, 2413, 3807, - 961, 1961, 1173, 2156, 3935, 259, 263, 1815, 1979, 1218, 2393, - 3738, 1109, 4444, 3726, 3647, 3428, 2966, 4602, 4903, 5851 }, - { 1340, 753, 317, 1318, 738, 1880, -500, -691, 1108, 38, 412, - 890, 494, 291, -131, 759, -111, 221, -95, 2575, 3099, 3223, - 3140, 3156, 3952, 1942, 2615, -2313, 2991, 6367, 5744, 4528 }, - { 752, 490, 1255, 2396, 14, 3819, 1319, 1239, 3491, 2464, 3243, - 3083, 392, 1273, 1712, -226, -931, -2130, 710, 864, 385, 265, - 1431, 1796, 3063, 3531, 3879, 3986, 3503, 4045, 2539, 3489 }, - { 1943, 170, 358, 1884, 2344, 1566, 92, 1721, 1381, 1115, 723, - 1670, 2294, 1497, 1697, 973, 1286, 2306, 381, 2582, 2551, 3852, - 2481, 3432, 2273, 3079, 2076, 3014, 3365, 3906, 2241, 2250 }, - { 1741, -705, 595, 956, 2038, 793, 1518, 148, -524, -881, -487, - 711, 720, 773, 431, 2181, -435, -841, -1106, -552, 434, -2007, - -41, -234, -960, -23, 394, -655, 792, 934, 1495, 1947 }, - { 2086, 1360, 97, 1352, -95, 1800, -729, -916, -152, 956, 196, - 1746, -1973, -690, 472, 1788, -28, 385, 781, 589, -320, 1167, - -484, 66, 1136, 1038, 1741, 888, 3056, 2114, 3495, 1297 }, - { 1900, 1373, 983, 3718, 1409, 2096, 932, -604, -1370, 1153, 109, - 58, 104, 2851, 602, -2071, 252, -888, 1428, 2724, 1344, 1567, - 563, 1902, 1370, 519, -294, 393, 1153, -1032, 2129, 335 }, - { 2652, 2620, 3178, 2344, 2466, 2241, 1145, -101, -635, 306, -1036, - 638, -2606, -1921, -1098, -328, -324, 2598, 1092, 1832, 493, 2507, - 1152, 1461, -796, 2126, -742, 1182, 2078, 1549, 2665, 2366 }, - { 1080, 798, 1934, 568, 1218, 3206, 155, 1844, 2313, 3509, 1090, - 650, 1166, 2515, 1846, 1025, 259, 720, 1587, 3010, 4955, 6457, - 2952, 2764, -396, 1937, 1563, 673, 828, 4062, 2711, 
1548 }, - { 871, 657, 2761, 1756, 2349, 198, -1003, -1105, -1181, -69, 146, - 3201, -27, 1493, 13, 291, -2260, -468, 1178, 928, 2665, 3887, - 3140, 1334, 1969, 2687, 544, 3842, 2885, 733, 3419, 1963 }, - { 1491, 1698, 302, 2127, 1256, 907, 1607, 1833, 2061, -536, 988, - 4380, 2723, -195, 962, 1769, 2466, 1735, 2707, -369, -713, 1599, - 3031, 2924, 2023, 2045, 5259, 1733, 3517, 4274, 440, 412 }, - { 2163, 1, 167, 1755, 5694, 3272, 739, 4235, 6123, 3811, 4611, - 5800, 2424, 2409, 1458, 2152, 104, 115, 466, -998, -806, 2824, - 4473, 2511, 4878, 3258, 5014, 3559, 1003, 2074, -2091, 1403 }, - { 964, 1051, -1527, 1266, 3883, 2349, 1054, 1972, 1929, -249, 3796, - 2861, 1542, 449, 539, 1942, -16, 58, 2080, 56, 1106, 4248, - 580, 2540, 3095, 4536, 152, 354, 4067, -2246, 1505, 1981 }, - { 1081, 1440, 324, 736, 2839, 2597, 3712, 2282, 3717, 2483, 1247, - 4456, 3604, 3415, 2487, 3715, 2073, 2928, 2372, 828, -2700, 2054, - 4315, -125, 1777, 2211, 2992, 7336, 4216, 3571, 2657, 6780 }, - { 1997, 2104, 1255, 1942, 1335, 1450, 3567, 1447, 3812, 6083, 5233, - 4484, 3536, 3564, 3290, 4062, 2589, 2816, 3971, 4406, 3481, 2664, - 1245, 1759, 3353, 1036, 2054, 1299, 2263, 4010, 4171, 3972 }, - { 1519, 4826, -750, 988, 1338, 2999, 212, 3858, 5202, 5306, 5717, - 3066, 2629, 6461, 6043, 6637, 8388, 7252, 4890, 4161, -1056, 4615, - 2538, 5633, 3389, 6439, 2985, 7148, 5149, 4509, 8001, 8863 }, - { 1047, 876, 2713, 3913, 2232, 1084, 1702, 2626, 1983, 3744, 2044, - 3690, 2087, 4497, 2656, 5592, 6247, 4584, 4218, 6097, 6884, 6277, - 2412, 5097, 7400, 2789, 6089, 6157, 7247, 9712, 11393, 5627 }, - { 2876, 4288, 2443, 3081, 1569, 1823, 1050, 2325, 2558, 2591, 4223, - 6300, 4237, 4354, 4411, 7502, 4175, 3350, 4208, 1100, 6473, 6664, - 5460, 4207, 5297, 8047, 6850, 6496, 7866, 10375, 7455, 2868 }, - { 3282, 5838, 6486, 6479, 3474, 4665, 3790, 2882, 5116, 4457, 4649, - 4208, 4520, 7271, 4363, 7125, 8799, 6540, 10158, 5716, 6794, 5762, - 6462, 8561, 2742, 7002, 9454, 8451, 8560, 7973, 7759, 6679 }, - 
{ 5957, 7221, 5126, 7057, 5824, 4274, 5374, 6023, 7549, 6239, 7666, - 6368, 4014, 5338, 7150, 9793, 10608, 9838, 6748, 9691, 5465, 4631, - 7964, 7692, 8173, 9362, 8989, 11677, 10282, 9960, 6666, 9276 } -}; -static int16_t default_ncobmc_krnl_2_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 7499, 5941, 5384, 4566, 4006, 3634, 2288, 4112, 2127, 3001, 2639, - 1927, 467, -1639, 1484, 1143, 66, -316, 626, 1721, 1208, 193, - 1591, 3903, 8472, 3945, 1882, 4378, 6453, 8972, 11867, 10110 }, - { 7919, 6226, 8601, 3825, 4644, 4380, 3957, 2964, 1316, 3586, 2268, - 2802, 2193, 1427, 1479, 1353, -55, 373, 271, 979, 526, 1827, - 2463, 1938, 3963, 4851, 5040, 4192, 3731, 4522, 8903, 6733 }, - { 6373, 4994, 6414, 4822, 4923, 4881, 4383, 6117, 3342, 5068, 2353, - 2370, 2231, 758, 1768, 1338, 742, 1498, 454, 1453, 1466, -213, - 177, 1223, 512, 5366, 2462, 4667, 5671, 5039, 6065, 6874 }, - { 9299, 8698, 12939, 6170, 7063, 3147, 3256, 3492, 2696, 4498, 3705, - 3176, 2797, 1099, 2852, 1331, 527, 1272, -388, 1619, 110, -406, - 390, 3801, 4468, 3193, 2944, 7284, 7144, 4560, 6320, 8073 }, - { 5937, 4572, 5212, 6678, 5291, 2561, 2752, 4892, 2713, 5203, 4202, - 1527, -470, 2424, 2850, 1217, 401, 587, 191, 1122, 1314, 1854, - 3860, 4579, 2455, 5427, 1614, 5037, 5073, 5074, 3101, 7734 }, - { 7035, 5229, 7515, 6523, 7587, 5653, 5311, 4945, 4097, 4237, 2836, - 2667, 1959, 4095, 1669, 1484, 57, 467, 1028, 642, 2843, 2782, - 3604, -825, 1592, 4305, 2202, 4432, 4683, 3867, 3520, 9281 }, - { 7248, 3787, 4243, 4710, 3288, 1975, 2766, 4057, 1506, 2644, 1436, - 818, 1150, 2159, 787, 920, 98, 137, 1065, 306, 3880, 537, - 3871, 1060, 3821, 3395, 2484, 3532, 4072, 3339, 2638, 3982 }, - { 8810, 5802, 5538, 4090, 3659, 3742, 3818, 6827, 6474, 4756, 4093, - 3735, 4063, 4586, -1945, 470, 328, -163, 958, 511, 2541, 3057, - 2972, 4349, 4754, 5115, 5847, 6843, 7299, 6652, 5891, 5655 }, - { 9091, 5007, 6438, 4749, 5610, 3664, 6151, 5188, 3686, 2005, 2670, - -245, 1788, 3724, 2626, 679, -52, -839, -145, 356, 3488, 1970, - 
1988, 2126, 1099, 2578, 5401, 6965, 4908, 5526, 6748, 5968 }, - { 6412, 7516, 8029, 8748, 6742, 7509, 6552, 4078, 4300, 5066, 4786, - 3270, 4270, 3875, 2319, 4282, 1640, -843, -439, 427, 1587, 520, - -28, 2251, 3358, 3049, 4407, 7286, 8994, 7802, 5924, 6824 }, - { 8467, 6838, 3934, 2952, 7200, 5407, 4593, 5882, 3353, 3865, 1870, - 1535, 2130, 4121, 3527, 1799, -637, -937, 513, 247, 169, 607, - 2947, 3530, 3717, 6082, 9703, 6867, 2729, 6292, 3084, 4879 }, - { 9934, 8638, 7508, 6894, 7343, 5306, 6208, 6136, 5240, 7136, 3958, - 1811, 3171, 1064, 2246, 882, 1681, 727, 1694, 769, 1700, 1370, - 1901, 5812, 3852, 6468, 5875, 5416, 6007, 3348, 3600, 6661 }, - { 10978, 9383, 9741, 10746, 5208, 8469, 4608, 5824, 4424, 3460, 3841, - 4037, 3687, 1582, 3784, 988, 1974, 1292, 2272, 2128, 2210, 2888, - -967, 5864, 5568, 4693, 3796, 6361, 4816, 2697, 4559, 6437 }, - { 8329, 9809, 8672, 9375, 7503, 5775, 3454, 4596, 5093, 5033, 4021, - 2860, 2833, 2782, 3056, -617, 1644, 1759, 2434, 2570, 3312, 3807, - 3518, 3521, 1126, 2830, 3378, 4432, 3261, 5211, 4073, 10050 }, - { 9992, 8148, 7951, 7194, 5624, 5032, 3296, 2981, 5388, 3910, 2274, - 1436, 1425, 1053, 2111, 2806, 1606, 1446, 1681, -211, 1877, 1541, - 1700, 2736, 2088, 2551, 1045, 2977, 2632, 1719, 4896, 5378 }, - { 9403, 8846, 8061, 7478, 5269, 6655, 6312, 4110, 3529, 5802, 3108, - 3246, 1943, 909, 2436, 1678, 1513, 1243, 797, 213, 3888, 4015, - 2775, 2082, 2395, 2792, 2136, 2475, 1657, 2156, 1878, 2587 }, - { 9499, 9075, 5426, 6962, 8206, 8057, 3968, 5184, 2759, 2277, 2744, - 3531, 2518, 367, 1075, 2118, 900, 901, 2964, 3641, 5282, 2186, - 2416, 2312, 2366, 2149, 1024, 1912, 1119, 220, 401, 727 }, - { 7615, 8271, 8148, 7699, 7063, 7658, 5473, 7497, 7302, 5841, 4165, - 3092, 734, 2215, 3316, 2226, 1197, 1236, 2996, 5007, 2872, 3460, - 2371, 1898, 1917, 1442, 853, 1412, 700, 620, 317, 1237 }, - { 8331, 8530, 8633, 7185, 6863, 9076, 5328, 5045, 5378, 4004, 4089, - 1469, 1341, -333, 2689, 1982, 115, -1158, 383, 1548, 1118, 2864, - 
3154, 1803, 2079, 1676, 1450, 1165, 967, 795, 136, 1184 }, - { 8763, 9102, 6716, 8961, 5448, 6366, 3438, 5722, 5374, 5651, 5422, - 1728, 1751, 2444, 1024, 1118, 424, 2288, 3655, 2719, 2254, 1313, - 3476, 1983, 1975, 1502, 1172, 2333, 937, 594, 122, 149 }, - { 8146, 9931, 7629, 8882, 6328, 7491, 5646, 5494, 7238, 7355, 4478, - 2019, 2646, 3486, 4193, 1121, 562, 1823, 2787, 1720, 2228, 3627, - 4470, 3351, 2439, 2214, 1926, 2118, 1771, 767, 353, 1062 }, - { 10816, 9814, 10917, 7424, 8207, 9717, 8537, 8728, 7356, 7376, 7246, - 3223, 1981, 277, 1282, 951, 515, 222, 1392, 789, 4372, 2112, - 4083, 2706, 3234, 2414, 2655, 1407, 702, 1369, 121, 676 }, - { 11362, 10078, 7520, 7828, 10705, 7300, 7358, 6559, 8337, 7569, 5067, - 3465, 2417, 1956, 2165, 759, -106, -1282, 1822, 3225, 4767, 5619, - 4119, 3383, 3877, 2702, 2410, 2459, 1441, 1392, 945, 216 }, - { 10112, 8115, 3762, 5107, 7443, 7676, 7498, 7380, 6235, 7523, 6246, - 3574, 2749, 3853, 303, 1558, 1896, 1107, 462, 2172, 2388, 4222, - 2000, 1688, 3560, 2297, 1593, 3679, 3628, 1507, 1549, -188 }, - { 7794, 6437, 6605, 5381, 6404, 4410, 6677, 4233, 4949, 3000, 2812, - 3756, 1805, 2877, 2098, 1737, 1809, 1427, 378, 2031, 2115, 5006, - 3159, 3602, 6343, 3503, 3356, 5971, 3138, 3932, 1028, 699 }, - { 6757, 7738, 6538, 8248, 6959, 6557, 5264, 3092, 3765, 1895, 1865, - 901, 2485, 2217, 1699, 1946, 3573, 1501, 2141, 2177, 180, 1003, - 1816, 4793, 2112, 4560, 3820, 2881, 4376, 2091, 681, 623 }, - { 9057, 8917, 7385, 7072, 6324, 5492, 5283, 5053, 5785, 4277, 3322, - 1267, 1946, 1894, 3701, 472, 1658, 1154, 777, 2193, 2349, 3611, - 3129, 3719, 1781, 5389, 3418, 2463, 3734, 3644, 3365, 2247 }, - { 9444, 9439, 8598, 9152, 6982, 8451, 8279, 6129, 5172, 3730, 2416, - 2483, 2775, 1913, 1041, -1110, -392, 1068, 556, 598, 4171, 2377, - 1870, 1906, 5449, 5413, 2589, 3564, 6473, 6692, 3140, 2665 }, - { 10567, 10001, 8225, 8289, 6898, 6856, 3920, 4547, 4297, 1456, 2348, - 1526, 2343, 2863, 1429, 312, 57, 930, 1619, 1189, 596, 1815, - 2589, 
3141, 1662, 3349, 1311, 4091, 4596, 7321, 5911, 6965 }, - { 9593, 9214, 9132, 8273, 8030, 8135, 5179, 5564, 4052, 4155, 4052, - 2249, 2178, 1680, 439, 822, -378, -1210, -1149, 3709, 2830, 747, - 2987, 5873, 795, 5124, 4233, 3887, 5573, 5312, 7258, 11014 }, - { 8373, 8033, 8934, 7880, 7434, 6144, 7528, 5163, 2591, 4301, 2489, - 4137, 1295, 760, 703, 805, -308, -320, 2205, -1113, 362, 581, - 2567, 689, 5949, 2652, 1996, 2138, 7469, 4835, 8058, 11132 }, - { 8586, 6026, 7656, 7201, 8141, 7249, 5995, 4896, 3152, 4255, 1711, - 3498, 3933, 1852, 1444, 715, -104, -695, 4021, 3937, 6478, 1755, - 935, 384, 1002, 2595, 3359, 4532, 7103, 5192, 12241, 14373 } -}; -static int16_t default_ncobmc_krnl_2_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { -18, 921, 2116, 3151, 5822, 6391, 6844, 2748, 3794, 6358, 6115, - 7194, 6145, 8324, 7847, 6181, 4052, 4867, 4967, 5823, 6786, 4035, - 5989, 2636, 2376, 5222, 5409, 4121, 2105, 626, -3363, -2857 }, - { 3594, 3991, 2433, 4231, 5187, 5335, 7496, 6672, 4132, 3625, 5649, - 7621, 4052, 6868, 7772, 7010, 5041, 5311, 7273, 6593, 6376, 5150, - 4421, 3618, 2523, 4188, 5275, 3469, 6209, 5459, 953, 947 }, - { 786, 3510, 3161, 3162, 3435, 5439, 6415, 4784, 4467, 4232, 5708, - 3775, 7437, 8362, 9398, 8331, 6300, 6049, 8740, 7748, 9508, 7139, - 7232, 6528, 8257, 4296, 5180, 4497, 3755, 6329, 3620, 3050 }, - { 2273, 1239, -1997, -385, 1641, 4987, 6332, 7869, 5742, 3115, 4523, - 5739, 6076, 8184, 8936, 9733, 5577, 8872, 8635, 7679, 7192, 6961, - 7586, 5022, 5256, 5107, 5842, 4127, 3898, 7191, 5184, 1097 }, - { 2576, 3444, 4787, 3494, 4843, 5213, 7669, 6154, 6713, 5224, 6221, - 8653, 10387, 9676, 10219, 9062, 6899, 4115, 6617, 7548, 7319, 5169, - 6051, 6609, 6735, 3759, 6779, 3520, 5518, 4355, 4386, 3459 }, - { 2457, 4623, 4686, 3390, 6167, 6776, 5546, 7755, 6678, 5831, 6667, - 9797, 9222, 7728, 12319, 12899, 10764, 6383, 7947, 9907, 8225, 5677, - 7690, 9312, 8324, 4971, 9288, 6616, 5448, 7180, 11014, 5709 }, - { 3687, 5015, 5834, 5702, 6619, 6602, 6844, 8607, 
10828, 10170, 9206, - 11527, 10057, 10677, 11683, 11009, 10585, 8869, 7057, 9542, 8465, 11391, - 6180, 10182, 5594, 5353, 8810, 7358, 7118, 10591, 10569, 7318 }, - { 5659, 4619, 7090, 7819, 8483, 7258, 7446, 7530, 6847, 7424, 7586, - 8261, 7644, 9373, 18173, 15351, 11259, 11899, 11787, 9977, 8370, 7422, - 9853, 6375, 5873, 6503, 6194, 4792, 5082, 4563, 2192, 5942 }, - { 3004, 6927, 6994, 7359, 7505, 10247, 9661, 8199, 7979, 8529, 9388, - 12192, 11555, 12591, 10308, 10143, 12579, 12379, 11700, 12735, 6629, 10209, - 9592, 11878, 10187, 7755, 7344, 4922, 6699, 8240, 7341, 8532 }, - { 7590, 5795, 6512, 4587, 6933, 7660, 6141, 7410, 5605, 5542, 8790, - 10597, 9438, 10999, 10270, 10028, 10678, 12591, 13767, 11933, 10966, 11898, - 12452, 8305, 6352, 8621, 7598, 5409, 5869, 6860, 8606, 5371 }, - { 7095, 7927, 9729, 11290, 10321, 9966, 8226, 10211, 12468, 10459, 10959, - 12232, 12326, 11686, 11247, 13106, 15660, 16448, 13119, 14772, 14295, 13233, - 11880, 9805, 8498, 5650, 3043, 5995, 9756, 6592, 8450, 6801 }, - { 4251, 4844, 7130, 7033, 9742, 10794, 9341, 10350, 10410, 9188, 10907, - 11059, 11547, 12685, 14995, 15511, 13256, 15229, 12788, 13792, 12937, 14179, - 12355, 8519, 7767, 6376, 7293, 7706, 6134, 9392, 9423, 6656 }, - { 5032, 6597, 8267, 6875, 10431, 9182, 11606, 9174, 9394, 10754, 10214, - 11384, 11633, 14256, 11377, 11933, 13999, 14801, 12182, 12170, 12927, 10856, - 13248, 9493, 6586, 7871, 8697, 7094, 8561, 9451, 7116, 4183 }, - { 5550, 6479, 9188, 7562, 9126, 10236, 12984, 11667, 10146, 11981, 13257, - 13227, 14228, 13278, 13571, 15730, 14696, 14740, 14122, 11230, 10186, 9795, - 9766, 9187, 10707, 11612, 10594, 14651, 10618, 5465, 6640, 1085 }, - { 6402, 8472, 7318, 8449, 9884, 8237, 11776, 12579, 8248, 9119, 10813, - 12464, 14087, 14122, 13487, 15884, 15630, 16883, 13968, 15663, 13943, 14099, - 13309, 12222, 11647, 10827, 11813, 9543, 10171, 10991, 8523, 7564 }, - { 5558, 8716, 7398, 7003, 9081, 9234, 10389, 10222, 11602, 10189, 12165, - 10551, 11676, 14110, 
13499, 14107, 14297, 13673, 15239, 13669, 9564, 8809, - 11609, 10482, 11688, 10885, 12257, 11025, 11490, 10586, 12134, 11499 }, - { 5054, 7370, 10001, 8690, 6346, 7990, 10600, 10877, 13977, 14230, 13786, - 11880, 13256, 15455, 14951, 12311, 15970, 16289, 14385, 13318, 10806, 16058, - 14004, 14150, 15275, 14285, 15169, 15124, 14484, 15130, 14320, 13627 }, - { 6472, 6714, 8422, 7520, 9468, 7309, 11310, 10173, 9680, 9775, 11809, - 11641, 17217, 14973, 12511, 12431, 15565, 14706, 12653, 10736, 13799, 11984, - 14576, 14406, 13494, 13775, 13748, 13952, 12627, 13551, 12343, 13637 }, - { 5691, 6196, 6840, 5618, 8130, 5337, 10502, 11764, 12309, 11243, 12058, - 14603, 15254, 13730, 12988, 16426, 16398, 18336, 14653, 12258, 13528, 12015, - 13122, 12816, 13238, 14265, 15564, 14875, 14346, 16501, 14057, 14664 }, - { 5142, 4576, 6578, 5068, 8343, 7665, 11649, 10611, 11541, 10331, 12078, - 14129, 17221, 15930, 16224, 15649, 16231, 11200, 11389, 11572, 13476, 12629, - 11861, 13013, 15114, 12486, 15663, 12735, 13401, 13979, 13507, 13952 }, - { 6851, 5162, 6778, 6922, 8951, 5567, 10360, 9216, 7036, 5410, 10771, - 13577, 12588, 10477, 10248, 14359, 15261, 13795, 12048, 11716, 9361, 6278, - 8997, 10237, 14438, 12459, 12976, 13600, 13892, 11879, 13127, 13802 }, - { 4195, 6070, 3151, 7247, 5889, 6549, 8672, 8715, 10338, 9229, 9026, - 10246, 14651, 14345, 15001, 15116, 18364, 16684, 13657, 14718, 8840, 10437, - 9581, 12367, 11264, 11291, 13002, 11111, 13027, 14172, 12590, 13651 }, - { 3818, 4756, 8879, 6693, 4570, 8158, 7459, 7913, 5727, 9446, 10204, - 8887, 11326, 14337, 13524, 13813, 13628, 15506, 11578, 13470, 12391, 8927, - 9166, 9882, 10411, 11665, 8963, 12141, 11521, 10521, 15132, 15679 }, - { 4425, 8428, 12163, 9947, 3396, 5526, 8133, 4898, 3913, 4891, 5711, - 7034, 10657, 9932, 14435, 12716, 15058, 15501, 14937, 14530, 14536, 9746, - 9923, 11968, 7869, 10734, 9735, 9164, 11842, 12786, 16768, 15073 }, - { 7712, 9515, 10650, 9707, 6201, 9752, 8700, 10334, 9503, 13202, 9555, - 9748, 
12814, 13027, 13920, 12593, 14370, 14808, 13965, 14154, 12735, 7319, - 12721, 10395, 7361, 8678, 12937, 10057, 9234, 14695, 14044, 13613 }, - { 8309, 7528, 9323, 7254, 6829, 7276, 7831, 10824, 8851, 11605, 12763, - 10865, 10153, 10736, 12379, 10799, 10370, 11817, 11734, 13290, 18692, 13378, - 10209, 11690, 12616, 9779, 9257, 6142, 7818, 10903, 13276, 8893 }, - { 5420, 5315, 7529, 7453, 9027, 9825, 7865, 9813, 6673, 6090, 7914, - 10790, 11205, 11064, 9239, 11947, 12306, 12802, 11856, 9896, 10502, 9968, - 12099, 11011, 11103, 9920, 10747, 12477, 10458, 8485, 8805, 10199 }, - { 5275, 2169, 8448, 6454, 8077, 5060, 8189, 6133, 5673, 7424, 7993, - 10659, 10836, 8138, 9347, 10570, 8447, 8359, 11071, 11453, 13480, 9521, - 11755, 8294, 7308, 4637, 10781, 5515, 4843, 4737, 5330, 4893 }, - { 4846, 5401, 5671, 3987, 6910, 8363, 10605, 9189, 9832, 11154, 11632, - 10874, 12377, 9266, 12273, 10543, 10287, 10912, 10745, 9206, 8851, 8327, - 11242, 8123, 7431, 10266, 8947, 6186, 4259, -682, -920, 3901 }, - { 3634, 2920, 4925, 5515, 6626, 6450, 10063, 9047, 9880, 9577, 8277, - 7582, 10044, 10186, 11630, 8182, 12589, 14249, 13236, 11328, 7042, 8880, - 7868, 6442, 10067, 3096, 5190, 5874, 2890, 668, 1718, 2480 }, - { 4732, 2901, 1056, 1878, 5356, 5406, 5212, 8538, 8974, 7742, 9588, - 7933, 10867, 8487, 11203, 8392, 8301, 10070, 4166, 11993, 9436, 10071, - 7464, 7158, 7848, 6669, 4825, 5838, 236, 3720, 562, -1751 }, - { 1899, 3004, 3605, 1918, 2347, 4957, 5010, 5918, 6020, 5972, 7291, - 6820, 8455, 8985, 7833, 5877, 5796, 7048, 5548, 2886, 4467, 10008, - 7443, 8399, 7314, 4277, 3852, 296, -983, 1487, -2474, -7290 } -}; -static int16_t default_ncobmc_krnl_2_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 4140, 3361, 5678, 1978, 3443, 3453, 2905, 2131, 4836, 2612, 1530, - -831, -257, 584, -1193, -391, 107, -47, 32, 125, 282, 684, - 161, 23, -22, -95, 555, -405, 569, -268, -92, 105 }, - { 4680, 4183, 4456, 4730, 4264, 4681, 2310, 2034, 3081, 2493, 2012, - 1397, 1521, -881, -976, -668, -606, -768, 
-273, 256, -4, -290, - 64, -55, -444, -989, -316, -496, 206, -169, -158, -87 }, - { 3199, 3846, 3775, 632, 2359, 3492, 3355, 53, -1201, 145, 263, - -93, -1435, 415, -844, 954, -241, -483, -165, -191, -561, -185, - -300, -258, -154, -654, 308, -64, -36, -150, 95, 146 }, - { 680, 2863, 889, 1721, 3444, 2472, -27, 2458, 816, -186, 123, - 3214, 2029, 2485, -631, 323, 1030, -275, 196, -532, -537, 153, - 274, 61, -453, -283, -533, -1062, -145, -388, 158, 0 }, - { 1962, 4004, 1406, -535, 1315, 2669, 2522, 654, 3394, 4205, 2731, - -40, -118, 599, -511, 618, 162, 840, 43, 253, -59, 222, - 64, -21, -671, -179, 241, 283, 902, 226, 305, -204 }, - { 516, 1205, 3201, -5, 1479, 945, 2129, -628, 3181, 900, 1439, - 1128, 799, -158, -431, 347, -118, 527, 389, 268, -73, 2, - 534, 133, -287, -19, 561, 329, 394, -120, 38, -461 }, - { 2130, 2022, 1966, 210, 447, 402, 1249, 1677, 2353, 1113, 1723, - 1300, 2060, -144, 420, 2008, -417, -74, -197, 135, 217, 310, - 152, 339, -99, -81, 279, 44, 54, -160, -82, 4 }, - { 2134, -1849, -990, -93, 1932, 2119, 2954, -371, -1021, -831, 1662, - 1330, 1634, 246, -777, 852, 130, -67, 191, -316, -429, -240, - -147, -198, 92, -15, 310, 141, -10, 146, 35, 85 }, - { 2763, 4779, 994, 1054, 2625, 2031, 1784, -161, 1142, 1052, 2300, - 2462, 1943, 516, 816, 27, 18, 171, 158, -311, -636, 20, - -463, -235, 145, 339, 240, -354, -110, 41, 404, 353 }, - { 3625, 3557, 2333, 950, 2020, 2445, 2562, 1506, 2571, 1559, 4781, - 2030, 1325, 2507, 2045, 1896, -526, -22, -272, -143, -189, 17, - 10, 405, 143, 414, -95, -229, -215, 0, -347, 83 }, - { 2808, 1062, 1502, 411, 1139, 998, 1577, 1233, 1637, 998, 1846, - 2487, 3868, 2225, 533, -51, -6, -180, -30, 186, -175, 247, - 352, 57, 83, 290, 330, 160, 165, 354, -465, 131 }, - { 2809, 2966, 2929, 1435, 2875, 1948, 130, 1168, 252, 1276, 2838, - 3507, 3001, 1410, 312, 1941, -336, -431, -190, -194, -130, -336, - 238, 75, -472, -189, 123, 61, -583, 147, 305, 200 }, - { -23, 2306, 2169, 33, 1848, 1832, 2721, 49, 1435, 585, 1036, - 
2116, 1658, 1011, 815, 920, 101, 108, 262, 299, 283, 357, - 268, 141, -71, -285, 205, 142, -71, 224, 252, 156 }, - { 1447, 2625, 4643, 2096, -847, -154, 2876, 1050, 104, -873, -327, - 146, -596, 622, -337, 1317, -61, 9, -201, 110, 90, 644, - 337, 204, 155, 278, 320, -306, -504, 357, -108, 132 }, - { -16, 2815, 1344, -2044, 2236, -549, 586, 409, 30, 152, 1588, - 243, -115, 291, -30, -170, -96, -10, 433, 205, -134, 17, - 528, -16, -22, -198, -43, -143, -224, 270, 153, 37 }, - { 1478, 829, 628, 1055, 1323, -406, -282, -12, 418, 40, -795, - -286, -627, -41, -448, 454, -267, -258, -129, -57, -44, -406, - -260, -67, 134, -196, -236, -125, 35, -62, -137, -5 }, - { 220, 26, -380, -257, -90, -453, -196, -56, -193, 37, 131, - 151, -88, -695, 66, -113, -200, -144, 132, -48, -244, -207, - -178, 268, -107, -1, 69, 337, -84, -197, 87, 119 }, - { 7, 3, -85, -185, 334, -86, -69, 152, -320, -239, 587, - 415, 246, 290, -146, -134, -9, -69, -66, -148, -41, -206, - -148, 283, -144, -287, -73, 93, -23, 247, 398, 174 }, - { 46, -256, -114, -61, -532, 103, 32, -223, 24, -20, 132, - 339, 61, -381, -711, -160, -200, -334, 78, 173, -281, -139, - -27, 134, -120, 96, 110, -251, -114, -32, -299, -183 }, - { -193, 28, -134, 200, 155, -316, -363, 285, 268, 665, 233, - -127, 436, -20, -536, -163, 51, -40, 162, 78, -27, 192, - -34, -40, -17, -205, 203, 106, -62, -211, -84, 60 }, - { -440, 312, -195, 221, 251, -388, -116, -252, -101, 92, -244, - -694, -27, 198, -3, 255, -257, -17, 0, 143, -20, 48, - -68, 110, -130, -340, 136, -45, -138, 251, -111, -2 }, - { 325, 219, -68, 215, -177, -206, 14, 108, -291, 211, 92, - -62, -166, -218, -158, -220, -279, 199, 113, -263, 271, 153, - -433, -16, 19, -322, -28, 258, -295, -300, -285, -123 }, - { -345, 543, 356, -541, -726, -205, -332, -397, -10, -132, 232, - 132, 308, 324, 229, 79, -151, 161, 143, -40, -144, -464, - 32, -364, -11, -99, -285, 61, -258, 182, -28, 107 }, - { -55, 70, -78, -269, -709, -52, 351, 94, 80, 268, 249, - -56, 189, -191, -60, -88, 15, 
-205, 111, -62, 21, 85, - 77, -107, -35, -13, -107, -472, -546, -197, 5, 115 }, - { -363, -297, 246, -84, -419, -230, 283, -128, 34, -27, 112, - 125, 166, 163, 176, -422, 14, -238, -80, -153, 313, -366, - -208, -54, -260, 48, -176, 21, -91, -295, -270, 40 }, - { 85, 242, 107, -41, -283, -390, -105, 360, 181, -720, -582, - 27, -96, -350, -217, -189, -135, -12, 280, 86, 3, 25, - -126, -213, -384, 41, -15, 101, -68, 143, -211, 86 }, - { -183, 13, 274, -46, -86, -633, 181, -232, -90, -106, -22, - 332, -12, -16, -30, 87, 5, 46, 37, -99, 27, 292, - -74, -94, -237, -16, -145, 76, -106, 227, -52, 168 }, - { 40, -258, -140, -6, 203, 146, -64, -88, -183, 221, 62, - 67, 114, -216, -307, -560, -197, -46, 149, -126, -120, -316, - -36, -227, -200, 115, -41, -51, 97, 123, -47, 103 }, - { -51, 44, -99, -230, -156, -46, -145, -412, -56, 48, -239, - 222, 83, -339, -196, -64, 175, 149, -140, -316, 6, -62, - -27, -56, -21, -269, 229, -7, 122, -18, -129, 86 }, - { -372, 106, 18, 172, 364, 19, -245, -73, -124, 164, -9, - 14, 214, -67, -217, -175, -45, 119, -194, 36, 18, -83, - 126, 196, 112, -297, -102, 104, -74, -152, 19, 199 }, - { 314, 81, -49, -188, 48, -82, -4, 107, -221, -4, 207, - -245, 197, -37, -185, -50, -56, -214, 100, -231, -31, -2, - 21, -53, -215, -77, 168, -23, 82, 5, 155, 169 }, - { 258, 188, -27, -27, 165, 29, -17, 100, -27, -80, -80, - 196, 23, -391, -533, -171, 84, -137, 0, 14, 251, 99, - 35, 88, -28, 1, 144, -96, -235, 176, 103, -85 } -}; -static int16_t default_ncobmc_krnl_2_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 5724, 6155, 5101, 6937, 3616, 3940, 3066, 5662, 7104, 5021, 4979, - 5907, 4968, 7085, 6582, 7719, 9143, 4128, 6447, 4879, 7061, 11362, - 7837, 9965, 7152, 6477, 6581, 5803, 1819, 5309, 8559, 10776 }, - { 1775, 3231, 4026, 2629, 4438, 6309, 5114, 2895, 5657, 6541, 6734, - 5994, 7468, 4555, 9911, 5200, 5402, 1698, 4298, 6112, 6417, 6691, - 4816, 6195, 4139, 5856, 3358, 1993, 1542, 661, 1660, 4762 }, - { 1953, 726, 336, 2519, 4189, -753, 2993, 4957, 5850, 4298, 
3651, - 5353, 3255, 5491, 7815, 3406, 3928, 2987, 4148, 4276, 3530, 8058, - 5079, 5821, 4622, 3354, 3146, 2460, 489, 1550, 1587, 1399 }, - { -801, 328, 103, 886, 1381, 2280, 4320, 2452, 1215, 6261, 2206, - 4849, 4488, 3829, 6128, 5213, 1739, 3173, 4425, 4567, 5845, 5197, - 5910, 6147, 4260, 3730, 4240, 5420, 307, 672, 963, 3278 }, - { -1721, -2596, -155, 3029, 3428, 2390, 2321, 3757, 1383, -1283, -1621, - 1418, 2475, 4188, 5570, 3575, 799, 4017, 2856, 1426, 2012, 2722, - 3669, 4104, 3800, 4116, 3275, 3739, 326, 95, 2421, 3075 }, - { -551, -927, -520, 2944, 2518, -722, -215, 1875, 137, 2182, 2761, - 159, 762, 3693, 1681, 2600, 880, 3273, 4470, 5007, 4272, 3074, - 2474, 4254, 6828, 4219, 3671, 2407, 1044, 129, -478, 2814 }, - { -2686, -1229, 1372, 4761, 4668, 1462, 509, 2727, 930, 2438, 3542, - 1456, 1961, 541, 1063, 1426, 3603, 2873, 2412, 2999, 2101, 3739, - 2385, 5494, 5444, 5655, 5034, 381, 321, 90, 2585, 4160 }, - { -4203, 479, 1122, 2688, 2124, 942, -2136, -1643, -491, 2581, -2155, - -2375, 559, 582, 2202, 2081, 3774, 3330, 1101, 894, 3410, 3691, - 2509, 5195, 6226, 5471, 5022, 2525, 778, 1212, 2736, 3350 }, - { -2415, -2903, 4719, 5860, 4006, 2692, 4035, 4143, 2498, 4377, 2058, - 488, 1429, 3199, -11, 2009, 2087, 2903, 155, 522, 4521, 2221, - 2310, 3124, 2870, 1941, 3262, 2258, 1515, 2257, 1584, 1048 }, - { -1469, -2652, -561, 2135, 389, -522, -589, 447, -847, 268, -1641, - -1540, -1513, -1334, -599, -581, 2848, 2828, 1416, 2157, 2198, 925, - 2421, 1437, 1963, 369, 2195, -548, 2051, 868, 824, 2683 }, - { -2620, -3631, -4548, -885, 629, 523, -528, -2178, -1743, 1644, 353, - -2687, -3041, -1722, 283, 178, 1594, 1190, 968, -386, 2305, 1317, - 245, 1443, 968, 800, 471, 521, 1564, 669, 903, 243 }, - { -1791, -3282, -4140, -1753, -1006, -374, 1027, -176, -1477, -891, 191, - -912, 497, 96, 359, 1045, 1467, 172, 1303, 2510, 3516, 3671, - 789, -807, 2670, 1483, 547, -521, -1219, -1856, 1008, 1053 }, - { -1427, -2698, -3949, -436, 801, -614, -1548, 523, -176, -683, 423, 
- -871, 820, -2279, -143, 375, 768, 2306, 5249, 1302, -338, -396, - -1590, -608, 1469, 2344, -187, -693, 599, -661, -458, 160 }, - { -3491, -3877, -2952, 1252, 767, -3037, -3638, 188, 587, 710, 1416, - 1176, -319, -473, 1873, -1997, 725, 596, -94, 1875, 2992, -519, - -139, 1938, 1025, 521, 760, 1090, 3648, 392, 564, 902 }, - { -2186, -3264, -1742, 2634, -36, -51, -1253, -314, -908, -459, -1701, - -1437, -991, 84, 1265, -964, 402, 1454, -772, -927, 1765, 1543, - 484, 2346, 3310, 1887, 1754, 3058, 1474, 728, -466, -1646 }, - { -1826, -332, 48, 744, -618, -97, -165, -155, -908, -143, 1285, - 1739, 1185, 885, 1134, -531, -15, -526, 543, 1438, 2026, 3022, - 558, 1827, -139, 1792, 2022, 769, 2400, 444, -1572, 598 }, - { 165, -357, 15, 666, 1315, 1155, 376, -7, 991, 213, 1687, - -34, 452, 352, 203, 1605, 1484, -498, 581, 533, 467, 1744, - 1315, 874, 82, 900, 1437, -692, -417, 456, -271, -1132 }, - { 646, 210, 320, 1208, 145, 971, 396, -448, 557, 1876, -1791, - 913, -1288, -452, 1015, 925, -1197, -49, -285, 442, 1093, -410, - 125, 519, -52, 513, 1497, -1337, 298, -402, 820, 732 }, - { -796, 627, -1017, 2972, 4463, 2331, 1387, 1496, 1796, 1608, 1681, - -877, 881, -160, -581, -433, 949, 471, 307, 140, -946, -597, - 247, 650, 1143, 694, 10, -682, 890, 409, 617, 810 }, - { 1653, 4435, 2388, 294, 2578, 1229, 1072, 1871, 465, 1650, 1524, - -430, -1195, -3427, -116, 1117, 217, 967, -254, 259, -55, 1425, - 1583, -1261, -1773, 1232, 2886, 646, 1346, 1518, 2090, -837 }, - { 2020, 728, 2038, 316, 5725, 4193, 890, 1490, 584, 2705, 694, - -892, 34, 2041, 972, 332, -295, -218, -756, 2193, 1672, 1440, - 2310, -2136, -2204, 399, -753, 743, 3155, 2521, 3534, 166 }, - { 824, 1664, 991, 853, 700, -80, 148, -908, -194, -620, 1053, - -368, 1616, 1250, 1449, 3140, -1065, 286, 2226, -590, -570, -1131, - 477, -61, -708, 519, 586, 1148, 898, 1653, 4697, 1581 }, - { 2014, 1921, -210, 556, 686, -561, -1239, -1345, -664, -138, -215, - -343, 1019, 1294, 519, -179, 212, -299, -2160, -1450, -329, 293, - 
691, 162, -645, 1079, 2005, 1466, 1127, 2263, 730, 179 }, - { 5629, 4670, 597, 2030, 3873, 3698, 54, 2714, 62, 352, 2177, - 908, 1306, 1504, 1464, -288, -106, -69, -179, -900, -1340, -4, - 877, 487, 2606, 358, 2055, 1131, 1421, 931, -477, 1173 }, - { 757, -493, 1510, 2513, 4514, 4649, -478, 2069, 124, -1186, 2855, - 1906, 1420, 1738, 19, 1916, 1195, -519, 32, 512, 230, 528, - 43, -263, 1314, 1350, 137, -256, 939, 256, 168, -201 }, - { 663, 947, 699, 3239, 4730, 5279, 1739, 1659, 2774, -1660, -1677, - 185, 3745, 1319, 2347, 477, 364, 531, 608, -520, -783, -123, - -59, -345, 1202, 1766, 88, 883, 654, 1399, -1082, 658 }, - { 4534, 5694, 5332, 4909, 4828, 4761, 7376, 3834, 2327, 4737, 7135, - 5306, 6337, 5240, 5578, 4321, 2107, -205, 1387, 597, 1112, 904, - 1567, 610, 461, 371, 250, 602, 358, 1807, -617, -59 }, - { 6124, 8363, 9624, 5674, 7043, 4437, 3846, 3121, 3477, 2818, 5445, - 3618, 5067, 3996, 5759, 7185, 2150, 785, 1581, 2084, 3321, 4828, - -545, 510, 2309, 2501, 1594, 2028, 528, 113, 248, 550 }, - { 8154, 9890, 6292, 6421, 8295, 4403, 7503, 5496, 7256, 3699, 2845, - 3725, 5365, 5905, 7170, 2903, 733, 4614, 3856, 4346, 7099, -902, - -1492, 1703, 2321, 1842, 3488, 1690, 982, 524, -467, -687 }, - { 5338, 10331, 7754, 7014, 3581, 5660, 5471, 5420, 3976, 2548, 6486, - 9144, 6584, 5442, 6795, 4845, 5182, 2855, 8246, 3660, 5417, 1845, - 1803, 288, 1434, 639, 1404, 2752, 923, 1055, 741, -984 }, - { 4457, 7110, 5195, 5959, 6818, 8562, 5548, 2071, 5544, 8734, 7080, - 4737, 9481, 7672, 8374, 7638, 4204, 3562, 3758, 3598, 5016, 2863, - 3927, 5001, 4677, 4444, 2481, 1773, 2525, 3142, 4840, 3965 }, - { 1134, 3249, 4702, 5483, 4471, 7234, 7281, 6240, 5891, 7577, 3826, - 5886, 4798, 7117, 6319, 7264, 4115, 5613, 4674, 4999, 4518, 2501, - 6830, 4913, 2356, 789, 1926, 2190, 1914, 1434, 987, 1761 } -}; -static int16_t default_ncobmc_krnl_2_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 6131, 7769, 6548, 6297, 4967, 4708, 3127, 5937, 697, 748, 1850, - 2290, 2945, -80, 216, 377, 318, 1009, 
2112, 2962, -886, 849, - 510, 4160, 2257, 2875, 4589, 5345, 7363, 5350, 6815, 1644 }, - { 6949, 8044, 7295, 7318, 3142, 2084, 1819, 3048, 1654, 1831, 1344, - 3344, 2065, 2889, -88, 3746, 696, 1143, 232, 1444, 1587, 4125, - 3991, 3840, 5642, 4933, 3560, 6540, 5865, 6663, 6729, 5520 }, - { 7816, 4894, 7089, 7533, 4271, 6814, 1972, 3845, 3755, 3498, 3571, - 1884, 3171, 1843, 70, 2358, 2622, 1241, 143, 2657, 3804, 2968, - 1781, 262, 2864, 4345, 1302, 5434, 7815, 10560, 9211, 8202 }, - { 10656, 7490, 8639, 7975, 4318, 7432, 6148, 3321, 3776, 2781, 3544, - 246, 2350, 793, 1600, 1266, 2372, -1382, -983, 1926, 493, 447, - 2275, 3510, 4789, 3766, 878, 2353, 3314, 6282, 5853, 3709 }, - { 11083, 7270, 6211, 6170, 4927, 4198, 3939, 4605, 1734, 2009, 2950, - 546, 722, 99, 550, 597, 2350, 41, 1314, 1148, -183, 1143, - 5392, 3550, 3102, 1161, -556, 1700, 7598, 8412, 6019, 9654 }, - { 10358, 7350, 6589, 5975, 3587, 6201, 4603, 3974, 2262, 886, 1815, - 1899, 1642, 2894, 1557, 228, 1625, 1879, 838, 182, 919, 1168, - 3272, 1155, 889, 2292, 128, 4478, 5205, 7668, 8767, 10921 }, - { 8569, 4702, 5397, 5147, 2577, 4301, 2139, 1630, 721, 1721, -218, - 1595, 275, 1133, 1051, -777, 1556, -245, 972, 106, 2205, 385, - 1410, 366, 3348, 2139, -164, 3111, 2656, 5036, 6021, 4847 }, - { 7654, 5535, 5975, 4580, 3005, 5483, 4637, 5560, 6252, 4946, 4508, - 3600, 1824, 1528, 338, 131, 1290, 309, 344, 3110, 3607, 2484, - 1062, 1267, 1426, -860, 1155, 6137, 2415, 5482, 6846, 4916 }, - { 8060, 5296, 4396, 2040, 867, 1189, 3555, 3397, 3438, 664, -1931, - -1938, -1414, 1317, 762, -312, -655, -801, -243, 2795, 1663, 1314, - 1478, 2856, 562, 1075, 3211, 7482, 2988, 3880, 4156, 3289 }, - { 8146, 7596, 7056, 7622, 5755, 7181, 7862, 4736, 4932, 3146, 1043, - -422, -813, -2152, 1444, 441, 3599, 395, 2173, 755, 4245, 3047, - 1545, 1062, 1159, 1621, 209, 6521, 7385, 7730, 6511, 8959 }, - { 9567, 8044, 7535, 6969, 3284, 4284, 4734, 4758, 5177, 2342, 230, - -1852, -839, -769, 222, 255, -315, -16, 1101, -28, 3561, 
2004, - -260, 789, 1856, 1960, 4962, 4207, 2425, 8406, 6771, 7796 }, - { 8019, 7612, 8357, 5521, 4711, 3374, 4391, 7093, 5013, 3608, 238, - -1564, -1662, -1373, -198, -1045, 100, 2694, 1251, 489, 2110, 1670, - 188, -1362, 953, 2340, 3361, 3595, 6405, 7676, 1634, 7730 }, - { 10177, 6488, 5822, 5121, 2615, 2725, 3372, 4849, 2232, 2548, 2841, - 874, 895, 307, 1293, -150, 411, -981, -815, -24, 936, -2339, - 254, 3019, 5892, 4302, -2171, 6747, 7198, 5638, 4832, 9538 }, - { 7260, 9945, 2818, 1106, 6179, 6331, 5106, 1814, 5997, 4045, 1456, - -230, 297, 1045, 1918, -126, 752, 1014, 999, -506, 198, -732, - -1900, 139, 749, 3999, 5614, 5241, 6339, 8316, 3673, 7681 }, - { 11101, 6954, 7475, 5729, 4242, 6118, 4569, 2348, 5307, 3762, 2933, - -1610, 988, -1178, -104, -151, -507, 491, -906, 1236, 3075, 1525, - 1631, 2901, 2758, 1303, 1578, 6405, 3807, 7189, 8468, 9262 }, - { 6835, 4602, 5501, 5568, 4338, 6143, 4304, 3557, 3258, 3797, 1242, - 968, 1683, -251, 1218, 301, 1257, 1924, 985, 1251, 3051, 433, - 1756, 167, -660, 3884, 3450, 7202, 6544, 5184, 7556, 9366 }, - { 5991, 6762, 3854, 4856, 6714, 5701, 4072, 2489, 422, -365, 1488, - 1660, 725, 1157, -778, 654, 313, -18, 3162, 3065, 2925, 2391, - 827, 5547, 461, 2487, 1492, 5810, 7042, 5284, 3995, 6870 }, - { 6435, 8283, 4732, 5896, 5599, 4229, 4798, 3309, 3128, 941, 2565, - 394, 257, 2477, 721, 1494, 3161, 1409, 1306, 2534, 1261, 2719, - 756, 4388, 570, 5416, 3719, 6067, 4092, 2565, 6299, 10504 }, - { 6042, 7417, 5391, 4671, 3245, 7547, 3777, 3203, 2044, 583, 2083, - 1971, 1721, 1948, -169, 1197, -1141, -480, 2155, 1033, 1313, 268, - 1857, 4493, 3083, 2005, 5347, 4397, 10144, 4828, 6622, 9817 }, - { 7202, 5045, 6601, 6937, 3704, 5796, 5061, 3575, 2383, 1389, 3111, - 1751, 1603, 2813, 174, 706, -569, 2620, 1735, 1418, 1871, -1542, - 168, 2156, 5107, 6329, 4968, 7018, 6279, 6864, 5898, 9157 }, - { 5722, 5683, 4189, 4814, 2883, 5508, 5100, 1625, 2169, 3680, 1884, - 2109, 462, 1145, 334, 515, 191, 441, 1058, 917, 1528, -96, - 1843, 
5395, 4498, 5681, 4193, 5196, 8356, 5303, 7262, 10141 }, - { 5879, 5779, 7257, 3873, 6911, 6238, 5672, 3583, 3261, 3048, 2536, - -310, -1046, -69, -660, 417, -719, -2058, 1740, 888, 2746, 1367, - 1668, 1090, 1830, 1153, 5047, 7336, 3380, 7160, 4422, 9401 }, - { 7809, 7945, 8385, 8535, 7803, 3953, 5065, 3185, 2013, 1659, 1648, - 769, 292, -135, 114, -579, 713, 1407, -1181, 1569, 3525, 5630, - 219, 3518, 3739, 3432, 7282, 6357, 619, 5779, 10116, 6448 }, - { 9496, 7224, 5342, 5960, 5092, 4225, 4353, 3995, 3631, 1662, 1413, - 762, 534, 126, -551, -1025, 2327, 602, -452, 1285, 2103, 2579, - -1369, 2724, 6353, 3925, 4631, 9139, 4974, 6630, 7755, 4125 }, - { 5226, 7729, 5768, 5815, 4531, 2948, 3029, 2603, 2549, 1366, 119, - 405, 21, -1831, -327, -287, -415, -1317, -214, 3017, 1586, 2436, - 868, 1094, 290, 668, 2117, 756, 1228, 2700, 5743, 8052 }, - { 6262, 5531, 4454, 4616, 3913, 2022, 4240, 2241, 4201, 2506, 1810, - 628, -496, -779, -471, 394, 756, 1666, -445, 490, 575, -478, - 894, 1182, 822, 626, 1782, 1781, 5333, 5482, 1760, 8187 }, - { 6488, 6875, 4960, 6837, 4564, 1871, 390, 2940, 4330, 1634, 131, - -1102, -1451, -928, -1067, -419, -614, -2, 1017, 1066, 1051, 917, - 1097, 844, 465, 513, 2377, 1031, 3548, 5088, 4516, 10564 }, - { 6497, 6047, 5649, 7156, 4974, 3683, 2875, 4421, 1502, 1244, 668, - -30, -1465, -59, -399, -721, 954, -281, -2, 664, 1039, 814, - 758, 1911, 319, 4247, 1848, 1606, 2536, 2189, 1372, 7759 }, - { 5994, 5659, 6777, 6693, 4758, 2986, 1463, 1186, 2116, -166, 499, - 73, -1151, -164, 279, -895, -169, 339, 1194, 1772, 752, 1649, - 1696, -2615, 1581, 1740, 1789, 1832, 1899, 510, 2135, 7149 }, - { 9107, 4250, 5418, 4334, 613, 2618, 3395, 4809, 1724, 873, -78, - -1146, -431, -547, -1104, -1128, -6, -290, 945, 794, 564, 1670, - 737, 4540, 1574, 6285, 2596, 2859, 1191, 1428, 5614, 8419 }, - { 5905, 4490, 6470, 3636, 2119, 1731, 3532, 2461, 2391, 473, 176, - -562, 389, -1300, -916, -1436, 371, 567, 1038, 866, 59, 195, - 679, -721, 2994, 3260, 1813, 1589, 
850, 1982, 7410, 11546 }, - { 7265, 8775, 6672, 6657, 6182, 3732, 3222, 4564, 2644, 790, 924, - -596, 628, -681, -57, -236, 103, 364, 603, 1420, 309, 787, - 1257, 770, 2453, 3401, 1175, 434, 792, 4019, 8792, 11773 } -}; -static int16_t default_ncobmc_krnl_2_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = { - { 391, -894, -939, 1155, 4362, 4297, 7296, 2684, 3758, 8010, 8044, - 9041, 8748, 8816, 10796, 8701, 6840, 11306, 7814, 8456, 9952, 3511, - 7870, 2227, 7018, 7148, 4672, 5660, 6657, 6007, 1098, 3866 }, - { 2970, 945, 619, 1701, 4540, 3326, 7140, 8401, 6001, 5524, 6311, - 5657, 5333, 9833, 7547, 8127, 10894, 14326, 12130, 8591, 8408, 5873, - 7524, 6398, 7054, 6594, 9788, 8347, 8784, 9253, 8154, 6170 }, - { 3423, 6928, 5192, 5699, 5575, 6852, 8083, 7546, 8019, 8464, 8910, - 9251, 11401, 8637, 9356, 9671, 10065, 12652, 12275, 9662, 9627, 5550, - 9836, 10565, 9075, 9350, 11656, 8549, 8120, 4437, 5501, 6658 }, - { 5859, 5714, 6766, 5830, 7266, 4208, 5956, 8173, 10615, 7557, 10533, - 8101, 7530, 9292, 9312, 9603, 11268, 14896, 12761, 10435, 10584, 10602, - 7945, 6677, 7798, 9184, 11805, 9688, 12921, 9831, 9425, 9409 }, - { 5068, 7732, 8953, 7750, 6739, 7145, 7635, 7400, 9896, 11465, 12344, - 14483, 13309, 11497, 10778, 11614, 13096, 11519, 12197, 13573, 14652, 12324, - 7270, 8764, 10162, 11289, 13446, 10681, 7564, 7663, 7650, 3879 }, - { 6073, 8775, 7134, 7485, 8815, 9982, 9893, 11182, 10807, 12415, 10385, - 13211, 13198, 9974, 13590, 13229, 14029, 10733, 10710, 10950, 11286, 12150, - 10133, 10858, 8958, 9903, 12033, 9177, 9756, 8710, 8055, 3108 }, - { 8368, 10916, 7650, 6261, 8713, 10236, 12507, 10373, 12385, 11135, 11343, - 12039, 12114, 14871, 13861, 13742, 11649, 13839, 13207, 13160, 11863, 11950, - 12423, 10188, 7712, 8705, 11270, 12864, 13370, 11422, 7881, 7390 }, - { 10805, 12233, 10301, 9238, 9352, 7871, 10959, 12870, 11641, 9692, 12373, - 13839, 12380, 14055, 14653, 13348, 11227, 12844, 14769, 12714, 9815, 10484, - 12966, 10123, 8644, 11791, 9911, 7598, 13225, 9539, 6774, 
8055 }, - { 7987, 9257, 6281, 7446, 8911, 10506, 7039, 9031, 9319, 10294, 13979, - 15391, 14445, 11372, 14852, 14690, 14954, 14129, 16319, 13385, 10855, 12837, - 13065, 10647, 12815, 13043, 9686, 7003, 12028, 10211, 10237, 11699 }, - { 6073, 7893, 7571, 5698, 8244, 7305, 6581, 9719, 9746, 11432, 12215, - 16346, 17408, 17379, 13508, 14637, 10471, 13204, 13089, 13632, 10135, 12397, - 12431, 13511, 13140, 13999, 14081, 10639, 7173, 7807, 9433, 4659 }, - { 6634, 10941, 11920, 9920, 11356, 10608, 10624, 12593, 11330, 11413, 13971, - 18455, 16400, 16654, 15373, 16023, 15144, 15413, 14357, 16626, 10718, 12841, - 16053, 14104, 13496, 13334, 10605, 11490, 12221, 6956, 9178, 8213 }, - { 7366, 9121, 9253, 11198, 9839, 11458, 10864, 8319, 12656, 12437, 13128, - 15378, 14565, 16278, 15940, 14457, 15156, 13972, 14035, 13587, 10888, 11376, - 15176, 18483, 13236, 12754, 12347, 13247, 11785, 10432, 13455, 7419 }, - { 7665, 10318, 12372, 11702, 11166, 12470, 11859, 10983, 12921, 13947, 12106, - 14300, 13037, 17367, 14444, 15259, 15107, 14974, 11715, 14835, 15525, 18775, - 17479, 13835, 9101, 10034, 18554, 10201, 8666, 11181, 11767, 6530 }, - { 11169, 7696, 11879, 11938, 10302, 13271, 12067, 13360, 9715, 12528, 13879, - 15312, 17012, 15194, 12951, 17211, 14989, 14796, 15695, 14942, 13140, 17003, - 18104, 14131, 14490, 11607, 9697, 10346, 6890, 7337, 12248, 7668 }, - { 7494, 9902, 9327, 10081, 9955, 10895, 12521, 13971, 11975, 12950, 13579, - 19214, 16537, 17208, 15292, 17698, 16633, 14485, 17676, 15920, 11698, 13314, - 13747, 11163, 10360, 13396, 13119, 7073, 11331, 8217, 8258, 8754 }, - { 9934, 11319, 10239, 9047, 11387, 10784, 12566, 13038, 13663, 12717, 14675, - 14008, 14178, 15820, 14510, 16181, 15440, 15283, 15009, 13767, 11372, 13359, - 14352, 14480, 17066, 10914, 11175, 8554, 7428, 10827, 10561, 6443 }, - { 10016, 9986, 12912, 11133, 8475, 9995, 12150, 14006, 15182, 16531, 13117, - 14634, 15313, 15598, 16928, 14269, 14814, 17080, 12532, 12849, 13261, 12479, - 14442, 9716, 
15960, 13029, 13398, 10927, 9854, 10849, 12580, 10547 }, - { 9295, 7913, 11422, 9455, 10319, 11278, 11274, 13394, 13038, 13821, 15044, - 14686, 17187, 14091, 14823, 14137, 14455, 15111, 15447, 13582, 14076, 14295, - 15643, 11185, 16015, 10747, 11235, 11551, 12009, 13990, 8881, 5003 }, - { 11095, 8615, 12138, 8821, 9239, 6419, 11207, 11937, 12556, 14236, 12501, - 14976, 13740, 15006, 17876, 15826, 16800, 16761, 13880, 15072, 16296, 16857, - 14333, 11125, 12310, 13605, 10932, 12928, 5472, 11185, 9435, 5957 }, - { 7725, 6887, 7535, 8957, 9967, 9700, 10640, 10680, 13275, 12682, 11517, - 15207, 15552, 17018, 16856, 14725, 16692, 12845, 14748, 14656, 14606, 16310, - 14672, 15510, 13069, 9039, 8315, 8606, 8826, 8214, 8487, 7999 }, - { 9071, 9686, 10375, 11046, 7539, 7106, 10540, 13531, 13747, 9927, 14071, - 15876, 15935, 13026, 15104, 15296, 16773, 16198, 16098, 13165, 13227, 15002, - 12319, 13015, 14240, 10673, 12818, 10497, 5016, 8298, 5706, 6088 }, - { 9366, 8741, 8215, 11450, 8961, 10464, 10575, 13631, 13635, 13752, 12735, - 17169, 16010, 15438, 15786, 13083, 18481, 17990, 12316, 16370, 13953, 16000, - 14693, 15392, 15242, 15049, 10809, 7658, 12399, 7866, 7570, 5544 }, - { 6903, 5972, 7864, 7864, 8655, 13231, 12904, 14949, 15064, 15007, 14738, - 15847, 14769, 14910, 15543, 17103, 15630, 15115, 19594, 16319, 13352, 10936, - 15453, 13064, 13305, 12008, 7408, 8514, 14898, 8171, 5583, 9657 }, - { 1309, 4431, 10551, 8701, 8152, 8547, 11642, 9601, 12635, 14116, 12560, - 14796, 14370, 14959, 15558, 17801, 14148, 16067, 16927, 16084, 15633, 13749, - 16805, 13274, 7467, 12136, 9815, 6584, 10514, 9020, 9109, 10981 }, - { 10778, 9464, 8877, 8157, 7779, 9056, 13584, 11871, 13714, 16259, 13305, - 13956, 14785, 16328, 16541, 15199, 15586, 18478, 16668, 13019, 14279, 13814, - 15684, 15613, 15050, 14345, 14327, 15869, 14316, 13744, 10738, 8497 }, - { 9411, 9691, 11139, 8582, 8038, 9492, 10534, 12154, 9249, 16286, 16839, - 15572, 13252, 16207, 14760, 15743, 15428, 14223, 15971, 16378, 
16607, 16993, - 15698, 15766, 14771, 13969, 14551, 13631, 10451, 9360, 15908, 7460 }, - { 5565, 3814, 5832, 4698, 7091, 10412, 8442, 9852, 9831, 10137, 9167, - 11864, 11520, 12092, 11930, 12431, 14914, 16568, 13978, 14847, 14215, 14290, - 13812, 15033, 15711, 15541, 13908, 14681, 12577, 9266, 12542, 5718 }, - { 3740, 2245, 1259, 3575, 4190, 8150, 9742, 8948, 11592, 12108, 10225, - 12748, 12684, 12687, 11339, 10475, 13481, 15937, 14669, 13780, 12167, 11074, - 16225, 14201, 13966, 9544, 12974, 12797, 13248, 13990, 14819, 7995 }, - { 2296, 817, 3435, 3505, 3507, 9072, 7580, 10139, 7087, 12821, 13297, - 12396, 12113, 10999, 9149, 14466, 15677, 11290, 11487, 10612, 8552, 15725, - 16233, 17367, 12511, 13088, 10898, 12875, 13386, 15384, 14845, 9849 }, - { 2320, 1714, 3209, 4858, 11853, 8126, 7775, 6246, 10834, 12812, 9996, - 8379, 10020, 11558, 10914, 12851, 11272, 13723, 7409, 11919, 10393, 12987, - 13756, 11382, 13258, 9754, 12513, 10697, 14356, 14065, 10023, 8748 }, - { 5715, 4721, 4773, 6968, 7426, 6196, 7322, 11771, 8704, 7198, 8944, - 12478, 6336, 10064, 9132, 10252, 11884, 12483, 11504, 12168, 11346, 13354, - 11779, 12178, 8942, 8770, 11937, 13047, 12938, 11277, 4002, 710 }, - { 7743, 4184, 5058, 4276, 5576, 5393, 5919, 5500, 7881, 8102, 11726, - 10912, 10943, 10344, 10654, 9537, 12118, 10565, 11112, 9964, 11328, 13005, - 8273, 10626, 11596, 12198, 13157, 13884, 13912, 10737, 6497, 2938 } -}; - -void get_default_ncobmc_kernels(AV1_COMMON *cm) { - av1_copy(cm->ncobmc_kernels[0][0].KERNEL[0], default_ncobmc_krnl_0_0_0); - av1_copy(cm->ncobmc_kernels[0][0].KERNEL[1], default_ncobmc_krnl_0_0_1); - av1_copy(cm->ncobmc_kernels[0][0].KERNEL[2], default_ncobmc_krnl_0_0_2); - av1_copy(cm->ncobmc_kernels[0][0].KERNEL[3], default_ncobmc_krnl_0_0_3); - av1_copy(cm->ncobmc_kernels[0][1].KERNEL[0], default_ncobmc_krnl_0_1_0); - av1_copy(cm->ncobmc_kernels[0][1].KERNEL[1], default_ncobmc_krnl_0_1_1); - av1_copy(cm->ncobmc_kernels[0][1].KERNEL[2], default_ncobmc_krnl_0_1_2); - 
av1_copy(cm->ncobmc_kernels[0][1].KERNEL[3], default_ncobmc_krnl_0_1_3); - av1_copy(cm->ncobmc_kernels[1][0].KERNEL[0], default_ncobmc_krnl_1_0_0); - av1_copy(cm->ncobmc_kernels[1][0].KERNEL[1], default_ncobmc_krnl_1_0_1); - av1_copy(cm->ncobmc_kernels[1][0].KERNEL[2], default_ncobmc_krnl_1_0_2); - av1_copy(cm->ncobmc_kernels[1][0].KERNEL[3], default_ncobmc_krnl_1_0_3); - av1_copy(cm->ncobmc_kernels[1][1].KERNEL[0], default_ncobmc_krnl_1_1_0); - av1_copy(cm->ncobmc_kernels[1][1].KERNEL[1], default_ncobmc_krnl_1_1_1); - av1_copy(cm->ncobmc_kernels[1][1].KERNEL[2], default_ncobmc_krnl_1_1_2); - av1_copy(cm->ncobmc_kernels[1][1].KERNEL[3], default_ncobmc_krnl_1_1_3); - av1_copy(cm->ncobmc_kernels[2][0].KERNEL[0], default_ncobmc_krnl_2_0_0); - av1_copy(cm->ncobmc_kernels[2][0].KERNEL[1], default_ncobmc_krnl_2_0_1); - av1_copy(cm->ncobmc_kernels[2][0].KERNEL[2], default_ncobmc_krnl_2_0_2); - av1_copy(cm->ncobmc_kernels[2][0].KERNEL[3], default_ncobmc_krnl_2_0_3); - av1_copy(cm->ncobmc_kernels[2][1].KERNEL[0], default_ncobmc_krnl_2_1_0); - av1_copy(cm->ncobmc_kernels[2][1].KERNEL[1], default_ncobmc_krnl_2_1_1); - av1_copy(cm->ncobmc_kernels[2][1].KERNEL[2], default_ncobmc_krnl_2_1_2); - av1_copy(cm->ncobmc_kernels[2][1].KERNEL[3], default_ncobmc_krnl_2_1_3); -} diff --git a/third_party/aom/av1/common/ncobmc_kernels.h b/third_party/aom/av1/common/ncobmc_kernels.h deleted file mode 100644 index 358b7b7c8..000000000 --- a/third_party/aom/av1/common/ncobmc_kernels.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include "av1/common/enums.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/common.h" - -#ifndef AV1_COMMON_NCOBMC_KERNELS_H_ -#define AV1_COMMON_NCOBMC_KERNELS_H_ - -void get_default_ncobmc_kernels(AV1_COMMON *cm); - -#endif // AV1_COMMON_NCOBMC_KERNELS_H_ diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h index f3940490f..3918c82c6 100644 --- a/third_party/aom/av1/common/obmc.h +++ b/third_party/aom/av1/common/obmc.h @@ -12,31 +12,31 @@ #ifndef AV1_COMMON_OBMC_H_ #define AV1_COMMON_OBMC_H_ -#if CONFIG_MOTION_VAR typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos, - uint8_t nb_mi_size, MODE_INFO *nb_mi, - void *fun_ctxt); + uint8_t nb_mi_size, + MB_MODE_INFO *nb_mi, void *fun_ctxt, + const int num_planes); static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_col, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { + const int num_planes = av1_num_planes(cm); if (!xd->up_available) return; int nb_count = 0; // prev_row_mi points into the mi array, starting at the beginning of the // previous row. 
- MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { - MODE_INFO **above_mi = prev_row_mi + above_mi_col; - mi_step = AOMMIN(mi_size_wide[above_mi[0]->mbmi.sb_type], - mi_size_wide[BLOCK_64X64]); -#if CONFIG_CHROMA_SUB8X8 + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = + AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]); // If we're considering a block with width 4, it should be treated as // half of a pair of blocks with chroma information in the second. Move // above_mi_col back to the start of the pair if needed, set above_mbmi @@ -47,12 +47,10 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, above_mi = prev_row_mi + above_mi_col + 1; mi_step = 2; } -#endif // CONFIG_CHROMA_SUB8X8 - MB_MODE_INFO *above_mbmi = &above_mi[0]->mbmi; - if (is_neighbor_overlappable(above_mbmi)) { + if (is_neighbor_overlappable(*above_mi)) { ++nb_count; fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi, - fun_ctxt); + fun_ctxt, num_planes); } } } @@ -62,35 +60,32 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { + const int num_planes = av1_num_planes(cm); if (!xd->left_available) return; int nb_count = 0; // prev_col_mi points into the mi array, starting at the top of the // previous column - MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { - MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; - 
mi_step = AOMMIN(mi_size_high[left_mi[0]->mbmi.sb_type], - mi_size_high[BLOCK_64X64]); -#if CONFIG_CHROMA_SUB8X8 + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = + AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]); if (mi_step == 1) { left_mi_row &= ~1; left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride; mi_step = 2; } -#endif // CONFIG_CHROMA_SUB8X8 - MB_MODE_INFO *left_mbmi = &left_mi[0]->mbmi; - if (is_neighbor_overlappable(left_mbmi)) { + if (is_neighbor_overlappable(*left_mi)) { ++nb_count; fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi, - fun_ctxt); + fun_ctxt, num_planes); } } } -#endif // CONFIG_MOTION_VAR #endif // AV1_COMMON_OBMC_H_ diff --git a/third_party/aom/av1/common/odintrin.c b/third_party/aom/av1/common/odintrin.c index 868efacc9..7584b2e52 100644 --- a/third_party/aom/av1/common/odintrin.c +++ b/third_party/aom/av1/common/odintrin.c @@ -13,16 +13,6 @@ #include "av1/common/odintrin.h" -#if defined(OD_ENABLE_ASSERTIONS) -# include - -void od_fatal_impl(const char *_str, const char *_file, int _line) { - fprintf(stderr, "Fatal (internal) error in %s, line %d: %s\n", - _file, _line, _str); - abort(); -} -#endif - /*Constants for use with OD_DIVU_SMALL(). See \cite{Rob05} for details on computing these constants. 
@INPROCEEDINGS{Rob05, diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h index a50c456c1..e87c5a0bf 100644 --- a/third_party/aom/av1/common/odintrin.h +++ b/third_party/aom/av1/common/odintrin.h @@ -14,10 +14,6 @@ #ifndef AV1_COMMON_ODINTRIN_H_ #define AV1_COMMON_ODINTRIN_H_ -#if defined(_MSC_VER) -# define _USE_MATH_DEFINES -#endif -#include #include #include @@ -30,71 +26,8 @@ extern "C" { #endif -# if !defined(M_PI) -# define M_PI (3.1415926535897932384626433832795) -# endif - -# if !defined(M_SQRT2) -# define M_SQRT2 (1.41421356237309504880168872420970) -# endif - -# if !defined(M_SQRT1_2) -# define M_SQRT1_2 (0.70710678118654752440084436210485) -# endif - -# if !defined(M_LOG2E) -# define M_LOG2E (1.4426950408889634073599246810019) -# endif - -# if !defined(M_LN2) -# define M_LN2 (0.69314718055994530941723212145818) -# endif - -/*Smallest blocks are 4x4*/ -#define OD_LOG_BSIZE0 (2) -/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/ -#define OD_NBSIZES (5) - -/*There are 4 transform sizes total in AV1 (4x4, 8x8, 16x16 and 32x32).*/ -#define OD_TXSIZES TX_SIZES -/*The log of the maximum length of the side of a transform.*/ -#define OD_LOG_TXSIZE_MAX (OD_LOG_BSIZE0 + OD_TXSIZES - 1) -/*The maximum length of the side of a transform.*/ -#define OD_TXSIZE_MAX (1 << OD_LOG_TXSIZE_MAX) - -/**The maximum number of color planes allowed in a single frame.*/ -# define OD_NPLANES_MAX (3) - -# define OD_COEFF_SHIFT (4) - -# define OD_DISABLE_CFL (1) -# define OD_DISABLE_FILTER (1) - -#if !defined(NDEBUG) -# define OD_ENABLE_ASSERTIONS (1) -#endif - -# define OD_LOG(a) -# define OD_LOG_PARTIAL(a) - -/*Possible block sizes, note that OD_BLOCK_NXN = log2(N) - 2.*/ -#define OD_BLOCK_4X4 (0) -#define OD_BLOCK_8X8 (1) -#define OD_BLOCK_16X16 (2) -#define OD_BLOCK_32X32 (3) -#define OD_BLOCK_SIZES (OD_BLOCK_32X32 + 1) - -# define OD_LIMIT_BSIZE_MIN (OD_BLOCK_4X4) -# define OD_LIMIT_BSIZE_MAX (OD_BLOCK_32X32) - typedef int 
od_coeff; -/*This is the strength reduced version of ((_a)/(1 << (_b))). - This will not work for _b == 0, however currently this is only used for - b == 1 anyway.*/ -# define OD_UNBIASED_RSHIFT32(_a, _b) \ - (((int32_t)(((uint32_t)(_a) >> (32 - (_b))) + (_a))) >> (_b)) - #define OD_DIVU_DMAX (1024) extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; @@ -116,14 +49,6 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; #define OD_CLZ0 (1) #define OD_CLZ(x) (-get_msb(x)) #define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x)) -/*Note that __builtin_clz is not defined when x == 0, according to the gcc - documentation (and that of the x86 BSR instruction that implements it), so - we have to special-case it. - We define a special version of the macro to use when x can be zero.*/ -#define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0) - -#define OD_LOG2(x) (M_LOG2E*log(x)) -#define OD_EXP2(x) (exp(M_LN2*(x))) /*Enable special features for gcc and compatible compilers.*/ #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) @@ -146,36 +71,6 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; #define OD_ARG_NONNULL(x) #endif -#if defined(OD_ENABLE_ASSERTIONS) -#if OD_GNUC_PREREQ(2, 5, 0) -__attribute__((noreturn)) -#endif -void od_fatal_impl(const char *_str, const char *_file, int _line); - -#define OD_FATAL(_str) (od_fatal_impl(_str, __FILE__, __LINE__)) - -#define OD_ASSERT(_cond) \ - do { \ - if (!(_cond)) { \ - OD_FATAL("assertion failed: " #_cond); \ - } \ - } while (0) - -#define OD_ASSERT2(_cond, _message) \ - do { \ - if (!(_cond)) { \ - OD_FATAL("assertion failed: " #_cond "\n" _message); \ - } \ - } while (0) - -#define OD_ALWAYS_TRUE(_cond) OD_ASSERT(_cond) - -#else -#define OD_ASSERT(_cond) -#define OD_ASSERT2(_cond, _message) -#define OD_ALWAYS_TRUE(_cond) ((void)(_cond)) -#endif - /** Copy n elements of memory from src to dst. 
The 0* term provides compile-time type checking */ #if !defined(OVERRIDE_OD_COPY) @@ -190,85 +85,10 @@ void od_fatal_impl(const char *_str, const char *_file, int _line); (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) )) #endif -/** Linkage will break without this if using a C++ compiler, and will issue - * warnings without this for a C compiler*/ -#if defined(__cplusplus) -# define OD_EXTERN extern -#else -# define OD_EXTERN -#endif - -/** Set n elements of dst to zero */ -#if !defined(OVERRIDE_OD_CLEAR) -# define OD_CLEAR(dst, n) (memset((dst), 0, sizeof(*(dst))*(n))) -#endif - -/** Silence unused parameter/variable warnings */ -# define OD_UNUSED(expr) (void)(expr) - -#if defined(OD_FLOAT_PVQ) -typedef double od_val16; -typedef double od_val32; -# define OD_QCONST32(x, bits) (x) -# define OD_ROUND16(x) (x) -# define OD_ROUND32(x) (x) -# define OD_SHL(x, shift) (x) -# define OD_SHR(x, shift) (x) -# define OD_SHR_ROUND(x, shift) (x) -# define OD_ABS(x) (fabs(x)) -# define OD_MULT16_16(a, b) ((a)*(b)) -# define OD_MULT16_32_Q16(a, b) ((a)*(b)) -#else -typedef int16_t od_val16; -typedef int32_t od_val32; -/** Compile-time conversion of float constant to 32-bit value */ -# define OD_QCONST32(x, bits) ((od_val32)(.5 + (x)*(((od_val32)1) << (bits)))) -# define OD_ROUND16(x) (int16_t)(floor(.5 + (x))) -# define OD_ROUND32(x) (int32_t)(floor(.5 + (x))) -/*Shift x left by shift*/ -# define OD_SHL(a, shift) ((int32_t)((uint32_t)(a) << (shift))) -/*Shift x right by shift (without rounding)*/ -# define OD_SHR(x, shift) \ - ((int32_t)((x) >> (shift))) -/*Shift x right by shift (with rounding)*/ -# define OD_SHR_ROUND(x, shift) \ - ((int32_t)(((x) + (1 << (shift) >> 1)) >> (shift))) -/*Shift x right by shift (without rounding) or left by -shift if shift - is negative.*/ -# define OD_VSHR(x, shift) \ - (((shift) > 0) ? 
OD_SHR(x, shift) : OD_SHL(x, -(shift))) -/*Shift x right by shift (with rounding) or left by -shift if shift - is negative.*/ -# define OD_VSHR_ROUND(x, shift) \ - (((shift) > 0) ? OD_SHR_ROUND(x, shift) : OD_SHL(x, -(shift))) -# define OD_ABS(x) (abs(x)) -/* (od_val32)(od_val16) gives TI compiler a hint that it's 16x16->32 multiply */ -/** 16x16 multiplication where the result fits in 32 bits */ -# define OD_MULT16_16(a, b) \ - (((od_val32)(od_val16)(a))*((od_val32)(od_val16)(b))) -/* Multiplies 16-bit a by 32-bit b and keeps bits [16:47]. */ -# define OD_MULT16_32_Q16(a, b) ((int16_t)(a)*(int64_t)(int32_t)(b) >> 16) -/*16x16 multiplication where the result fits in 16 bits, without rounding.*/ -# define OD_MULT16_16_Q15(a, b) \ - (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15) -/*16x16 multiplication where the result fits in 16 bits, without rounding.*/ -# define OD_MULT16_16_Q16(a, b) \ - ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> 16) -#endif - /*All of these macros should expect floats as arguments.*/ -/*These two should compile as a single SSE instruction.*/ -# define OD_MINF(a, b) ((a) < (b) ? (a) : (b)) -# define OD_MAXF(a, b) ((a) > (b) ? 
(a) : (b)) - -# define OD_DIV_R0(x, y) (((x) + OD_FLIPSIGNI((((y) + 1) >> 1) - 1, (x)))/(y)) - # define OD_SIGNMASK(a) (-((a) < 0)) # define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) -# define OD_MULT16_16_Q15(a, b) \ - (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15) - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h index 2396ce2f3..fa5f02e52 100644 --- a/third_party/aom/av1/common/onyxc_int.h +++ b/third_party/aom/av1/common/onyxc_int.h @@ -12,76 +12,72 @@ #ifndef AV1_COMMON_ONYXC_INT_H_ #define AV1_COMMON_ONYXC_INT_H_ -#include "./aom_config.h" -#include "./av1_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "aom/internal/aom_codec_internal.h" #include "aom_util/aom_thread.h" -#if CONFIG_ANS -#include "aom_dsp/ans.h" -#endif #include "av1/common/alloccommon.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/enums.h" #include "av1/common/frame_buffers.h" #include "av1/common/mv.h" #include "av1/common/quant_common.h" -#if CONFIG_LOOP_RESTORATION #include "av1/common/restoration.h" -#endif // CONFIG_LOOP_RESTORATION #include "av1/common/tile_common.h" +#include "av1/common/timing.h" #include "av1/common/odintrin.h" -#if CONFIG_PVQ -#include "av1/common/pvq.h" -#endif -#if CONFIG_CFL -#include "av1/common/cfl.h" -#endif -#if CONFIG_HASH_ME -// TODO(youzhou@microsoft.com): Encoder only. 
Move it out of common #include "av1/encoder/hash_motion.h" -#endif +#include "aom_dsp/grain_synthesis.h" +#include "aom_dsp/grain_table.h" #ifdef __cplusplus extern "C" { #endif -#define CDEF_MAX_STRENGTHS 16 +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif -#define REF_FRAMES_LOG2 3 -#define REF_FRAMES (1 << REF_FRAMES_LOG2) +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. 
-#define FRAME_BUFFERS (REF_FRAMES + 7) +#define CDEF_MAX_STRENGTHS 16 -#if CONFIG_REFERENCE_BUFFER /* Constant values while waiting for the sequence header */ -#define FRAME_ID_NUMBERS_PRESENT_FLAG 1 -#define FRAME_ID_LENGTH_MINUS7 8 // Allows frame id up to 2^15-1 -#define DELTA_FRAME_ID_LENGTH_MINUS2 12 // Allows frame id deltas up to 2^14-1 -#endif // CONFIG_REFERENCE_BUFFER +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING #define FRAME_CONTEXTS (FRAME_BUFFERS + 1) // Extra frame context which is always kept at default values #define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) -#else +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 -#if CONFIG_EXT_REFS -#define FRAME_CONTEXTS_LOG2 3 -#else -#define FRAME_CONTEXTS_LOG2 2 -#endif +#define NUM_PING_PONG_BUFFERS 2 -#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. +#define MAX_NUM_OPERATING_POINTS \ + MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS +/* clang-format on*/ -#define NUM_PING_PONG_BUFFERS 2 +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. 
+#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 typedef enum { SINGLE_REFERENCE = 0, @@ -90,20 +86,11 @@ typedef enum { REFERENCE_MODES = 3, } REFERENCE_MODE; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING -typedef enum { - RESET_FRAME_CONTEXT_NONE = 0, - RESET_FRAME_CONTEXT_CURRENT = 1, - RESET_FRAME_CONTEXT_ALL = 2, -} RESET_FRAME_CONTEXT_MODE; -#endif - typedef enum { /** - * Update frame context to values resulting from forward probability - * updates signaled in the frame header + * Frame context updates are disabled */ - REFRESH_FRAME_CONTEXT_FORWARD, + REFRESH_FRAME_CONTEXT_DISABLED, /** * Update frame context to values resulting from backward probability * updates based on entropy/counts in the decoded frame @@ -111,57 +98,41 @@ typedef enum { REFRESH_FRAME_CONTEXT_BACKWARD, } REFRESH_FRAME_CONTEXT_MODE; -#if CONFIG_MFMV -#define MFMV_STACK_SIZE INTER_REFS_PER_FRAME - +#define MFMV_STACK_SIZE 3 typedef struct { - int_mv mfmv[INTER_REFS_PER_FRAME][MFMV_STACK_SIZE]; + int_mv mfmv0; + uint8_t ref_frame_offset; } TPL_MV_REF; -#endif typedef struct { - int_mv mv[2]; - int_mv pred_mv[2]; - MV_REFERENCE_FRAME ref_frame[2]; + int_mv mv; + MV_REFERENCE_FRAME ref_frame; } MV_REF; typedef struct { int ref_count; -#if CONFIG_FRAME_MARKER - int cur_frame_offset; - int lst_frame_offset; - int alt_frame_offset; - int gld_frame_offset; -#if CONFIG_EXT_REFS - int lst2_frame_offset; - int lst3_frame_offset; - int bwd_frame_offset; - int alt2_frame_offset; -#endif -#endif // CONFIG_FRAME_MARKER + unsigned int cur_frame_offset; + unsigned int ref_frame_offset[INTER_REFS_PER_FRAME]; -#if CONFIG_MFMV - TPL_MV_REF *tpl_mvs; -#endif MV_REF *mvs; + uint8_t *seg_map; + struct segmentation seg; int mi_rows; int mi_cols; // Width and height give the size of the buffer (before any upscaling, unlike // the sizes that can be derived from the buf structure) int width; int height; -#if CONFIG_GLOBAL_MOTION - WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME]; -#endif // 
CONFIG_GLOBAL_MOTION + WarpedMotionParams global_motion[REF_FRAMES]; + int showable_frame; // frame can be used as show existing frame in future + int film_grain_params_present; + aom_film_grain_t film_grain_params; aom_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; -#if CONFIG_HASH_ME hash_table hash_table; -#endif -#if CONFIG_TEMPMV_SIGNALING uint8_t intra_only; -#endif + FRAME_TYPE frame_type; // The Following variables will only be used in frame parallel decode. // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means @@ -173,6 +144,12 @@ typedef struct { // when the frame is fully decoded. int row; int col; + + // Inter frame reference frame delta for loop filter + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; } RefCntBuffer; typedef struct BufferPool { @@ -195,28 +172,77 @@ typedef struct BufferPool { InternalFrameBufferList int_frame_buffers; } BufferPool; -#if CONFIG_LV_MAP typedef struct { - int base_ctx_table[2 /*row*/][2 /*col*/][2 /*sig_map*/] + int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/] [BASE_CONTEXT_POSITION_NUM + 1]; } LV_MAP_CTX_TABLE; -typedef int BASE_CTX_TABLE[2 /*col*/][2 /*sig_map*/] +typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/] [BASE_CONTEXT_POSITION_NUM + 1]; -#endif -#if CONFIG_REFERENCE_BUFFER +typedef struct BitstreamLevel { + uint8_t major; + uint8_t minor; +} BitstreamLevel; + /* Initial version of sequence header structure */ typedef struct SequenceHeader { + int num_bits_width; + int num_bits_height; + int max_frame_width; + int max_frame_height; int frame_id_numbers_present_flag; - int frame_id_length_minus7; - int delta_frame_id_length_minus2; + int frame_id_length; + int delta_frame_id_length; + BLOCK_SIZE sb_size; // Size of the superblock used for this frame + int mib_size; // Size of the superblock in units of MI blocks + int mib_size_log2; // Log 2 of above. 
+ int order_hint_bits_minus_1; + int force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8 + // 1 - force to integer + // 2 - adaptive + int still_picture; // Video is a single frame still picture + int reduced_still_picture_hdr; // Use reduced header for still picture + int monochrome; // Monochorme video + int enable_filter_intra; // enables/disables filterintra + int enable_intra_edge_filter; // enables/disables corner/edge/upsampling + int enable_interintra_compound; // enables/disables interintra_compound + int enable_masked_compound; // enables/disables masked compound + int enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horiz filter selection + int enable_order_hint; // 0 - disable order hint, and related tools + // jnt_comp, ref_frame_mvs, frame_sign_bias + // if 0, enable_jnt_comp and + // enable_ref_frame_mvs must be set zs 0. + int enable_jnt_comp; // 0 - disable joint compound modes + // 1 - enable it + int enable_ref_frame_mvs; // 0 - disable ref frame mvs + // 1 - enable it + int enable_warped_motion; // 0 - disable warped motion for sequence + // 1 - enable it for the sequence + int enable_superres; // 0 - Disable superres for the sequence, and disable + // transmitting per-frame superres enabled flag. + // 1 - Enable superres for the sequence, and also + // enable per-frame flag to denote if superres is + // enabled for that frame. + int enable_cdef; // To turn on/off CDEF + int enable_restoration; // To turn on/off loop restoration + int operating_points_cnt_minus_1; + int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + int display_model_info_present_flag; + int decoder_model_info_present_flag; + BitstreamLevel level[MAX_NUM_OPERATING_POINTS]; + uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0 + // or 1. 
} SequenceHeader; -#endif // CONFIG_REFERENCE_BUFFER typedef struct AV1Common { struct aom_internal_error_info error; - aom_color_space_t color_space; - aom_transfer_function_t transfer_function; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; aom_chroma_sample_position_t chroma_sample_position; int color_range; int width; @@ -225,6 +251,14 @@ typedef struct AV1Common { int render_height; int last_width; int last_height; + int timing_info_present; + aom_timing_info_t timing_info; + int buffer_removal_delay_present; + aom_dec_model_info_t buffer_model; + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; + aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; + int tu_presentation_delay_flag; + int64_t tu_presentation_delay; // TODO(jkoleszar): this implies chroma ss right now, but could vary per // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to @@ -232,10 +266,15 @@ typedef struct AV1Common { int subsampling_x; int subsampling_y; -#if CONFIG_HIGHBITDEPTH + int largest_tile_id; + size_t largest_tile_size; + int context_update_tile_id; + + // Scale of the current frame with respect to itself. + struct scale_factors sf_identity; + // Marks if we need to use 16bit frame buffers (1: yes, 0: no). 
int use_highbitdepth; -#endif YV12_BUFFER_CONFIG *frame_to_show; RefCntBuffer *prev_frame; @@ -253,6 +292,10 @@ typedef struct AV1Common { // Each Inter frame can reference INTER_REFS_PER_FRAME buffers RefBuffer frame_refs[INTER_REFS_PER_FRAME]; + int is_skip_mode_allowed; + int skip_mode_flag; + int ref_frame_idx_0; + int ref_frame_idx_1; int new_fb_idx; @@ -260,39 +303,26 @@ typedef struct AV1Common { FRAME_TYPE frame_type; int show_frame; + int showable_frame; // frame can be used as show existing frame in future int last_show_frame; int show_existing_frame; -#if CONFIG_EXT_REFS // Flag for a frame used as a reference - not written to the bitstream int is_reference_frame; -#endif // CONFIG_EXT_REFS + int reset_decoder_state; // Flag signaling that the frame is encoded using only INTRA modes. uint8_t intra_only; uint8_t last_intra_only; - + uint8_t disable_cdf_update; int allow_high_precision_mv; -#if CONFIG_AMVR - int seq_mv_precision_level; // 0 the default in AOM, 1 only integer, 2 - // adaptive - int cur_frame_mv_precision_level; // 0 the default in AOM, 1 only integer -#endif + int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer int allow_screen_content_tools; -#if CONFIG_INTERINTRA - int allow_interintra_compound; -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - int allow_masked_compound; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - // Flag signaling which frame contexts should be reset to default values. - RESET_FRAME_CONTEXT_MODE reset_frame_context; -#endif + int allow_intrabc; + int allow_warped_motion; // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in - // MODE_INFO (8-pixel) units. + // MB_MODE_INFO (8-pixel) units. 
int MBs; int mb_rows, mi_rows; int mb_cols, mi_cols; @@ -301,119 +331,120 @@ typedef struct AV1Common { /* profile settings */ TX_MODE tx_mode; +#if CONFIG_ENTROPY_STATS + int coef_cdf_category; +#endif + int base_qindex; int y_dc_delta_q; - int uv_dc_delta_q; - int uv_ac_delta_q; - int16_t y_dequant[MAX_SEGMENTS][2]; - int16_t uv_dequant[MAX_SEGMENTS][2]; + int u_dc_delta_q; + int v_dc_delta_q; + int u_ac_delta_q; + int v_ac_delta_q; + + int separate_uv_delta_q; + + // The dequantizers below are true dequntizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. + int16_t y_dequant_QTX[MAX_SEGMENTS][2]; + int16_t u_dequant_QTX[MAX_SEGMENTS][2]; + int16_t v_dequant_QTX[MAX_SEGMENTS][2]; -#if CONFIG_AOM_QM // Global quant matrix tables - qm_val_t *giqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES_ALL]; - qm_val_t *gqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES_ALL]; + const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; // Local quant matrix tables for each frame - qm_val_t *y_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; - qm_val_t *uv_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; - // Encoder - qm_val_t *y_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; - qm_val_t *uv_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL]; + const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + // Encoder int using_qmatrix; + int qm_y; + int qm_u; + int qm_v; int min_qmlevel; int max_qmlevel; -#endif -#if CONFIG_NEW_QUANT - dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS]; - dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS]; -#endif - /* We allocate a MODE_INFO struct for each macroblock, together with + /* We allocate a MB_MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. 
*/ int mi_alloc_size; - MODE_INFO *mip; /* Base of allocated array */ - MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + MB_MODE_INFO *mip; /* Base of allocated array */ + MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ // TODO(agrange): Move prev_mi into encoder structure. // prev_mip and prev_mi will only be allocated in encoder. - MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ - MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */ + MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ // Separate mi functions between encoder and decoder. int (*alloc_mi)(struct AV1Common *cm, int mi_size); void (*free_mi)(struct AV1Common *cm); void (*setup_mi)(struct AV1Common *cm); - // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible + // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible // area will be NULL. - MODE_INFO **mi_grid_base; - MODE_INFO **mi_grid_visible; - MODE_INFO **prev_mi_grid_base; - MODE_INFO **prev_mi_grid_visible; - - // Whether to use previous frame's motion vectors for prediction. - int use_prev_frame_mvs; + MB_MODE_INFO **mi_grid_base; + MB_MODE_INFO **mi_grid_visible; + MB_MODE_INFO **prev_mi_grid_base; + MB_MODE_INFO **prev_mi_grid_visible; - // Persistent mb segment id map used in prediction. - int seg_map_idx; - int prev_seg_map_idx; + // Whether to use previous frames' motion vectors for prediction. + int allow_ref_frame_mvs; - uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS]; uint8_t *last_frame_seg_map; uint8_t *current_frame_seg_map; int seg_map_alloc_size; InterpFilter interp_filter; + int switchable_motion_mode; + loop_filter_info_n lf_info; -#if CONFIG_FRAME_SUPERRES // The denominator of the superres scale; the numerator is fixed. 
uint8_t superres_scale_denominator; int superres_upscaled_width; int superres_upscaled_height; -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_LOOP_RESTORATION RestorationInfo rst_info[MAX_MB_PLANE]; - RestorationInternal rst_internal; -#endif // CONFIG_LOOP_RESTORATION + + // rst_end_stripe[i] is one more than the index of the bottom stripe + // for tile row i. + int rst_end_stripe[MAX_TILE_ROWS]; + + // Pointer to a scratch buffer used by self-guided restoration + int32_t *rst_tmpbuf; + RestorationLineBuffers *rlbs; + + // Output of loop restoration + YV12_BUFFER_CONFIG rst_frame; // Flag signaling how frame contexts should be updated at the end of // a frame decode REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; - int ref_frame_sign_bias[TOTAL_REFS_PER_FRAME]; /* Two state 0, 1 */ + int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */ struct loopfilter lf; struct segmentation seg; - int all_lossless; - int frame_parallel_decode; // frame-based threading. + int coded_lossless; // frame is fully lossless at the coded resolution. + int all_lossless; // frame is fully lossless at the upscaled resolution. 
-#if CONFIG_EXT_TX int reduced_tx_set_used; -#endif // CONFIG_EXT_TX -// Context probabilities for reference frame prediction -#if CONFIG_EXT_REFS + // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS]; MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS]; -#else - MV_REFERENCE_FRAME comp_fixed_ref; - MV_REFERENCE_FRAME comp_var_ref[COMP_REFS]; -#endif // CONFIG_EXT_REFS REFERENCE_MODE reference_mode; FRAME_CONTEXT *fc; /* this frame entropy */ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS - FRAME_CONTEXT *pre_fc; // Context referenced in this frame -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING unsigned int frame_context_idx; /* Context to use/update */ -#endif - FRAME_COUNTS counts; + int fb_of_context_type[REF_FRAMES]; + int primary_ref_frame; -#if CONFIG_FRAME_MARKER unsigned int frame_offset; -#endif unsigned int current_video_frame; BITSTREAM_PROFILE profile; @@ -423,44 +454,27 @@ typedef struct AV1Common { aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer int error_resilient_mode; + int force_primary_ref_none; int tile_cols, tile_rows; int last_tile_cols, last_tile_rows; -#if CONFIG_MAX_TILE + int max_tile_width_sb; int min_log2_tile_cols; int max_log2_tile_cols; int max_log2_tile_rows; int min_log2_tile_rows; int min_log2_tiles; - int max_tile_width_sb; int max_tile_height_sb; int uniform_tile_spacing_flag; int log2_tile_cols; // only valid for uniform tiles int log2_tile_rows; // only valid for uniform tiles int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows -#if CONFIG_DEPENDENT_HORZTILES - int tile_row_independent[MAX_TILE_ROWS]; // valid for 0 <= i < tile_rows -#endif -#else - int log2_tile_cols, log2_tile_rows; // Used in non-large_scale_tile_coding. 
- int tile_width, tile_height; // In MI units -#endif // CONFIG_MAX_TILE + int tile_width, tile_height; // In MI units -#if CONFIG_EXT_TILE unsigned int large_scale_tile; unsigned int single_tile_decoding; -#endif // CONFIG_EXT_TILE - -#if CONFIG_DEPENDENT_HORZTILES - int dependent_horz_tiles; - int tile_group_start_row[MAX_TILE_ROWS][MAX_TILE_COLS]; - int tile_group_start_col[MAX_TILE_ROWS][MAX_TILE_COLS]; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - int loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES int byte_alignment; int skip_loop_filter; @@ -476,74 +490,65 @@ typedef struct AV1Common { // External BufferPool passed from outside. BufferPool *buffer_pool; - PARTITION_CONTEXT *above_seg_context; - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; -#if CONFIG_VAR_TX - TXFM_CONTEXT *above_txfm_context; - TXFM_CONTEXT *top_txfm_context[MAX_MB_PLANE]; - TXFM_CONTEXT left_txfm_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; -#endif - int above_context_alloc_cols; - - // scratch memory for intraonly/keyframe forward updates from default tables - // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon - // each keyframe and not used afterwards - aom_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1]; -#if CONFIG_GLOBAL_MOTION - WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME]; -#endif - - BLOCK_SIZE sb_size; // Size of the superblock used for this frame - int mib_size; // Size of the superblock in units of MI blocks - int mib_size_log2; // Log 2 of above. 
-#if CONFIG_CDEF + PARTITION_CONTEXT **above_seg_context; + ENTROPY_CONTEXT **above_context[MAX_MB_PLANE]; + TXFM_CONTEXT **above_txfm_context; + WarpedMotionParams global_motion[REF_FRAMES]; + aom_film_grain_table_t *film_grain_table; + int film_grain_params_present; + aom_film_grain_t film_grain_params; int cdef_pri_damping; int cdef_sec_damping; int nb_cdef_strengths; int cdef_strengths[CDEF_MAX_STRENGTHS]; int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; int cdef_bits; -#endif int delta_q_present_flag; // Resolution of delta quant int delta_q_res; -#if CONFIG_EXT_DELTA_Q int delta_lf_present_flag; // Resolution of delta lf level int delta_lf_res; -#if CONFIG_LOOPFILTER_LEVEL // This is a flag for number of deltas of loop filter level // 0: use 1 delta, for y_vertical, y_horizontal, u, and v // 1: use separate deltas for each filter level int delta_lf_multi; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif int num_tg; -#if CONFIG_REFERENCE_BUFFER SequenceHeader seq_params; int current_frame_id; int ref_frame_id[REF_FRAMES]; int valid_for_referencing[REF_FRAMES]; - int refresh_mask; - int invalid_delta_frame_id_minus1; -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_ANS && ANS_MAX_SYMBOLS - int ans_window_size_log2; -#endif -#if CONFIG_NCOBMC_ADAPT_WEIGHT - NCOBMC_KERNELS ncobmc_kernels[ADAPT_OVERLAP_BLOCKS][ALL_NCOBMC_MODES]; - uint8_t *ncobmcaw_buf[4]; -#endif -#if CONFIG_LV_MAP + int invalid_delta_frame_id_minus_1; LV_MAP_CTX_TABLE coeff_ctx_table; + TPL_MV_REF *tpl_mvs; + int tpl_mvs_mem_size; + // TODO(jingning): This can be combined with sign_bias later. 
+ int8_t ref_frame_side[REF_FRAMES]; + + int is_annexb; + + int frame_refs_short_signaling; + int temporal_layer_id; + int spatial_layer_id; + unsigned int number_temporal_layers; + unsigned int number_spatial_layers; + int num_allocated_above_context_mi_col; + int num_allocated_above_contexts; + int num_allocated_above_context_planes; + +#if TXCOEFF_TIMER + int64_t cum_txcoeff_timer; + int64_t txcoeff_timer; + int txb_count; #endif -#if CONFIG_LPF_SB - int final_lpf_encode; -#endif -#if CONFIG_ADAPT_SCAN - int use_adapt_scan; + +#if TXCOEFF_COST_TIMER + int64_t cum_txcoeff_cost_timer; + int64_t txcoeff_cost_timer; + int64_t txcoeff_cost_count; #endif + const cfg_options_t *options; } AV1_COMMON; // TODO(hkuang): Don't need to lock the whole pool after implementing atomic @@ -585,6 +590,17 @@ static INLINE int get_free_fb(AV1_COMMON *cm) { if (frame_bufs[i].ref_count == 0) break; if (i != FRAME_BUFFERS) { + if (frame_bufs[i].buf.use_external_refernce_buffers) { + // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the + // external reference buffers. Restore the buffer pointers to point to the + // internally allocated memory. + YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; + ybf->y_buffer = ybf->store_buf_adr[0]; + ybf->u_buffer = ybf->store_buf_adr[1]; + ybf->v_buffer = ybf->store_buf_adr[2]; + ybf->use_external_refernce_buffers = 0; + } + frame_bufs[i].ref_count = 1; } else { // Reset i to be INVALID_IDX to indicate no free buffer found. @@ -606,270 +622,236 @@ static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { bufs[new_idx].ref_count++; } -#if CONFIG_TEMPMV_SIGNALING -// Returns 1 if this frame might use mvs from some previous frame. 
This -// function doesn't consider whether prev_frame is actually suitable (see -// frame_can_use_prev_frame_mvs for that) -static INLINE int frame_might_use_prev_frame_mvs(const AV1_COMMON *cm) { - return !cm->error_resilient_mode && !cm->intra_only; +static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE int frame_is_sframe(const AV1_COMMON *cm) { + return cm->frame_type == S_FRAME; } -// Returns 1 if this frame really can use MVs from some previous frame. -static INLINE int frame_can_use_prev_frame_mvs(const AV1_COMMON *cm) { - return (frame_might_use_prev_frame_mvs(cm) && cm->last_show_frame && - cm->prev_frame && !cm->prev_frame->intra_only && - cm->width == cm->prev_frame->width && - cm->height == cm->prev_frame->height); +static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) { + if (cm->primary_ref_frame == PRIMARY_REF_NONE || + cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) { + return NULL; + } else { + return &cm->buffer_pool + ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx]; + } +} + +// Returns 1 if this frame might allow mvs from some reference frame. 
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs && + cm->seq_params.enable_order_hint && !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params.enable_warped_motion; } -#endif static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { - if (buf->mvs == NULL || buf->mi_rows < cm->mi_rows || - buf->mi_cols < cm->mi_cols) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + + if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) { aom_free(buf->mvs); buf->mi_rows = cm->mi_rows; buf->mi_cols = cm->mi_cols; -#if CONFIG_TMV CHECK_MEM_ERROR(cm, buf->mvs, (MV_REF *)aom_calloc( ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1), sizeof(*buf->mvs))); -#else - CHECK_MEM_ERROR( - cm, buf->mvs, - (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols, sizeof(*buf->mvs))); -#endif // CONFIG_TMV - -#if CONFIG_MFMV - aom_free(buf->tpl_mvs); - CHECK_MEM_ERROR( - cm, buf->tpl_mvs, - (TPL_MV_REF *)aom_calloc((cm->mi_rows + MAX_MIB_SIZE) * cm->mi_stride, - sizeof(*buf->tpl_mvs))); -#endif + aom_free(buf->seg_map); + CHECK_MEM_ERROR(cm, buf->seg_map, + (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*buf->seg_map))); } -} -#if CONFIG_VAR_REFS -#define LAST_IS_VALID(cm) ((cm)->frame_refs[LAST_FRAME - 1].is_valid) -#define LAST2_IS_VALID(cm) ((cm)->frame_refs[LAST2_FRAME - 1].is_valid) -#define LAST3_IS_VALID(cm) ((cm)->frame_refs[LAST3_FRAME - 1].is_valid) -#define GOLDEN_IS_VALID(cm) ((cm)->frame_refs[GOLDEN_FRAME - 1].is_valid) -#define BWDREF_IS_VALID(cm) ((cm)->frame_refs[BWDREF_FRAME - 1].is_valid) -#define ALTREF2_IS_VALID(cm) ((cm)->frame_refs[ALTREF2_FRAME - 1].is_valid) -#define ALTREF_IS_VALID(cm) ((cm)->frame_refs[ALTREF_FRAME - 
1].is_valid) - -#define L_OR_L2(cm) (LAST_IS_VALID(cm) || LAST2_IS_VALID(cm)) -#define L_AND_L2(cm) (LAST_IS_VALID(cm) && LAST2_IS_VALID(cm)) -#define L_AND_L3(cm) (LAST_IS_VALID(cm) && LAST3_IS_VALID(cm)) -#define L_AND_G(cm) (LAST_IS_VALID(cm) && GOLDEN_IS_VALID(cm)) - -#define L3_OR_G(cm) (LAST3_IS_VALID(cm) || GOLDEN_IS_VALID(cm)) -#define L3_AND_G(cm) (LAST3_IS_VALID(cm) && GOLDEN_IS_VALID(cm)) - -#define BWD_OR_ALT2(cm) (BWDREF_IS_VALID(cm) || ALTREF2_IS_VALID(cm)) -#define BWD_AND_ALT2(cm) (BWDREF_IS_VALID(cm) && ALTREF2_IS_VALID(cm)) -#define BWD_OR_ALT(cm) (BWDREF_IS_VALID(cm) || ALTREF_IS_VALID(cm)) -#define BWD_AND_ALT(cm) (BWDREF_IS_VALID(cm) && ALTREF_IS_VALID(cm)) -#endif // CONFIG_VAR_REFS + const int mem_size = + ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); + int realloc = cm->tpl_mvs == NULL; + if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; + + if (realloc) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) { - return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2); + return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); } static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) { - return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2); + return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); } -static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; +void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params.monochrome ? 
1 : MAX_MB_PLANE; } -#if CONFIG_CFL -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG -static INLINE void cfl_clear_sub8x8_val(CFL_CTX *cfl) { - memset(cfl->sub8x8_val, 0, sizeof(cfl->sub8x8_val)); +static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd, + const int tile_row) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + xd->above_context[i] = cm->above_context[i][tile_row]; + } + xd->above_seg_context = cm->above_seg_context[tile_row]; + xd->above_txfm_context = cm->above_txfm_context[tile_row]; } -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG -void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm); -#endif // CONFIG_CFL static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, -#if CONFIG_PVQ - tran_low_t *pvq_ref_coeff, -#endif -#if CONFIG_CFL - CFL_CTX *cfl, -#endif tran_low_t *dqcoeff) { - for (int i = 0; i < MAX_MB_PLANE; ++i) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { xd->plane[i].dqcoeff = dqcoeff; -#if CONFIG_PVQ - xd->plane[i].pvq_ref_coeff = pvq_ref_coeff; -#endif - xd->above_context[i] = cm->above_context[i]; + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { - memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant)); -#if CONFIG_AOM_QM + memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX, + sizeof(cm->y_dequant_QTX)); memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix)); -#endif -#if CONFIG_NEW_QUANT - memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq, - sizeof(cm->y_dequant_nuq)); -#endif } else { - memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant)); -#if CONFIG_AOM_QM - memcpy(xd->plane[i].seg_iqmatrix, cm->uv_iqmatrix, - sizeof(cm->uv_iqmatrix)); -#endif -#if CONFIG_NEW_QUANT - memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq, - sizeof(cm->uv_dequant_nuq)); -#endif + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX, + sizeof(cm->u_dequant_QTX)); 
+ memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix, + sizeof(cm->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX, + sizeof(cm->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix, + sizeof(cm->v_iqmatrix)); + } } } - xd->fc = cm->fc; - xd->above_seg_context = cm->above_seg_context; -#if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context; -#endif -#if CONFIG_CFL - cfl_init(cfl, cm); - xd->cfl = cfl; -#endif xd->mi_stride = cm->mi_stride; xd->error_info = &cm->error; + cfl_init(&xd->cfl, cm); } -static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { +static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { int i; int row_offset = mi_row; int col_offset = mi_col; - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < num_planes; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; -#if CONFIG_CHROMA_SUB8X8 // Offset the buffer pointer - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) row_offset = mi_row - 1; if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) col_offset = mi_col - 1; -#endif - int above_idx = col_offset << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - int left_idx = (row_offset & MAX_MIB_MASK) - << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; } } static INLINE int calc_mi_size(int len) { - // len is in mi units. - return len + MAX_MIB_SIZE; + // len is in mi units. Align to a multiple of SBs. 
+ return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); } -static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh) { +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { int i; - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x; - xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y; - + for (i = 0; i < num_planes; i++) { xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; -#if !CONFIG_CHROMA_2X2 xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); -#endif } } static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int bh, int mi_col, int bw, -#if CONFIG_DEPENDENT_HORZTILES - int dependent_horz_tile_flag, -#endif // CONFIG_DEPENDENT_HORZTILES int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; -#if CONFIG_DEPENDENT_HORZTILES - if (dependent_horz_tile_flag) { - xd->up_available = (mi_row > tile->mi_row_start) || !tile->tg_horz_boundary; - } else { -#endif // CONFIG_DEPENDENT_HORZTILES - // Are edges available for intra prediction? - xd->up_available = (mi_row > tile->mi_row_start); -#if CONFIG_DEPENDENT_HORZTILES - } -#endif // CONFIG_DEPENDENT_HORZTILES + // Are edges available for intra prediction? 
+ xd->up_available = (mi_row > tile->mi_row_start); + + const int ss_x = xd->plane[1].subsampling_x; + const int ss_y = xd->plane[1].subsampling_y; xd->left_available = (mi_col > tile->mi_col_start); -#if CONFIG_CHROMA_SUB8X8 xd->chroma_up_available = xd->up_available; xd->chroma_left_available = xd->left_available; - if (xd->plane[1].subsampling_x && bw < mi_size_wide[BLOCK_8X8]) + if (ss_x && bw < mi_size_wide[BLOCK_8X8]) xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; - if (xd->plane[1].subsampling_y && bh < mi_size_high[BLOCK_8X8]) + if (ss_y && bh < mi_size_high[BLOCK_8X8]) xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; -#endif if (xd->up_available) { - xd->above_mi = xd->mi[-xd->mi_stride]; - // above_mi may be NULL in encoder's first pass. - xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL; + xd->above_mbmi = xd->mi[-xd->mi_stride]; } else { - xd->above_mi = NULL; xd->above_mbmi = NULL; } if (xd->left_available) { - xd->left_mi = xd->mi[-1]; - // left_mi may be NULL in encoder's first pass. - xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL; + xd->left_mbmi = xd->mi[-1]; } else { - xd->left_mi = NULL; xd->left_mbmi = NULL; } + const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); + if (chroma_ref) { + // To help calculate the "above" and "left" chroma blocks, note that the + // current block may cover multiple luma blocks (eg, if partitioned into + // 4x4 luma blocks). + // First, find the top-left-most luma block covered by this chroma block + MB_MODE_INFO **base_mi = + &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; + + // Then, we consider the luma region covered by the left or above 4x4 chroma + // prediction. We want to point to the chroma reference block in that + // region, which is the bottom-right-most mi unit. + // This leads to the following offsets: + MB_MODE_INFO *chroma_above_mi = + xd->chroma_up_available ? 
base_mi[-xd->mi_stride + ss_x] : NULL; + xd->chroma_above_mbmi = chroma_above_mi; + + MB_MODE_INFO *chroma_left_mi = + xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; + xd->chroma_left_mbmi = chroma_left_mi; + } + xd->n8_h = bh; xd->n8_w = bw; xd->is_sec_rect = 0; - if (xd->n8_w < xd->n8_h) - if (mi_col & (xd->n8_h - 1)) xd->is_sec_rect = 1; + if (xd->n8_w < xd->n8_h) { + // Only mark is_sec_rect as 1 for the last block. + // For PARTITION_VERT_4, it would be (0, 0, 0, 1); + // For other partitions, it would be (0, 1). + if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1; + } if (xd->n8_w > xd->n8_h) if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1; } -static INLINE const aom_prob *get_y_mode_probs(const AV1_COMMON *cm, - const MODE_INFO *mi, - const MODE_INFO *above_mi, - const MODE_INFO *left_mi, - int block) { - const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block); - const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block); - return cm->kf_y_prob[above][left]; -} - static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, - const MODE_INFO *mi, - const MODE_INFO *above_mi, - const MODE_INFO *left_mi, - int block) { - const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block); - const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block); - -#if CONFIG_KF_CTX - int above_ctx = intra_mode_context[above]; - int left_ctx = intra_mode_context[left]; + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi) { + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; -#else - return tile_ctx->kf_y_cdf[above][left]; -#endif } static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, @@ -879,130 +861,117 @@ static INLINE void 
update_partition_context(MACROBLOCKD *xd, int mi_row, PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MAX_MIB_MASK); -#if CONFIG_EXT_PARTITION_TYPES const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; memset(above_ctx, partition_context_lookup[subsize].above, bw); memset(left_ctx, partition_context_lookup[subsize].left, bh); -#else - // num_4x4_blocks_wide_lookup[bsize] / 2 - const int bs = mi_size_wide[bsize]; - - // update the partition context at the end notes. set partition bits - // of block sizes larger than the current one to be one, and partition - // bits of smaller block sizes to be zero. - memset(above_ctx, partition_context_lookup[subsize].above, bs); - memset(left_ctx, partition_context_lookup[subsize].left, bs); -#endif // CONFIG_EXT_PARTITION_TYPES } -#if CONFIG_CB4X4 static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { -#if CONFIG_CHROMA_2X2 - return 1; -#endif - -#if CONFIG_CHROMA_SUB8X8 const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; - int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); - - return ref_pos; -#else - int ref_pos = !(((mi_row & 0x01) && subsampling_y) || - ((mi_col & 0x01) && subsampling_x)); - - if (bsize >= BLOCK_8X8) ref_pos = 1; - return ref_pos; -#endif -} - -#if CONFIG_SUPERTX -static INLINE int need_handle_chroma_sub8x8(BLOCK_SIZE bsize, int subsampling_x, - int subsampling_y) { - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - - if (bsize >= BLOCK_8X8 || - ((!(bh & 0x01) || !subsampling_y) && (!(bw & 0x01) || !subsampling_x))) - return 0; - else - return 1; } -#endif static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { BLOCK_SIZE bs = bsize; - - if (bs < BLOCK_8X8) { - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X8; - else if 
(subsampling_x == 1) - bs = BLOCK_8X4; - else if (subsampling_y == 1) - bs = BLOCK_4X8; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; } - return bs; } -#endif static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, size_t element) { assert(cdf != NULL); -#if !CONFIG_ANS return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; -#else - return cdf[element] - (element > 0 ? 
cdf[element - 1] : 0); -#endif } static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in) { + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; out[0] = CDF_PROB_TOP; out[0] -= cdf_element_prob(in, PARTITION_HORZ); out[0] -= cdf_element_prob(in, PARTITION_SPLIT); -#if CONFIG_EXT_PARTITION_TYPES out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); out[0] -= cdf_element_prob(in, PARTITION_VERT_A); -#endif + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); out[0] = AOM_ICDF(out[0]); out[1] = AOM_ICDF(CDF_PROB_TOP); } static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in) { + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; out[0] = CDF_PROB_TOP; out[0] -= cdf_element_prob(in, PARTITION_VERT); out[0] -= cdf_element_prob(in, PARTITION_SPLIT); -#if CONFIG_EXT_PARTITION_TYPES out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); out[0] -= cdf_element_prob(in, PARTITION_VERT_A); out[0] -= cdf_element_prob(in, PARTITION_VERT_B); -#endif + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); out[0] = AOM_ICDF(out[0]); out[1] = AOM_ICDF(CDF_PROB_TOP); } -#if CONFIG_EXT_PARTITION_TYPES static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize, PARTITION_TYPE partition) { if (bsize >= BLOCK_8X8) { -#if !CONFIG_EXT_PARTITION_TYPES_AB const int hbs = mi_size_wide[bsize] / 2; - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); switch (partition) { case PARTITION_SPLIT: if (bsize != BLOCK_8X8) break; + AOM_FALLTHROUGH_INTENDED; case PARTITION_NONE: case PARTITION_HORZ: case PARTITION_VERT: @@ -1010,30 +979,6 @@ static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, case PARTITION_VERT_4: 
update_partition_context(xd, mi_row, mi_col, subsize, bsize); break; -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - update_partition_context(xd, mi_row, mi_col, - get_subsize(bsize, PARTITION_HORZ_4), subsize); - update_partition_context(xd, mi_row + mi_size_high[bsize] / 2, mi_col, - subsize, subsize); - break; - case PARTITION_HORZ_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row + mi_size_high[bsize] / 2, mi_col, - get_subsize(bsize, PARTITION_HORZ_4), subsize); - break; - case PARTITION_VERT_A: - update_partition_context(xd, mi_row, mi_col, - get_subsize(bsize, PARTITION_VERT_4), subsize); - update_partition_context(xd, mi_row, mi_col + mi_size_wide[bsize] / 2, - subsize, subsize); - break; - case PARTITION_VERT_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row, mi_col + mi_size_wide[bsize] / 2, - get_subsize(bsize, PARTITION_VERT_4), subsize); - break; -#else case PARTITION_HORZ_A: update_partition_context(xd, mi_row, mi_col, bsize2, subsize); update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); @@ -1050,41 +995,35 @@ static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, update_partition_context(xd, mi_row, mi_col, subsize, subsize); update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); break; -#endif default: assert(0 && "Invalid partition type"); } } } -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, - int mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - int has_rows, int has_cols, -#endif - BLOCK_SIZE bsize) { + int mi_col, BLOCK_SIZE bsize) { const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MAX_MIB_MASK); // Minimum partition point is 8x8. Offset the bsl accordingly. 
- const int bsl = mi_width_log2_lookup[bsize] - mi_width_log2_lookup[BLOCK_8X8]; + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; - assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); assert(bsl >= 0); -#if CONFIG_UNPOISON_PARTITION_CTX - if (has_rows && has_cols) - return (left * 2 + above) + bsl * PARTITION_PLOFFSET; - else if (has_rows && !has_cols) - return PARTITION_CONTEXTS_PRIMARY + bsl; - else if (!has_rows && has_cols) - return PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES + bsl; - else - return INVALID_PARTITION_CTX; // Bogus context, forced SPLIT -#else return (left * 2 + above) + bsl * PARTITION_PLOFFSET; -#endif +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; } static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, @@ -1107,11 +1046,10 @@ static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); - // Scale the width in the transform block unit. - return max_blocks_high >> tx_size_wide_log2[0]; + // Scale the height in the transform block unit. 
+ return max_blocks_high >> tx_size_high_log2[0]; } -#if CONFIG_CFL static INLINE int max_intra_block_width(const MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, TX_SIZE tx_size) { @@ -1127,36 +1065,43 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd, << tx_size_high_log2[0]; return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); } -#endif // CONFIG_CFL static INLINE void av1_zero_above_context(AV1_COMMON *const cm, - int mi_col_start, int mi_col_end) { + int mi_col_start, int mi_col_end, const int tile_row) { + const int num_planes = av1_num_planes(cm); const int width = mi_col_end - mi_col_start; - const int aligned_width = ALIGN_POWER_OF_TWO(width, cm->mib_size_log2); + const int aligned_width = + ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2); - const int offset_y = mi_col_start << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - const int width_y = aligned_width << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); + const int offset_y = mi_col_start; + const int width_y = aligned_width; const int offset_uv = offset_y >> cm->subsampling_x; const int width_uv = width_y >> cm->subsampling_x; - av1_zero_array(cm->above_context[0] + offset_y, width_y); - av1_zero_array(cm->above_context[1] + offset_uv, width_uv); - av1_zero_array(cm->above_context[2] + offset_uv, width_uv); + av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y); + if (num_planes > 1) { + if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) { + av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv); + av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv); + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of planes"); + } + } - av1_zero_array(cm->above_seg_context + mi_col_start, aligned_width); + av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width); -#if CONFIG_VAR_TX - av1_zero_array(cm->above_txfm_context + (mi_col_start << TX_UNIT_WIDE_LOG2), - 
aligned_width << TX_UNIT_WIDE_LOG2); -#endif // CONFIG_VAR_TX + memset(cm->above_txfm_context[tile_row] + mi_col_start, + tx_size_wide[TX_SIZES_LARGEST], + aligned_width * sizeof(TXFM_CONTEXT)); } static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { av1_zero(xd->left_context); av1_zero(xd->left_seg_context); -#if CONFIG_VAR_TX - av1_zero(xd->left_txfm_context_buffer); -#endif + + memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST], + sizeof(xd->left_txfm_context_buffer)); } // Disable array-bounds checks as the TX_SIZE enum contains values larger than @@ -1166,15 +1111,11 @@ static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { #if defined(__GNUC__) && __GNUC__ >= 4 #pragma GCC diagnostic ignored "-Warray-bounds" #endif -static INLINE TX_SIZE get_min_tx_size(TX_SIZE tx_size) { - assert(tx_size < TX_SIZES_ALL); - return txsize_sqr_map[tx_size]; -} + #if defined(__GNUC__) && __GNUC__ >= 4 #pragma GCC diagnostic warning "-Warray-bounds" #endif -#if CONFIG_VAR_TX static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { int i; for (i = 0; i < len; ++i) txfm_ctx[i] = txs; @@ -1190,16 +1131,16 @@ static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip, bh = n8_h * MI_SIZE; } - set_txfm_ctx(xd->above_txfm_context, bw, n8_w << TX_UNIT_WIDE_LOG2); - set_txfm_ctx(xd->left_txfm_context, bh, n8_h << TX_UNIT_HIGH_LOG2); + set_txfm_ctx(xd->above_txfm_context, bw, n8_w); + set_txfm_ctx(xd->left_txfm_context, bh, n8_h); } static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, TXFM_CONTEXT *left_ctx, TX_SIZE tx_size, TX_SIZE txb_size) { BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; - int bh = mi_size_high[bsize] << TX_UNIT_HIGH_LOG2; - int bw = mi_size_wide[bsize] << TX_UNIT_WIDE_LOG2; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; uint8_t txw = tx_size_wide[tx_size]; uint8_t txh = tx_size_high[tx_size]; int i; @@ -1209,16 +1150,8 @@ static INLINE void 
txfm_partition_update(TXFM_CONTEXT *above_ctx, static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { switch (tx_dim) { -#if CONFIG_EXT_PARTITION case 128: -#endif // CONFIG_EXT_PARTITION - case 64: -#if CONFIG_TX64X64 - return TX_64X64; -#else - return TX_32X32; -#endif // CONFIG_TX64X64 - break; + case 64: return TX_64X64; break; case 32: return TX_32X32; break; case 16: return TX_16X16; break; case 8: return TX_8X8; break; @@ -1226,6 +1159,45 @@ static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { } } +static INLINE TX_SIZE get_tx_size(int width, int height) { + if (width == height) { + return get_sqr_tx_size(width); + } + if (width < height) { + if (width + width == height) { + switch (width) { + case 4: return TX_4X8; break; + case 8: return TX_8X16; break; + case 16: return TX_16X32; break; + case 32: return TX_32X64; break; + } + } else { + switch (width) { + case 4: return TX_4X16; break; + case 8: return TX_8X32; break; + case 16: return TX_16X64; break; + } + } + } else { + if (height + height == width) { + switch (height) { + case 4: return TX_8X4; break; + case 8: return TX_16X8; break; + case 16: return TX_32X16; break; + case 32: return TX_64X32; break; + } + } else { + switch (height) { + case 4: return TX_16X4; break; + case 8: return TX_32X8; break; + case 16: return TX_64X16; break; + } + } + } + assert(0); + return TX_4X4; +} + static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, TXFM_CONTEXT *left_ctx, BLOCK_SIZE bsize, TX_SIZE tx_size) { @@ -1233,7 +1205,7 @@ static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, const uint8_t txh = tx_size_high[tx_size]; const int above = *above_ctx < txw; const int left = *left_ctx < txh; - int category = TXFM_PARTITION_CONTEXTS - 1; + int category = TXFM_PARTITION_CONTEXTS; // dummy return, not used by others. 
if (tx_size <= TX_4X4) return 0; @@ -1242,13 +1214,13 @@ static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); if (max_tx_size >= TX_8X8) { - category = (tx_size != max_tx_size && max_tx_size > TX_8X8) + - (TX_SIZES - 1 - max_tx_size) * 2; + category = + (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + + (TX_SIZES - 1 - max_tx_size) * 2; } - if (category == TXFM_PARTITION_CONTEXTS - 1) return category; + assert(category != TXFM_PARTITION_CONTEXTS); return category * 3 + above + left; } -#endif // Compute the next partition in the direction of the sb_type stored in the mi // array, starting with bsize. @@ -1258,8 +1230,8 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID; const int offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO **mi = cm->mi_grid_visible + offset; - const BLOCK_SIZE subsize = mi[0]->mbmi.sb_type; + MB_MODE_INFO **mi = cm->mi_grid_visible + offset; + const BLOCK_SIZE subsize = mi[0]->sb_type; if (subsize == bsize) return PARTITION_NONE; @@ -1268,25 +1240,14 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, const int sshigh = mi_size_high[subsize]; const int sswide = mi_size_wide[subsize]; -#if CONFIG_EXT_PARTITION_TYPES if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows && mi_col + bhigh / 2 < cm->mi_cols) { // In this case, the block might be using an extended partition // type. - const MB_MODE_INFO *const mbmi_right = &mi[bwide / 2]->mbmi; - const MB_MODE_INFO *const mbmi_below = &mi[bhigh / 2 * cm->mi_stride]->mbmi; + const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; + const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride]; if (sswide == bwide) { -#if CONFIG_EXT_PARTITION_TYPES_AB - // Smaller height but same width. 
Is PARTITION_HORZ, PARTITION_HORZ_4, - // PARTITION_HORZ_A or PARTITION_HORZ_B. - if (sshigh * 2 == bhigh) - return (mbmi_below->sb_type == subsize) ? PARTITION_HORZ - : PARTITION_HORZ_B; - assert(sshigh * 4 == bhigh); - return (mbmi_below->sb_type == subsize) ? PARTITION_HORZ_4 - : PARTITION_HORZ_A; -#else // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or // PARTITION_HORZ_B. To distinguish the latter two, check if the lower // half was split. @@ -1297,18 +1258,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, return PARTITION_HORZ; else return PARTITION_HORZ_B; -#endif } else if (sshigh == bhigh) { -#if CONFIG_EXT_PARTITION_TYPES_AB - // Smaller width but same height. Is PARTITION_VERT, PARTITION_VERT_4, - // PARTITION_VERT_A or PARTITION_VERT_B. - if (sswide * 2 == bwide) - return (mbmi_right->sb_type == subsize) ? PARTITION_VERT - : PARTITION_VERT_B; - assert(sswide * 4 == bwide); - return (mbmi_right->sb_type == subsize) ? PARTITION_VERT_4 - : PARTITION_VERT_A; -#else // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or // PARTITION_VERT_B. To distinguish the latter two, check if the right // half was split. @@ -1319,9 +1269,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, return PARTITION_VERT; else return PARTITION_VERT_B; -#endif } else { -#if !CONFIG_EXT_PARTITION_TYPES_AB // Smaller width and smaller height. Might be PARTITION_SPLIT or could be // PARTITION_HORZ_A or PARTITION_VERT_A. 
If subsize isn't halved in both // dimensions, we immediately know this is a split (which will recurse to @@ -1333,12 +1281,10 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A; if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A; -#endif return PARTITION_SPLIT; } } -#endif const int vert_split = sswide < bwide; const int horz_split = sshigh < bhigh; const int split_idx = (vert_split << 1) | horz_split; @@ -1352,49 +1298,46 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, } static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) { -#if CONFIG_REFERENCE_BUFFER cm->seq_params.frame_id_numbers_present_flag = use; -#else - (void)cm; - (void)use; -#endif } -static INLINE void set_sb_size(AV1_COMMON *const cm, BLOCK_SIZE sb_size) { - cm->sb_size = sb_size; - cm->mib_size = mi_size_wide[cm->sb_size]; -#if CONFIG_CB4X4 - cm->mib_size_log2 = b_width_log2_lookup[cm->sb_size]; -#else - cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size]; -#endif +static INLINE void set_sb_size(SequenceHeader *const seq_params, + BLOCK_SIZE sb_size) { + seq_params->sb_size = sb_size; + seq_params->mib_size = mi_size_wide[seq_params->sb_size]; + seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; } -static INLINE int all_lossless(const AV1_COMMON *cm, const MACROBLOCKD *xd) { - int i; - int all_lossless = 1; +// Returns true if the frame is fully lossless at the coded resolution. +// Note: If super-resolution is used, such a frame will still NOT be lossless at +// the upscaled resolution. 
+static INLINE int is_coded_lossless(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + int coded_lossless = 1; if (cm->seg.enabled) { - for (i = 0; i < MAX_SEGMENTS; ++i) { + for (int i = 0; i < MAX_SEGMENTS; ++i) { if (!xd->lossless[i]) { - all_lossless = 0; + coded_lossless = 0; break; } } } else { - all_lossless = xd->lossless[0]; + coded_lossless = xd->lossless[0]; } - return all_lossless; + return coded_lossless; } -static INLINE int use_compressed_header(const AV1_COMMON *cm) { - (void)cm; -#if CONFIG_RESTRICT_COMPRESSED_HDR && CONFIG_NEW_MULTISYMBOL - return 0; -#elif CONFIG_RESTRICT_COMPRESSED_HDR - return cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD; -#else - return 1; -#endif // CONFIG_RESTRICT_COMPRESSED_HDR && CONFIG_NEW_MULTISYMBOL +static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) { + return seq_level_idx < 24 || seq_level_idx == 31; +} + +static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { + assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX); + // Since bl.minor is unsigned a comparison will return a warning: + // comparison is always true due to limited range of data type + assert(LEVEL_MINOR_MIN == 0); + assert(bl.minor <= LEVEL_MINOR_MAX); + return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor; } #ifdef __cplusplus diff --git a/third_party/aom/av1/common/partition.c b/third_party/aom/av1/common/partition.c deleted file mode 100644 index 634a9edd5..000000000 --- a/third_party/aom/av1/common/partition.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "enums.h" -#include "odintrin.h" -#include "partition.h" -#include "zigzag.h" - -OD_EXTERN const index_pair *OD_ZIGZAG4[4] = { - OD_ZIGZAG4_DCT_DCT, - OD_ZIGZAG4_ADST_DCT, - OD_ZIGZAG4_DCT_ADST, - OD_ZIGZAG4_ADST_ADST -}; - -OD_EXTERN const index_pair *OD_ZIGZAG8[4] = { - OD_ZIGZAG8_DCT_DCT, - OD_ZIGZAG8_ADST_DCT, - OD_ZIGZAG8_DCT_ADST, - OD_ZIGZAG8_ADST_ADST -}; - -OD_EXTERN const index_pair *OD_ZIGZAG16[4] = { - OD_ZIGZAG16_DCT_DCT, - OD_ZIGZAG16_ADST_DCT, - OD_ZIGZAG16_DCT_ADST, - OD_ZIGZAG16_ADST_ADST -}; - -OD_EXTERN const index_pair *OD_ZIGZAG32[4] = { - OD_ZIGZAG32_DCT_DCT, - OD_ZIGZAG32_DCT_DCT, - OD_ZIGZAG32_DCT_DCT, - OD_ZIGZAG32_DCT_DCT -}; - -/* The tables below specify how coefficient blocks are translated to - and from PVQ partition coding scan order for 4x4, 8x8 and 16x16 */ - -static const int OD_LAYOUT32_OFFSETS[4] = { 0, 128, 256, 768 }; -const band_layout OD_LAYOUT32 = { - OD_ZIGZAG32, - 32, - 3, - OD_LAYOUT32_OFFSETS -}; - -static const int OD_LAYOUT16_OFFSETS[4] = { 0, 32, 64, 192 }; -const band_layout OD_LAYOUT16 = { - OD_ZIGZAG16, - 16, - 3, - OD_LAYOUT16_OFFSETS -}; - -const int OD_LAYOUT8_OFFSETS[4] = { 0, 8, 16, 48 }; -const band_layout OD_LAYOUT8 = { - OD_ZIGZAG8, - 8, - 3, - OD_LAYOUT8_OFFSETS -}; - -static const int OD_LAYOUT4_OFFSETS[2] = { 0, 15 }; -const band_layout OD_LAYOUT4 = { - OD_ZIGZAG4, - 4, - 1, - OD_LAYOUT4_OFFSETS -}; - -/* First element is the number of bands, followed by the list all the band - boundaries. 
*/ -static const int OD_BAND_OFFSETS4[] = {1, 1, 16}; -static const int OD_BAND_OFFSETS8[] = {4, 1, 16, 24, 32, 64}; -static const int OD_BAND_OFFSETS16[] = {7, 1, 16, 24, 32, 64, 96, 128, 256}; -static const int OD_BAND_OFFSETS32[] = {10, 1, 16, 24, 32, 64, 96, 128, 256, - 384, 512, 1024}; -static const int OD_BAND_OFFSETS64[] = {13, 1, 16, 24, 32, 64, 96, 128, 256, - 384, 512, 1024, 1536, 2048, 4096}; - -const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1] = { - OD_BAND_OFFSETS4, - OD_BAND_OFFSETS8, - OD_BAND_OFFSETS16, - OD_BAND_OFFSETS32, - OD_BAND_OFFSETS64 -}; - -/** Perform a single stage of conversion from a coefficient block in - * raster order into coding scan order - * - * @param [in] layout scan order specification - * @param [out] dst destination vector - * @param [in] src source coefficient block - * @param [int] int source vector row stride - */ -static void od_band_from_raster(const band_layout *layout, tran_low_t *dst, - const tran_low_t *src, int stride, TX_TYPE tx_type) { - int i; - int len; - len = layout->band_offsets[layout->nb_bands]; - for (i = 0; i < len; i++) { - dst[i] = src[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]]; - } -} - -/** Perform a single stage of conversion from a vector in coding scan - order back into a coefficient block in raster order - * - * @param [in] layout scan order specification - * @param [out] dst destination coefficient block - * @param [in] src source vector - * @param [int] stride destination vector row stride - */ -static void od_raster_from_band(const band_layout *layout, tran_low_t *dst, - int stride, TX_TYPE tx_type, const tran_low_t *src) { - int i; - int len; - len = layout->band_offsets[layout->nb_bands]; - for (i = 0; i < len; i++) { - dst[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]] = src[i]; - } -} - -static const band_layout *const OD_LAYOUTS[] = {&OD_LAYOUT4, &OD_LAYOUT8, - &OD_LAYOUT16, &OD_LAYOUT32}; - -/** Converts a coefficient block in 
raster order into a vector in - * coding scan order with the PVQ partitions laid out one after - * another. This works in stages; the 4x4 conversion is applied to - * the coefficients nearest DC, then the 8x8 applied to the 8x8 block - * nearest DC that was not already coded by 4x4, then 16x16 following - * the same pattern. - * - * @param [out] dst destination vector - * @param [in] n block size (along one side) - * @param [in] ty_type transfrom type - * @param [in] src source coefficient block - * @param [in] stride source vector row stride - */ -void od_raster_to_coding_order(tran_low_t *dst, int n, TX_TYPE ty_type, - const tran_low_t *src, int stride) { - int bs; - /* dst + 1 because DC is not included for 4x4 blocks. */ - od_band_from_raster(OD_LAYOUTS[0], dst + 1, src, stride, ty_type); - for (bs = 1; bs < OD_TXSIZES; bs++) { - int size; - int offset; - /* Length of block size > 4. */ - size = 1 << (OD_LOG_BSIZE0 + bs); - /* Offset is the size of the previous block squared. */ - offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs); - if (n >= size) { - /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */ - od_band_from_raster(OD_LAYOUTS[bs], dst + offset, src, stride, ty_type); - } - } - dst[0] = src[0]; -} - -/** Converts a vector in coding scan order witht he PVQ partitions - * laid out one after another into a coefficient block in raster - * order. This works in stages in the reverse order of raster->scan - * order; the 16x16 conversion is applied to the coefficients that - * don't appear in an 8x8 block, then the 8x8 applied to the 8x8 block - * sans the 4x4 block it contains, then 4x4 is converted sans DC. 
- * - * @param [out] dst destination coefficient block - * @param [in] stride destination vector row stride - * @param [in] src source vector - * @param [in] n block size (along one side) - */ -void od_coding_order_to_raster(tran_low_t *dst, int stride, TX_TYPE ty_type, - const tran_low_t *src, int n) { - int bs; - /* src + 1 because DC is not included for 4x4 blocks. */ - od_raster_from_band(OD_LAYOUTS[0], dst, stride, ty_type, src + 1); - for (bs = 1; bs < OD_TXSIZES; bs++) { - int size; - int offset; - /* Length of block size > 4 */ - size = 1 << (OD_LOG_BSIZE0 + bs); - /* Offset is the size of the previous block squared. */ - offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs); - if (n >= size) { - /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */ - od_raster_from_band(OD_LAYOUTS[bs], dst, stride, ty_type, src + offset); - } - } - dst[0] = src[0]; -} - -/** Perform a single stage of conversion from a coefficient block in - * raster order into coding scan order - * - * @param [in] layout scan order specification - * @param [out] dst destination vector - * @param [in] src source coefficient block - * @param [int] int source vector row stride - */ -static void od_band_from_raster_16(const band_layout *layout, int16_t *dst, - const int16_t *src, int stride) { - int i; - int len; - len = layout->band_offsets[layout->nb_bands]; - for (i = 0; i < len; i++) { - dst[i] = src[layout->dst_table[DCT_DCT][i][1]*stride + layout->dst_table[DCT_DCT][i][0]]; - } -} - -/** Converts a coefficient block in raster order into a vector in - * coding scan order with the PVQ partitions laid out one after - * another. This works in stages; the 4x4 conversion is applied to - * the coefficients nearest DC, then the 8x8 applied to the 8x8 block - * nearest DC that was not already coded by 4x4, then 16x16 following - * the same pattern. 
- * - * @param [out] dst destination vector - * @param [in] n block size (along one side) - * @param [in] src source coefficient block - * @param [in] stride source vector row stride - */ -void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src, - int stride) { - int bs; - /* dst + 1 because DC is not included for 4x4 blocks. */ - od_band_from_raster_16(OD_LAYOUTS[0], dst + 1, src, stride); - for (bs = 1; bs < OD_TXSIZES; bs++) { - int size; - int offset; - /* Length of block size > 4. */ - size = 1 << (OD_LOG_BSIZE0 + bs); - /* Offset is the size of the previous block squared. */ - offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs); - if (n >= size) { - /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */ - od_band_from_raster_16(OD_LAYOUTS[bs], dst + offset, src, stride); - } - } - dst[0] = src[0]; -} diff --git a/third_party/aom/av1/common/partition.h b/third_party/aom/av1/common/partition.h deleted file mode 100644 index bd308f94f..000000000 --- a/third_party/aom/av1/common/partition.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_partition_H) -# define _partition_H - -#include "av1/common/enums.h" -#include "odintrin.h" - -typedef unsigned char index_pair[2]; - -typedef struct { - const index_pair **const dst_table; - int size; - int nb_bands; - const int *const band_offsets; -} band_layout; - -extern const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1]; - -void od_raster_to_coding_order(tran_low_t *dst, int n, TX_TYPE ty_type, - const tran_low_t *src, int stride); - -void od_coding_order_to_raster(tran_low_t *dst, int stride, TX_TYPE ty_type, - const tran_low_t *src, int n); - -void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src, - int stride); - -#endif diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c new file mode 100644 index 000000000..58933a7b3 --- /dev/null +++ b/third_party/aom/av1/common/ppc/cfl_ppc.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#define OFF_0 0 +#define OFF_1 16 +#define OFF_2 32 +#define OFF_3 48 +#define CFL_BUF_LINE_BYTES 64 +#define CFL_LINE_1 64 +#define CFL_LINE_2 128 +#define CFL_LINE_3 192 + +typedef vector int8_t int8x16_t; +typedef vector uint8_t uint8x16_t; +typedef vector int16_t int16x8_t; +typedef vector uint16_t uint16x8_t; +typedef vector int32_t int32x4_t; +typedef vector uint32_t uint32x4_t; +typedef vector uint64_t uint64x2_t; + +static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, + int height, int round_offset, + int num_pel_log2) { + const int16_t *end = pred_buf + height * CFL_BUF_LINE; + const int16_t *sum_buf = pred_buf; + const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); + const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; + const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B }; + + int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset }; + int32x4_t sum_32x4_1 = { 0, 0, 0, 0 }; + do { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0); + sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1); + if (width >= 16) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + if (width == 32) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1); + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + } while ((sum_buf += (CFL_BUF_LINE * 2)) < end); + int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); + + const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); + sum_32x4 = 
vec_add(sum_32x4, perm_64); + const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); + sum_32x4 = vec_add(sum_32x4, perm_32); + const int32x4_t avg = vec_sr(sum_32x4, div_shift); + const int16x8_t vec_avg = vec_pack(avg, avg); + do { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg), + OFF_0 + CFL_LINE_2, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg), + OFF_0 + CFL_LINE_3, pred_buf); + if (width >= 16) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1, + pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg), + OFF_1 + CFL_LINE_1, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg), + OFF_1 + CFL_LINE_2, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg), + OFF_1 + CFL_LINE_3, pred_buf); + } + if (width == 32) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2, + pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg), + OFF_2 + CFL_LINE_1, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg), + OFF_2 + CFL_LINE_2, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg), + OFF_2 + CFL_LINE_3, pred_buf); + + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3, + pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg), + OFF_3 + CFL_LINE_1, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg), + OFF_3 + CFL_LINE_2, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg), + OFF_3 + CFL_LINE_3, pred_buf); + } + } while ((pred_buf += CFL_BUF_LINE * 4) < end); +} + +// Declare wrappers for VSX sizes 
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) +CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) +CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) +CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) +CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) +CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) +CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) + +// Based on observation, for small blocks VSX does not outperform C (no 64bit +// load and store intrinsics). So we call the C code for block widths 4. +cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + subtract_average_4x4_c, /* 4x4 */ + subtract_average_8x8_vsx, /* 8x8 */ + subtract_average_16x16_vsx, /* 16x16 */ + subtract_average_32x32_vsx, /* 32x32 */ + cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ + subtract_average_4x8_c, /* 4x8 */ + subtract_average_8x4_vsx, /* 8x4 */ + subtract_average_8x16_vsx, /* 8x16 */ + subtract_average_16x8_vsx, /* 16x8 */ + subtract_average_16x32_vsx, /* 16x32 */ + subtract_average_32x16_vsx, /* 32x16 */ + cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ + cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ + subtract_average_4x16_c, /* 4x16 */ + subtract_average_16x4_vsx, /* 16x4 */ + subtract_average_8x32_vsx, /* 8x32 */ + subtract_average_32x8_vsx, /* 32x8 */ + cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ + cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. 
+ return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c index 51fd0389e..d77739d85 100644 --- a/third_party/aom/av1/common/pred_common.c +++ b/third_party/aom/av1/common/pred_common.c @@ -12,30 +12,23 @@ #include "av1/common/common.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" -#if CONFIG_EXT_INTRA #include "av1/common/reconintra.h" -#endif // CONFIG_EXT_INTRA #include "av1/common/seg_common.h" // Returns a context number for the given MB prediction signal -#if CONFIG_DUAL_FILTER -static InterpFilter get_ref_filter_type(const MODE_INFO *mi, +static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, const MACROBLOCKD *xd, int dir, MV_REFERENCE_FRAME ref_frame) { - const MB_MODE_INFO *ref_mbmi = &mi->mbmi; - int use_subpel[2] = { - has_subpel_mv_component(mi, xd, dir), - has_subpel_mv_component(mi, xd, dir + 2), - }; - - return (((ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0]) || - (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1])) + (void)xd; + + return ((ref_mbmi->ref_frame[0] == ref_frame || + ref_mbmi->ref_frame[1] == ref_frame) ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) : SWITCHABLE_FILTERS); } int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx_offset = (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; MV_REFERENCE_FRAME ref_frame = @@ -69,132 +62,57 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { return filter_type_ctx; } -#else -int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. 
- const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int left_type = - xd->left_available && is_inter_block(left_mbmi) - ? av1_extract_interp_filter(left_mbmi->interp_filters, 0) - : SWITCHABLE_FILTERS; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const int above_type = - xd->up_available && is_inter_block(above_mbmi) - ? av1_extract_interp_filter(above_mbmi->interp_filters, 0) - : SWITCHABLE_FILTERS; - - if (left_type == above_type) { - return left_type; - } else if (left_type == SWITCHABLE_FILTERS) { - assert(above_type != SWITCHABLE_FILTERS); - return above_type; - } else if (above_type == SWITCHABLE_FILTERS) { - assert(left_type != SWITCHABLE_FILTERS); - return left_type; - } else { - return SWITCHABLE_FILTERS; - } -} -#endif - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -// Obtain the reference filter type from the above/left neighbor blocks. -static INTRA_FILTER get_ref_intra_filter(const MB_MODE_INFO *ref_mbmi) { - INTRA_FILTER ref_type = INTRA_FILTERS; - - if (ref_mbmi->sb_type >= BLOCK_8X8) { - const PREDICTION_MODE mode = ref_mbmi->mode; - if (is_inter_block(ref_mbmi)) { - switch (av1_extract_interp_filter(ref_mbmi->interp_filters, 0)) { - case EIGHTTAP_REGULAR: ref_type = INTRA_FILTER_8TAP; break; - case EIGHTTAP_SMOOTH: ref_type = INTRA_FILTER_8TAP_SMOOTH; break; - case MULTITAP_SHARP: ref_type = INTRA_FILTER_8TAP_SHARP; break; - case BILINEAR: ref_type = INTRA_FILTERS; break; - default: break; - } - } else { - if (av1_is_directional_mode(mode, ref_mbmi->sb_type)) { - const int p_angle = - mode_to_angle_map[mode] + ref_mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - ref_type = ref_mbmi->intra_filter; - } - } - } - } - return ref_type; -} - -int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd) { - int left_type = INTRA_FILTERS, above_type = INTRA_FILTERS; - if (xd->left_available) left_type = get_ref_intra_filter(xd->left_mbmi); +static void palette_add_to_cache(uint16_t *cache, int 
*n, uint16_t val) { + // Do not add an already existing value + if (*n > 0 && val == cache[*n - 1]) return; - if (xd->up_available) above_type = get_ref_intra_filter(xd->above_mbmi); - - if (left_type == above_type) - return left_type; - else if (left_type == INTRA_FILTERS && above_type != INTRA_FILTERS) - return above_type; - else if (left_type != INTRA_FILTERS && above_type == INTRA_FILTERS) - return left_type; - else - return INTRA_FILTERS; + cache[(*n)++] = val; } -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_PALETTE_DELTA_ENCODING int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache) { const int row = -xd->mb_to_top_edge >> 3; // Do not refer to above SB row when on SB boundary. - const MODE_INFO *const above_mi = - (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mi : NULL; - const MODE_INFO *const left_mi = xd->left_mi; + const MB_MODE_INFO *const above_mi = + (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; int above_n = 0, left_n = 0; - if (above_mi) - above_n = above_mi->mbmi.palette_mode_info.palette_size[plane != 0]; - if (left_mi) - left_n = left_mi->mbmi.palette_mode_info.palette_size[plane != 0]; + if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; + if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; if (above_n == 0 && left_n == 0) return 0; int above_idx = plane * PALETTE_MAX_SIZE; int left_idx = plane * PALETTE_MAX_SIZE; int n = 0; const uint16_t *above_colors = - above_mi ? above_mi->mbmi.palette_mode_info.palette_colors : NULL; + above_mi ? above_mi->palette_mode_info.palette_colors : NULL; const uint16_t *left_colors = - left_mi ? left_mi->mbmi.palette_mode_info.palette_colors : NULL; + left_mi ? left_mi->palette_mode_info.palette_colors : NULL; // Merge the sorted lists of base colors from above and left to get // combined sorted color cache. 
while (above_n > 0 && left_n > 0) { uint16_t v_above = above_colors[above_idx]; uint16_t v_left = left_colors[left_idx]; if (v_left < v_above) { - if (n == 0 || v_left != cache[n - 1]) cache[n++] = v_left; + palette_add_to_cache(cache, &n, v_left); ++left_idx, --left_n; } else { - if (n == 0 || v_above != cache[n - 1]) cache[n++] = v_above; + palette_add_to_cache(cache, &n, v_above); ++above_idx, --above_n; if (v_left == v_above) ++left_idx, --left_n; } } while (above_n-- > 0) { uint16_t val = above_colors[above_idx++]; - if (n == 0 || val != cache[n - 1]) cache[n++] = val; + palette_add_to_cache(cache, &n, val); } while (left_n-- > 0) { uint16_t val = left_colors[left_idx++]; - if (n == 0 || val != cache[n - 1]) cache[n++] = val; + palette_add_to_cache(cache, &n, val); } assert(n <= 2 * PALETTE_MAX_SIZE); return n; } -#endif // CONFIG_PALETTE_DELTA_ENCODING // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. @@ -220,65 +138,17 @@ int av1_get_intra_inter_context(const MACROBLOCKD *xd) { } } -#if CONFIG_COMPOUND_SINGLEREF -// The compound/single mode info data structure has one element border above and -// to the left of the entries corresponding to real macroblocks. -// The prediction flags in these dummy entries are initialized to 0. 
-int av1_get_inter_mode_context(const MACROBLOCKD *xd) { - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - if (has_above && has_left) { // both edges available - const int above_inter_comp_mode = - is_inter_anyref_comp_mode(above_mbmi->mode); - const int left_inter_comp_mode = is_inter_anyref_comp_mode(left_mbmi->mode); - if (above_inter_comp_mode && left_inter_comp_mode) - return 0; - else if (above_inter_comp_mode || left_inter_comp_mode) - return 1; - else if (!is_inter_block(above_mbmi) && !is_inter_block(left_mbmi)) - return 2; - else - return 3; - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *const edge_mbmi = has_above ? above_mbmi : left_mbmi; - if (is_inter_anyref_comp_mode(edge_mbmi->mode)) - return 1; - else if (!is_inter_block(edge_mbmi)) - return 2; - else - return 3; - } else { // no edge available - return 2; - } -} -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_EXT_REFS #define CHECK_BACKWARD_REFS(ref_frame) \ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) #define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) -#else -#define IS_BACKWARD_REF_FRAME(ref_frame) ((ref_frame) == cm->comp_fixed_ref) -#endif // CONFIG_EXT_REFS - -#define CHECK_GOLDEN_OR_LAST3(ref_frame) \ - (((ref_frame) == GOLDEN_FRAME) || ((ref_frame) == LAST3_FRAME)) -int av1_get_reference_mode_context(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { +int av1_get_reference_mode_context(const MACROBLOCKD *xd) { int ctx; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int has_above = xd->up_available; const int has_left = xd->left_available; -#if CONFIG_EXT_REFS - (void)cm; -#endif // CONFIG_EXT_REFS - // Note: // The mode info data structure has a one element border above and to the // left of the 
entries corresponding to real macroblocks. @@ -314,9 +184,6 @@ int av1_get_reference_mode_context(const AV1_COMMON *cm, return ctx; } -#if CONFIG_EXT_COMP_REFS -// TODO(zoeliu): To try on the design of 3 contexts, instead of 5: -// COMP_REF_TYPE_CONTEXTS = 3 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { int pred_context; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; @@ -344,9 +211,8 @@ int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; if (a_sg && l_sg) { // single/single - pred_context = - 1 + - 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ IS_BACKWARD_REF_FRAME(frfl))); + pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); } else if (l_sg || a_sg) { // single/comp const int uni_rfc = a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); @@ -397,44 +263,16 @@ int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { // 3 contexts: Voting is used to compare the count of forward references with // that of backward references from the spatial neighbors. 
int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of forward references (L, L2, L3, or G) - int frf_count = 0; + const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; // Count of backward references (B or A) - int brf_count = 0; - - if (above_in_image && is_inter_block(above_mbmi)) { - if (above_mbmi->ref_frame[0] <= GOLDEN_FRAME) - ++frf_count; - else - ++brf_count; - if (has_second_ref(above_mbmi)) { - if (above_mbmi->ref_frame[1] <= GOLDEN_FRAME) - ++frf_count; - else - ++brf_count; - } - } - - if (left_in_image && is_inter_block(left_mbmi)) { - if (left_mbmi->ref_frame[0] <= GOLDEN_FRAME) - ++frf_count; - else - ++brf_count; - if (has_second_ref(left_mbmi)) { - if (left_mbmi->ref_frame[1] <= GOLDEN_FRAME) - ++frf_count; - else - ++brf_count; - } - } + const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; - pred_context = + const int pred_context = (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); @@ -450,50 +288,17 @@ int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { // 3 contexts: Voting is used to compare the count of LAST2_FRAME with the // total count of LAST3/GOLDEN from the spatial neighbors. 
int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST2 - int last2_count = 0; + const int last2_count = ref_counts[LAST2_FRAME]; // Count of LAST3 or GOLDEN - int last3_or_gld_count = 0; - - if (above_in_image && is_inter_block(above_mbmi)) { - last2_count = (above_mbmi->ref_frame[0] == LAST2_FRAME) ? last2_count + 1 - : last2_count; - last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(above_mbmi->ref_frame[0]) - ? last3_or_gld_count + 1 - : last3_or_gld_count; - if (has_second_ref(above_mbmi)) { - last2_count = (above_mbmi->ref_frame[1] == LAST2_FRAME) ? last2_count + 1 - : last2_count; - last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(above_mbmi->ref_frame[1]) - ? last3_or_gld_count + 1 - : last3_or_gld_count; - } - } + const int last3_or_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; - if (left_in_image && is_inter_block(left_mbmi)) { - last2_count = (left_mbmi->ref_frame[0] == LAST2_FRAME) ? last2_count + 1 - : last2_count; - last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(left_mbmi->ref_frame[0]) - ? last3_or_gld_count + 1 - : last3_or_gld_count; - if (has_second_ref(left_mbmi)) { - last2_count = (left_mbmi->ref_frame[1] == LAST2_FRAME) ? last2_count + 1 - : last2_count; - last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(left_mbmi->ref_frame[1]) - ? last3_or_gld_count + 1 - : last3_or_gld_count; - } - } - - pred_context = (last2_count == last3_or_gld_count) - ? 1 - : ((last2_count < last3_or_gld_count) ? 0 : 2); + const int pred_context = (last2_count == last3_or_gld_count) + ? 1 + : ((last2_count < last3_or_gld_count) ? 
0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); return pred_context; @@ -508,415 +313,83 @@ int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { // 3 contexts: Voting is used to compare the count of LAST3_FRAME with the // total count of GOLDEN_FRAME from the spatial neighbors. int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST3 - int last3_count = 0; + const int last3_count = ref_counts[LAST3_FRAME]; // Count of GOLDEN - int gld_count = 0; - - if (above_in_image && is_inter_block(above_mbmi)) { - last3_count = (above_mbmi->ref_frame[0] == LAST3_FRAME) ? last3_count + 1 - : last3_count; - gld_count = - (above_mbmi->ref_frame[0] == GOLDEN_FRAME) ? gld_count + 1 : gld_count; - if (has_second_ref(above_mbmi)) { - last3_count = (above_mbmi->ref_frame[1] == LAST3_FRAME) ? last3_count + 1 - : last3_count; - gld_count = (above_mbmi->ref_frame[1] == GOLDEN_FRAME) ? gld_count + 1 - : gld_count; - } - } + const int gld_count = ref_counts[GOLDEN_FRAME]; - if (left_in_image && is_inter_block(left_mbmi)) { - last3_count = (left_mbmi->ref_frame[0] == LAST3_FRAME) ? last3_count + 1 - : last3_count; - gld_count = - (left_mbmi->ref_frame[0] == GOLDEN_FRAME) ? gld_count + 1 : gld_count; - if (has_second_ref(left_mbmi)) { - last3_count = (left_mbmi->ref_frame[1] == LAST3_FRAME) ? last3_count + 1 - : last3_count; - gld_count = - (left_mbmi->ref_frame[1] == GOLDEN_FRAME) ? gld_count + 1 : gld_count; - } - } - - pred_context = + const int pred_context = (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 
0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); return pred_context; } -#endif // CONFIG_EXT_COMP_REFS - -#if CONFIG_EXT_REFS -// TODO(zoeliu): Future work will be conducted to optimize the context design -// for the coding of the reference frames. - -#define CHECK_LAST_OR_LAST2(ref_frame) \ - ((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME)) - -// Returns a context number for the given MB prediction signal -// Signal the first reference frame for a compound mode be either -// GOLDEN/LAST3, or LAST/LAST2. +// == Common context functions for both comp and single ref == // -// NOTE(zoeliu): The probability of ref_frame[0] is either -// GOLDEN_FRAME or LAST3_FRAME. -int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; +// Obtain contexts to signal a reference frame to be either LAST/LAST2 or +// LAST3/GOLDEN. +static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; -// Note: -// The mode info data structure has a one element border above and to the -// left of the entries correpsonding to real macroblocks. -// The prediction flags in these dummy entries are initialised to 0. 
-#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1 - const int bwd_ref_sign_idx = 1; -#else - const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]]; -#endif // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - const int fwd_ref_sign_idx = !bwd_ref_sign_idx; - - (void)cm; + // Count of LAST + LAST2 + const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; + // Count of LAST3 + GOLDEN + const int last3_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; - if (above_in_image && left_in_image) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra (2) - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - - if (!has_second_ref(edge_mbmi)) // single pred (1/3) - pred_context = - 1 + 2 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0])); - else // comp pred (1/3) - pred_context = 1 + - 2 * (!CHECK_GOLDEN_OR_LAST3( - edge_mbmi->ref_frame[fwd_ref_sign_idx])); - } else { // inter/inter - const int l_sg = !has_second_ref(left_mbmi); - const int a_sg = !has_second_ref(above_mbmi); - const MV_REFERENCE_FRAME frfa = - a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[fwd_ref_sign_idx]; - const MV_REFERENCE_FRAME frfl = - l_sg ? 
left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[fwd_ref_sign_idx]; - - if (frfa == frfl && CHECK_GOLDEN_OR_LAST3(frfa)) { - pred_context = 0; - } else if (l_sg && a_sg) { // single/single - if ((CHECK_BACKWARD_REFS(frfa) && CHECK_LAST_OR_LAST2(frfl)) || - (CHECK_BACKWARD_REFS(frfl) && CHECK_LAST_OR_LAST2(frfa))) { - pred_context = 4; - } else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)) { - pred_context = 1; - } else { - pred_context = 3; - } - } else if (l_sg || a_sg) { // single/comp - const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl; - const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl; - - if (CHECK_GOLDEN_OR_LAST3(frfc) && !CHECK_GOLDEN_OR_LAST3(rfs)) - pred_context = 1; - else if (CHECK_GOLDEN_OR_LAST3(rfs) && !CHECK_GOLDEN_OR_LAST3(frfc)) - pred_context = 2; - else - pred_context = 4; - } else { // comp/comp - if ((CHECK_LAST_OR_LAST2(frfa) && CHECK_LAST_OR_LAST2(frfl))) { - pred_context = 4; - } else { -// NOTE(zoeliu): Following assert may be removed once confirmed. -#if !USE_UNI_COMP_REFS - // TODO(zoeliu): To further study the UNIDIR scenario - assert(CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)); -#endif // !USE_UNI_COMP_REFS - pred_context = 2; - } - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi)) { - pred_context = 2; - } else { - if (has_second_ref(edge_mbmi)) - pred_context = - 4 * - (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[fwd_ref_sign_idx])); - else - pred_context = 3 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0])); - } - } else { // no edges available (2) - pred_context = 2; - } + const int pred_context = (last_last2_count == last3_gld_count) + ? 1 + : ((last_last2_count < last3_gld_count) ? 
0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; } -// Returns a context number for the given MB prediction signal -// Signal the first reference frame for a compound mode be LAST, -// conditioning on that it is known either LAST/LAST2. -// -// NOTE(zoeliu): The probability of ref_frame[0] is LAST_FRAME, -// conditioning on it is either LAST_FRAME or LAST2_FRAME. -int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; - -// Note: -// The mode info data structure has a one element border above and to the -// left of the entries correpsonding to real macroblocks. -// The prediction flags in these dummy entries are initialised to 0. -#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1 - const int bwd_ref_sign_idx = 1; -#else - const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]]; -#endif // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - const int fwd_ref_sign_idx = !bwd_ref_sign_idx; - - (void)cm; - - if (above_in_image && left_in_image) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra (2) - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter - const MB_MODE_INFO *edge_mbmi = above_intra ? 
left_mbmi : above_mbmi; - - if (!has_second_ref(edge_mbmi)) // single pred (1/3) - pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME); - else // comp pred (1/3) - pred_context = - 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME); - } else { // inter/inter - const int l_sg = !has_second_ref(left_mbmi); - const int a_sg = !has_second_ref(above_mbmi); - const MV_REFERENCE_FRAME frfa = - a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[fwd_ref_sign_idx]; - const MV_REFERENCE_FRAME frfl = - l_sg ? left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[fwd_ref_sign_idx]; - - if (frfa == frfl && frfa == LAST_FRAME) - pred_context = 0; - else if (l_sg && a_sg) { // single/single - if (frfa == LAST_FRAME || frfl == LAST_FRAME) - pred_context = 1; - else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)) - pred_context = 2 + (frfa != frfl); - else if (frfa == frfl || - (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl))) - pred_context = 3; - else - pred_context = 4; - } else if (l_sg || a_sg) { // single/comp - const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl; - const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl; +// Obtain contexts to signal a reference frame to be either LAST or LAST2. +static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; - if (frfc == LAST_FRAME && rfs != LAST_FRAME) - pred_context = 1; - else if (rfs == LAST_FRAME && frfc != LAST_FRAME) - pred_context = 2; - else - pred_context = - 3 + (frfc == LAST2_FRAME || CHECK_GOLDEN_OR_LAST3(rfs)); - } else { // comp/comp - if (frfa == LAST_FRAME || frfl == LAST_FRAME) - pred_context = 2; - else - pred_context = - 3 + (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)); - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + // Count of LAST + const int last_count = ref_counts[LAST_FRAME]; + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; - if (!is_inter_block(edge_mbmi)) { - pred_context = 2; - } else { - if (has_second_ref(edge_mbmi)) { - pred_context = - 4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME); - } else { - if (edge_mbmi->ref_frame[0] == LAST_FRAME) - pred_context = 0; - else - pred_context = 2 + CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]); - } - } - } else { // no edges available (2) - pred_context = 2; - } + const int pred_context = + (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; } -// Returns a context number for the given MB prediction signal -// Signal the first reference frame for a compound mode be GOLDEN, -// conditioning on that it is known either GOLDEN or LAST3. -// -// NOTE(zoeliu): The probability of ref_frame[0] is GOLDEN_FRAME, -// conditioning on it is either GOLDEN or LAST3. -int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; - -// Note: -// The mode info data structure has a one element border above and to the -// left of the entries correpsonding to real macroblocks. -// The prediction flags in these dummy entries are initialised to 0. -#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - const int bwd_ref_sign_idx = 1; -#else - const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]]; -#endif // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS - const int fwd_ref_sign_idx = !bwd_ref_sign_idx; - - (void)cm; +// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. 
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; - if (above_in_image && left_in_image) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra (2) - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - - if (!has_second_ref(edge_mbmi)) // single pred (1/3) - pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME); - else // comp pred (1/3) - pred_context = - 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME); - } else { // inter/inter - const int l_sg = !has_second_ref(left_mbmi); - const int a_sg = !has_second_ref(above_mbmi); - const MV_REFERENCE_FRAME frfa = - a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[fwd_ref_sign_idx]; - const MV_REFERENCE_FRAME frfl = - l_sg ? left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[fwd_ref_sign_idx]; - - if (frfa == frfl && frfa == GOLDEN_FRAME) - pred_context = 0; - else if (l_sg && a_sg) { // single/single - if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME) - pred_context = 1; - else if (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl)) - pred_context = 2 + (frfa != frfl); - else if (frfa == frfl || - (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl))) - pred_context = 3; - else - pred_context = 4; - } else if (l_sg || a_sg) { // single/comp - const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl; - const MV_REFERENCE_FRAME rfs = a_sg ? 
frfa : frfl; - - if (frfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME) - pred_context = 1; - else if (rfs == GOLDEN_FRAME && frfc != GOLDEN_FRAME) - pred_context = 2; - else - pred_context = 3 + (frfc == LAST3_FRAME || CHECK_LAST_OR_LAST2(rfs)); - } else { // comp/comp - if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME) - pred_context = 2; - else - pred_context = - 3 + (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl)); - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; - if (!is_inter_block(edge_mbmi)) { - pred_context = 2; - } else { - if (has_second_ref(edge_mbmi)) { - pred_context = - 4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME); - } else { - if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME) - pred_context = 0; - else - pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]); - } - } - } else { // no edges available (2) - pred_context = 2; - } + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; } // Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or // ALTREF. 
-int av1_get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; +static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) - int bwdref_counts[ALTREF_FRAME - BWDREF_FRAME + 1] = { 0 }; - - if (above_in_image && is_inter_block(above_mbmi)) { - if (above_mbmi->ref_frame[0] >= BWDREF_FRAME) - ++bwdref_counts[above_mbmi->ref_frame[0] - BWDREF_FRAME]; - if (has_second_ref(above_mbmi)) { - if (above_mbmi->ref_frame[1] >= BWDREF_FRAME) - ++bwdref_counts[above_mbmi->ref_frame[1] - BWDREF_FRAME]; - } - } + const int brfarf2_count = + ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; + const int arf_count = ref_counts[ALTREF_FRAME]; - if (left_in_image && is_inter_block(left_mbmi)) { - if (left_mbmi->ref_frame[0] >= BWDREF_FRAME) - ++bwdref_counts[left_mbmi->ref_frame[0] - BWDREF_FRAME]; - if (has_second_ref(left_mbmi)) { - if (left_mbmi->ref_frame[1] >= BWDREF_FRAME) - ++bwdref_counts[left_mbmi->ref_frame[1] - BWDREF_FRAME]; - } - } - - const int brfarf2_count = bwdref_counts[BWDREF_FRAME - BWDREF_FRAME] + - bwdref_counts[ALTREF2_FRAME - BWDREF_FRAME]; - const int arf_count = bwdref_counts[ALTREF_FRAME - BWDREF_FRAME]; const int pred_context = (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); @@ -925,42 +398,13 @@ int av1_get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { } // Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. 
-int av1_get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; +static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of BWDREF frames (B) - int brf_count = 0; + const int brf_count = ref_counts[BWDREF_FRAME]; // Count of ALTREF2 frames (A2) - int arf2_count = 0; - - if (above_in_image && is_inter_block(above_mbmi)) { - if (above_mbmi->ref_frame[0] == BWDREF_FRAME) - ++brf_count; - else if (above_mbmi->ref_frame[0] == ALTREF2_FRAME) - ++arf2_count; - if (has_second_ref(above_mbmi)) { - if (above_mbmi->ref_frame[1] == BWDREF_FRAME) - ++brf_count; - else if (above_mbmi->ref_frame[1] == ALTREF2_FRAME) - ++arf2_count; - } - } - - if (left_in_image && is_inter_block(left_mbmi)) { - if (left_mbmi->ref_frame[0] == BWDREF_FRAME) - ++brf_count; - else if (left_mbmi->ref_frame[0] == ALTREF2_FRAME) - ++arf2_count; - if (has_second_ref(left_mbmi)) { - if (left_mbmi->ref_frame[1] == BWDREF_FRAME) - ++brf_count; - else if (left_mbmi->ref_frame[1] == ALTREF2_FRAME) - ++arf2_count; - } - } + const int arf2_count = ref_counts[ALTREF2_FRAME]; const int pred_context = (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2); @@ -969,168 +413,57 @@ int av1_get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { return pred_context; } -// Signal the 2nd reference frame for a compound mode be either -// ALTREF, or ALTREF2/BWDREF. -int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - (void)cm; - return av1_get_pred_context_brfarf2_or_arf(xd); +// == Context functions for comp ref == +// +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be either +// GOLDEN/LAST3, or LAST/LAST2. 
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); } -// Signal the 2nd reference frame for a compound mode be either -// ALTREF2 or BWDREF. -int av1_get_pred_context_comp_bwdref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - (void)cm; - return av1_get_pred_context_brf_or_arf2(xd); +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be LAST, +// conditioning on that it is known either LAST/LAST2. +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); } -#else // !CONFIG_EXT_REFS - // Returns a context number for the given MB prediction signal -int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int above_in_image = xd->up_available; - const int left_in_image = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. - const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - const int var_ref_idx = !fix_ref_idx; - - if (above_in_image && left_in_image) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra (2) - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter - const MB_MODE_INFO *edge_mbmi = above_intra ? 
left_mbmi : above_mbmi; - - if (!has_second_ref(edge_mbmi)) // single pred (1/3) - pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); - else // comp pred (1/3) - pred_context = - 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]); - } else { // inter/inter - const int l_sg = !has_second_ref(left_mbmi); - const int a_sg = !has_second_ref(above_mbmi); - const MV_REFERENCE_FRAME vrfa = - a_sg ? above_mbmi->ref_frame[0] : above_mbmi->ref_frame[var_ref_idx]; - const MV_REFERENCE_FRAME vrfl = - l_sg ? left_mbmi->ref_frame[0] : left_mbmi->ref_frame[var_ref_idx]; - - if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { - pred_context = 0; - } else if (l_sg && a_sg) { // single/single - if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || - (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) - pred_context = 4; - else if (vrfa == vrfl) - pred_context = 3; - else - pred_context = 1; - } else if (l_sg || a_sg) { // single/comp - const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; - if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) - pred_context = 1; - else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) - pred_context = 2; - else - pred_context = 4; - } else if (vrfa == vrfl) { // comp/comp - pred_context = 4; - } else { - pred_context = 2; - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi)) { - pred_context = 2; - } else { - if (has_second_ref(edge_mbmi)) - pred_context = - 4 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]); - else - pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); - } - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - - return pred_context; +// Signal the first reference frame for a compound mode be GOLDEN, +// conditioning on that it is known either GOLDEN or LAST3. +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); } -#endif // CONFIG_EXT_REFS +// Signal the 2nd reference frame for a compound mode be either +// ALTREF, or ALTREF2/BWDREF. +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} -#if CONFIG_EXT_REFS +// Signal the 2nd reference frame for a compound mode be either +// ALTREF2 or BWDREF. +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} +// == Context functions for single ref == +// // For the bit to signal whether the single reference is a forward reference // frame or a backward reference frame. int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. 
- if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - - if (!has_second_ref(edge_mbmi)) // single - pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0])); - else // comp - pred_context = 2; - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + // Count of forward reference frames + const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward reference frames + const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; - if (above_has_second && left_has_second) { // comp/comp - pred_context = 2; - } else if (above_has_second || left_has_second) { // single/comp - const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; - - pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1; - } else { // single/single - pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) + - 2 * (!CHECK_BACKWARD_REFS(left0)); - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; - if (!is_inter_block(edge_mbmi)) { // intra - pred_context = 2; - } else { // inter - if (!has_second_ref(edge_mbmi)) // single - pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0])); - else // comp - pred_context = 2; - } - } else { // no edges available - pred_context = 2; - } + const int pred_context = + (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; @@ -1140,445 +473,29 @@ int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { // non-ALTREF backward reference frame, knowing that it shall be either of // these 2 choices. int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { - return av1_get_pred_context_brfarf2_or_arf(xd); + return get_pred_context_brfarf2_or_arf(xd); } // For the bit to signal whether the single reference is LAST3/GOLDEN or // LAST2/LAST, knowing that it shall be either of these 2 choices. int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. - if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? 
left_mbmi : above_mbmi; - if (!has_second_ref(edge_mbmi)) { // single - if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0])) - pred_context = 3; - else - pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]); - } else { // comp - pred_context = 1 + - 2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) || - CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1])); - } - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - - if (above_has_second && left_has_second) { // comp/comp - if (above0 == left0 && above1 == left1) - pred_context = - 3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) || - CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1)); - else - pred_context = 2; - } else if (above_has_second || left_has_second) { // single/comp - const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; - - if (CHECK_LAST_OR_LAST2(rfs)) - pred_context = - 3 + (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2)); - else if (CHECK_GOLDEN_OR_LAST3(rfs)) - pred_context = - (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2)); - else - pred_context = - 1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2)); - } else { // single/single - if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) { - pred_context = 2 + (above0 == left0); - } else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) { - const MV_REFERENCE_FRAME edge0 = - CHECK_BACKWARD_REFS(above0) ? 
left0 : above0; - pred_context = 4 * CHECK_LAST_OR_LAST2(edge0); - } else { - pred_context = - 2 * CHECK_LAST_OR_LAST2(above0) + 2 * CHECK_LAST_OR_LAST2(left0); - } - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi) || - (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) && - !has_second_ref(edge_mbmi))) - pred_context = 2; - else if (!has_second_ref(edge_mbmi)) // single - pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0])); - else // comp - pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) || - CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1])); - } else { // no edges available (2) - pred_context = 2; - } - - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; + return get_pred_context_ll2_or_l3gld(xd); } // For the bit to signal whether the single reference is LAST2_FRAME or // LAST_FRAME, knowing that it shall be either of these 2 choices. -// -// NOTE(zoeliu): The probability of ref_frame[0] is LAST2_FRAME, conditioning -// on it is either LAST2_FRAME/LAST_FRAME. int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. 
- if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - if (!has_second_ref(edge_mbmi)) { // single - if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0])) - pred_context = 3; - else - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); - } else { // comp - pred_context = 1 + - 2 * (edge_mbmi->ref_frame[0] == LAST_FRAME || - edge_mbmi->ref_frame[1] == LAST_FRAME); - } - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - - if (above_has_second && left_has_second) { // comp/comp - if (above0 == left0 && above1 == left1) - pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME || - left0 == LAST_FRAME || left1 == LAST_FRAME); - else - pred_context = 2; - } else if (above_has_second || left_has_second) { // single/comp - const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf2 = above_has_second ? 
above1 : left1; - - if (rfs == LAST_FRAME) - pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - else if (rfs == LAST2_FRAME) - pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - else - pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - } else { // single/single - if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) { - pred_context = 2 + (above0 == left0); - } else if (!CHECK_LAST_OR_LAST2(above0) || - !CHECK_LAST_OR_LAST2(left0)) { - const MV_REFERENCE_FRAME edge0 = - !CHECK_LAST_OR_LAST2(above0) ? left0 : above0; - pred_context = 4 * (edge0 == LAST_FRAME); - } else { - pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); - } - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi) || - (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) && - !has_second_ref(edge_mbmi))) - pred_context = 2; - else if (!has_second_ref(edge_mbmi)) // single - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); - else // comp - pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME || - edge_mbmi->ref_frame[1] == LAST_FRAME); - } else { // no edges available (2) - pred_context = 2; - } - - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; + return get_pred_context_last_or_last2(xd); } // For the bit to signal whether the single reference is GOLDEN_FRAME or // LAST3_FRAME, knowing that it shall be either of these 2 choices. -// -// NOTE(zoeliu): The probability of ref_frame[0] is GOLDEN_FRAME, conditioning -// on it is either GOLDEN_FRAME/LAST3_FRAME. 
int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. - if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - if (!has_second_ref(edge_mbmi)) { // single - if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0])) - pred_context = 3; - else - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME); - } else { // comp - pred_context = 1 + - 2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME || - edge_mbmi->ref_frame[1] == LAST3_FRAME); - } - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - - if (above_has_second && left_has_second) { // comp/comp - if (above0 == left0 && above1 == left1) - pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME || - left0 == LAST3_FRAME || left1 == LAST3_FRAME); - else - pred_context = 2; - } else if (above_has_second || left_has_second) { // single/comp - const MV_REFERENCE_FRAME rfs = !above_has_second ? 
above0 : left0; - const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; - - if (rfs == LAST3_FRAME) - pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME); - else if (rfs == GOLDEN_FRAME) - pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME); - else - pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME); - } else { // single/single - if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) { - pred_context = 2 + (above0 == left0); - } else if (!CHECK_GOLDEN_OR_LAST3(above0) || - !CHECK_GOLDEN_OR_LAST3(left0)) { - const MV_REFERENCE_FRAME edge0 = - !CHECK_GOLDEN_OR_LAST3(above0) ? left0 : above0; - pred_context = 4 * (edge0 == LAST3_FRAME); - } else { - pred_context = - 2 * (above0 == LAST3_FRAME) + 2 * (left0 == LAST3_FRAME); - } - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi) || - (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) && - !has_second_ref(edge_mbmi))) - pred_context = 2; - else if (!has_second_ref(edge_mbmi)) // single - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME); - else // comp - pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME || - edge_mbmi->ref_frame[1] == LAST3_FRAME); - } else { // no edges available (2) - pred_context = 2; - } - - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; + return get_pred_context_last3_or_gld(xd); } // For the bit to signal whether the single reference is ALTREF2_FRAME or // BWDREF_FRAME, knowing that it shall be either of these 2 choices. 
int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) { - return av1_get_pred_context_brf_or_arf2(xd); -} - -#else // !CONFIG_EXT_REFS - -int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. - if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi; - if (!has_second_ref(edge_mbmi)) - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); - else - pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || - edge_mbmi->ref_frame[1] == LAST_FRAME); - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - - if (above_has_second && left_has_second) { - pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || - left0 == LAST_FRAME || left1 == LAST_FRAME); - } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf1 = above_has_second ? 
above0 : left0; - const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; - - if (rfs == LAST_FRAME) - pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - else - pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - } else { - pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; - if (!is_inter_block(edge_mbmi)) { // intra - pred_context = 2; - } else { // inter - if (!has_second_ref(edge_mbmi)) - pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); - else - pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || - edge_mbmi->ref_frame[1] == LAST_FRAME); - } - } else { // no edges available - pred_context = 2; - } - - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; + return get_pred_context_brf_or_arf2(xd); } - -int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { - int pred_context; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. - if (has_above && has_left) { // both edges available - const int above_intra = !is_inter_block(above_mbmi); - const int left_intra = !is_inter_block(left_mbmi); - - if (above_intra && left_intra) { // intra/intra - pred_context = 2; - } else if (above_intra || left_intra) { // intra/inter or inter/intra - const MB_MODE_INFO *edge_mbmi = above_intra ? 
left_mbmi : above_mbmi; - if (!has_second_ref(edge_mbmi)) { - if (edge_mbmi->ref_frame[0] == LAST_FRAME) - pred_context = 3; - else - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 1 + - 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || - edge_mbmi->ref_frame[1] == GOLDEN_FRAME); - } - } else { // inter/inter - const int above_has_second = has_second_ref(above_mbmi); - const int left_has_second = has_second_ref(left_mbmi); - const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; - const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - - if (above_has_second && left_has_second) { - if (above0 == left0 && above1 == left1) - pred_context = - 3 * (above0 == GOLDEN_FRAME || above1 == GOLDEN_FRAME || - left0 == GOLDEN_FRAME || left1 == GOLDEN_FRAME); - else - pred_context = 2; - } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; - const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; - - if (rfs == GOLDEN_FRAME) - pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME) - pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; - else - pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - } else { - if (above0 == LAST_FRAME && left0 == LAST_FRAME) { - pred_context = 3; - } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { - const MV_REFERENCE_FRAME edge0 = - (above0 == LAST_FRAME) ? left0 : above0; - pred_context = 4 * (edge0 == GOLDEN_FRAME); - } else { - pred_context = - 2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME); - } - } - } - } else if (has_above || has_left) { // one edge available - const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; - - if (!is_inter_block(edge_mbmi) || - (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi))) - pred_context = 2; - else if (!has_second_ref(edge_mbmi)) - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); - else - pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || - edge_mbmi->ref_frame[1] == GOLDEN_FRAME); - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - return pred_context; -} - -#endif // CONFIG_EXT_REFS diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h index db4618a59..6a835c467 100644 --- a/third_party/aom/av1/common/pred_common.h +++ b/third_party/aom/av1/common/pred_common.h @@ -13,6 +13,7 @@ #define AV1_COMMON_PRED_COMMON_H_ #include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" #include "av1/common/onyxc_int.h" #include "aom_dsp/aom_dsp_common.h" @@ -39,115 +40,174 @@ static INLINE int get_segment_id(const AV1_COMMON *const cm, return segment_id; } +static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + int mi_row, int mi_col, + int *cdf_index) { + int prev_ul = -1; // top left segment_id + int prev_l = -1; // left segment_id + int prev_u = -1; // top segment_id + if ((xd->up_available) && (xd->left_available)) { + prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, + mi_row - 1, mi_col - 1); + } + if (xd->up_available) { + prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, + mi_row - 1, mi_col - 0); + } + if (xd->left_available) { + prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, + mi_row - 0, mi_col - 1); + } + + // Pick CDF index based on number of matching/out-of-bounds segment IDs. 
+ if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */ + *cdf_index = 0; + else if ((prev_ul == prev_u) && (prev_ul == prev_l)) + *cdf_index = 2; + else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l)) + *cdf_index = 1; + else + *cdf_index = 0; + + // If 2 or more are identical returns that as predictor, otherwise prev_l. + if (prev_u == -1) // edge case + return prev_l == -1 ? 0 : prev_l; + if (prev_l == -1) // edge case + return prev_u; + return (prev_ul == prev_u) ? prev_u : prev_l; +} + static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; - const int above_sip = - (above_mi != NULL) ? above_mi->mbmi.seg_id_predicted : 0; - const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0; + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? 
left_mi->seg_id_predicted : 0; return above_sip + left_sip; } -static INLINE aom_prob av1_get_pred_prob_seg_id( - const struct segmentation_probs *segp, const MACROBLOCKD *xd) { - return segp->pred_probs[av1_get_pred_context_seg_id(xd)]; +static INLINE int get_comp_index_context(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; + int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; + int bck_frame_index = 0, fwd_frame_index = 0; + int cur_frame_index = cm->cur_frame->cur_frame_offset; + + if (bck_idx >= 0) + bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; + + if (fwd_idx >= 0) + fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; + int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)); + int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)); + + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + + int above_ctx = 0, left_ctx = 0; + const int offset = (fwd == bck); + + if (above_mi) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->compound_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 1; + } + + if (left_mi) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->compound_idx; + else if (left_mi->ref_frame[0] == ALTREF_FRAME) + left_ctx = 1; + } + + return above_ctx + left_ctx + 3 * offset; +} + +static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_ctx = 0, left_ctx = 0; + + if (above_mi) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->comp_group_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 3; + } + if (left_mi) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->comp_group_idx; + else if (left_mi->ref_frame[0] == 
ALTREF_FRAME) + left_ctx = 3; + } + + return AOMMIN(5, above_ctx + left_ctx); } -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id( struct segmentation_probs *segp, const MACROBLOCKD *xd) { return segp->pred_cdf[av1_get_pred_context_seg_id(xd)]; } -#endif -static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; - const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; - const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0; - return above_skip + left_skip; +static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip_mode = above_mi ? above_mi->skip_mode : 0; + const int left_skip_mode = left_mi ? left_mi->skip_mode : 0; + return above_skip_mode + left_skip_mode; } -static INLINE aom_prob av1_get_skip_prob(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->skip_probs[av1_get_skip_context(xd)]; +static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip = above_mi ? above_mi->skip : 0; + const int left_skip = left_mi ? left_mi->skip : 0; + return above_skip + left_skip; } -#if CONFIG_DUAL_FILTER int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir); -#else -int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd); -#endif - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd); -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_PALETTE_DELTA_ENCODING // Get a list of palette base colors that are used in the above and left blocks, // referred to as "color cache". 
The return value is the number of colors in the // cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache" // in ascending order. int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache); -#endif // CONFIG_PALETTE_DELTA_ENCODING -int av1_get_intra_inter_context(const MACROBLOCKD *xd); +static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; +} -static INLINE aom_prob av1_get_intra_inter_prob(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->intra_inter_prob[av1_get_intra_inter_context(xd)]; +static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int ctx = 0; + if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); + if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); + return ctx; } -int av1_get_reference_mode_context(const AV1_COMMON *cm, const MACROBLOCKD *xd); +int av1_get_intra_inter_context(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_reference_mode_prob(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->comp_inter_prob[av1_get_reference_mode_context(cm, xd)]; -} -#if CONFIG_NEW_MULTISYMBOL -static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(cm, xd)]; +int av1_get_reference_mode_context(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; } -#endif -#if CONFIG_EXT_COMP_REFS int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_comp_reference_type_prob(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return 
cm->fc->comp_ref_type_prob[av1_get_comp_reference_type_context(xd)]; -} +// == Uni-directional contexts == int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_uni_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); - return cm->fc->uni_comp_ref_prob[pred_context][0]; -} - int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); -static INLINE aom_prob -av1_get_pred_prob_uni_comp_ref_p1(const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); - return cm->fc->uni_comp_ref_prob[pred_context][1]; -} - int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); -static INLINE aom_prob -av1_get_pred_prob_uni_comp_ref_p2(const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); - return cm->fc->uni_comp_ref_prob[pred_context][2]; -} - -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( const MACROBLOCKD *xd) { const int pred_context = av1_get_comp_reference_type_context(xd); @@ -171,211 +231,126 @@ static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; } -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_EXT_COMP_REFS -int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd); +// == Bi-directional contexts == -#if CONFIG_NEW_MULTISYMBOL -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd); - return xd->tile_ctx->comp_ref_cdf[pred_context][0]; -} -#endif +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_comp_ref_p(const AV1_COMMON *cm, - const MACROBLOCKD 
*xd) { - const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd); - return cm->fc->comp_ref_prob[pred_context][0]; -} +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); -#if CONFIG_EXT_REFS -int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd); +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); -#if CONFIG_NEW_MULTISYMBOL -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd); - return xd->tile_ctx->comp_ref_cdf[pred_context][1]; -} -#endif // CONFIG_NEW_MULTISYMBOL +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_comp_ref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd); - return cm->fc->comp_ref_prob[pred_context][1]; +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][0]; } -int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm, - const MACROBLOCKD *xd); +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p1(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][1]; +} -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd); + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p2(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][2]; } -#endif // CONFIG_NEW_MULTISYMBOL -static INLINE aom_prob av1_get_pred_prob_comp_ref_p2(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const 
int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd); - return cm->fc->comp_ref_prob[pred_context][2]; -} - -int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd); - -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd); + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; } -#endif // CONFIG_NEW_MULTISYMBOL -static INLINE aom_prob av1_get_pred_prob_comp_bwdref_p(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd); - return cm->fc->comp_bwdref_prob[pred_context][0]; -} - -int av1_get_pred_context_comp_bwdref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd); - -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_bwdref_p1(cm, xd); + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][1]; } -#endif // CONFIG_NEW_MULTISYMBOL -static INLINE aom_prob av1_get_pred_prob_comp_bwdref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = av1_get_pred_context_comp_bwdref_p1(cm, xd); - return cm->fc->comp_bwdref_prob[pred_context][1]; -} -#endif // CONFIG_EXT_REFS +// == Single contexts == int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p1(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p1(xd)][0]; -} - int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p2(const AV1_COMMON *cm, - 
const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p2(xd)][1]; -} - -#if CONFIG_EXT_REFS int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p3(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p3(xd)][2]; -} - int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p4(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p4(xd)][3]; -} - int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p5(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p5(xd)][4]; -} - int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd); -static INLINE aom_prob av1_get_pred_prob_single_ref_p6(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p6(xd)][5]; -} -#endif // CONFIG_EXT_REFS - -#if CONFIG_NEW_MULTISYMBOL static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0]; } static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1]; } -#if CONFIG_EXT_REFS static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2]; } static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( - const AV1_COMMON *cm, const MACROBLOCKD 
*xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3]; } static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4]; } static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( - const AV1_COMMON *cm, const MACROBLOCKD *xd) { - (void)cm; + const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5]; } -#endif // CONFIG_EXT_REFS -#endif // CONFIG_NEW_MULTISYMBOL - -#if CONFIG_COMPOUND_SINGLEREF -int av1_get_inter_mode_context(const MACROBLOCKD *xd); - -static INLINE aom_prob av1_get_inter_mode_prob(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc->comp_inter_mode_prob[av1_get_inter_mode_context(xd)]; -} -#endif // CONFIG_COMPOUND_SINGLEREF // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { - const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type]; + const int max_tx_wide = tx_size_wide[max_tx_size]; + const int max_tx_high = tx_size_high[max_tx_size]; const int has_above = xd->up_available; const int has_left = xd->left_available; - int above_ctx = (has_above && !above_mbmi->skip) - ? (int)txsize_sqr_map[above_mbmi->tx_size] - : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) - ? 
(int)txsize_sqr_map[left_mbmi->tx_size] - : max_tx_size; - - if (!has_left) left_ctx = above_ctx; - if (!has_above) above_ctx = left_ctx; - return (above_ctx + left_ctx) > max_tx_size + TX_SIZE_LUMA_MIN; + int above = xd->above_txfm_context[0] >= max_tx_wide; + int left = xd->left_txfm_context[0] >= max_tx_high; + + if (has_above) + if (is_inter_block(above_mbmi)) + above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide; + + if (has_left) + if (is_inter_block(left_mbmi)) + left = block_size_high[left_mbmi->sb_type] >= max_tx_high; + + if (has_above && has_left) + return (above + left); + else if (has_above) + return above; + else if (has_left) + return left; + else + return 0; } #ifdef __cplusplus diff --git a/third_party/aom/av1/common/pvq.c b/third_party/aom/av1/common/pvq.c deleted file mode 100644 index 221c90c04..000000000 --- a/third_party/aom/av1/common/pvq.c +++ /dev/null @@ -1,1007 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "odintrin.h" -#include "partition.h" -#include "pvq.h" -#include -#include -#include -#include - -/* Imported from encode.c in daala */ -/* These are the PVQ equivalent of quantization matrices, except that - the values are per-band. */ -#define OD_MASKING_DISABLED 0 -#define OD_MASKING_ENABLED 1 - -const unsigned char OD_LUMA_QM_Q4[2][OD_QM_SIZE] = { -/* Flat quantization for PSNR. 
The DC component isn't 16 because the DC - magnitude compensation is done here for inter (Haar DC doesn't need it). - Masking disabled: */ - { - 16, 16, - 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16 - }, -/* The non-flat AC coefficients compensate for the non-linear scaling caused - by activity masking. The values are currently hand-tuned so that the rate - of each band remains roughly constant when enabling activity masking - on intra. - Masking enabled: */ - { - 16, 16, - 16, 18, 28, 32, - 16, 14, 20, 20, 28, 32, - 16, 11, 14, 14, 17, 17, 22, 28 - } -}; - -const unsigned char OD_CHROMA_QM_Q4[2][OD_QM_SIZE] = { -/* Chroma quantization is different because of the reduced lapping. - FIXME: Use the same matrix as luma for 4:4:4. - Masking disabled: */ - { - 16, 16, - 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16 - }, -/* The AC part is flat for chroma because it has no activity masking. - Masking enabled: */ - { - 16, 16, - 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16 - } -}; - -/* No interpolation, always use od_flat_qm_q4, but use a different scale for - each plane. - FIXME: Add interpolation and properly tune chroma. */ -const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX] = { - /* Masking disabled */ - { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_DISABLED] }, - { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] }, - { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] } }, - { { 0, 0, NULL}, - { 0, 0, NULL}, - { 0, 0, NULL} } }, - /* Masking enabled */ - { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_ENABLED] }, - { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] }, - { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] } }, - { { 0, 0, NULL}, - { 0, 0, NULL}, - { 0, 0, NULL} } } -}; - -/* Constants for the beta parameter, which controls how activity masking is - used. - beta = 1 / (1 - alpha), so when beta is 1, alpha is 0 and activity - masking is disabled. When beta is 1.5, activity masking is used. 
Note that - activity masking is neither used for 4x4 blocks nor for chroma. */ -#define OD_BETA(b) OD_QCONST32(b, OD_BETA_SHIFT) -static const od_val16 OD_PVQ_BETA4_LUMA[1] = {OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA8_LUMA[4] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA16_LUMA[7] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA32_LUMA[10] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.)}; - -static const od_val16 OD_PVQ_BETA4_LUMA_MASKING[1] = {OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA8_LUMA_MASKING[4] = {OD_BETA(1.5), - OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)}; -static const od_val16 OD_PVQ_BETA16_LUMA_MASKING[7] = {OD_BETA(1.5), - OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), - OD_BETA(1.5)}; -static const od_val16 OD_PVQ_BETA32_LUMA_MASKING[10] = {OD_BETA(1.5), - OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), - OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)}; - -static const od_val16 OD_PVQ_BETA4_CHROMA[1] = {OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA8_CHROMA[4] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA16_CHROMA[7] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)}; -static const od_val16 OD_PVQ_BETA32_CHROMA[10] = {OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), - OD_BETA(1.), OD_BETA(1.)}; - -const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1] = { - {{OD_PVQ_BETA4_LUMA, OD_PVQ_BETA8_LUMA, - OD_PVQ_BETA16_LUMA, OD_PVQ_BETA32_LUMA}, - {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA, - OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}, - {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA, - OD_PVQ_BETA16_CHROMA, 
OD_PVQ_BETA32_CHROMA}}, - {{OD_PVQ_BETA4_LUMA_MASKING, OD_PVQ_BETA8_LUMA_MASKING, - OD_PVQ_BETA16_LUMA_MASKING, OD_PVQ_BETA32_LUMA_MASKING}, - {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA, - OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}, - {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA, - OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}} -}; - - -void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1, - const od_qm_entry *entry2) { - int i; - if (entry2 == NULL || entry2->qm_q4 == NULL - || q < entry1->interp_q << OD_COEFF_SHIFT) { - /* Use entry1. */ - for (i = 0; i < OD_QM_SIZE; i++) { - out[i] = OD_MINI(255, entry1->qm_q4[i]*entry1->scale_q8 >> 8); - } - } - else if (entry1 == NULL || entry1->qm_q4 == NULL - || q > entry2->interp_q << OD_COEFF_SHIFT) { - /* Use entry2. */ - for (i = 0; i < OD_QM_SIZE; i++) { - out[i] = OD_MINI(255, entry2->qm_q4[i]*entry2->scale_q8 >> 8); - } - } - else { - /* Interpolate between entry1 and entry2. The interpolation is linear - in terms of log(q) vs log(m*scale). Considering that we're ultimately - multiplying the result it makes sense, but we haven't tried other - interpolation methods. 
*/ - double x; - const unsigned char *m1; - const unsigned char *m2; - int q1; - int q2; - m1 = entry1->qm_q4; - m2 = entry2->qm_q4; - q1 = entry1->interp_q << OD_COEFF_SHIFT; - q2 = entry2->interp_q << OD_COEFF_SHIFT; - x = (log(q)-log(q1))/(log(q2)-log(q1)); - for (i = 0; i < OD_QM_SIZE; i++) { - out[i] = OD_MINI(255, (int)floor(.5 + (1./256)*exp( - x*log(m2[i]*entry2->scale_q8) + (1 - x)*log(m1[i]*entry1->scale_q8)))); - } - } -} - -void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe) { - od_pvq_codeword_ctx *ctx; - int i; - int pli; - int bs; - ctx = &state->pvq_codeword_ctx; - OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[0].cdf); - OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[1].cdf); - OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[2].cdf); - for (i = 0; i < 2*OD_TXSIZES; i++) { - ctx->pvq_adapt[4*i + OD_ADAPT_K_Q8] = 384; - ctx->pvq_adapt[4*i + OD_ADAPT_SUM_EX_Q8] = 256; - ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_Q8] = 104; - ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_EX_Q8] = 128; - } - OD_CDFS_INIT_DYNAMIC(ctx->pvq_k1_cdf); - for (pli = 0; pli < OD_NPLANES_MAX; pli++) { - for (bs = 0; bs < OD_TXSIZES; bs++) - for (i = 0; i < PVQ_MAX_PARTITIONS; i++) { - state->pvq_exg[pli][bs][i] = 2 << 16; - } - } - for (i = 0; i < OD_TXSIZES*PVQ_MAX_PARTITIONS; i++) { - state->pvq_ext[i] = is_keyframe ? 24576 : 2 << 16; - } - OD_CDFS_INIT_DYNAMIC(state->pvq_gaintheta_cdf); - OD_CDFS_INIT_Q15(state->pvq_skip_dir_cdf); - OD_CDFS_INIT_DYNAMIC(ctx->pvq_split_cdf); -} - -/* QMs are arranged from smallest to largest blocksizes, first for - blocks with decimation=0, followed by blocks with decimation=1.*/ -int od_qm_offset(int bs, int xydec) -{ - return xydec*OD_QM_STRIDE + OD_QM_OFFSET(bs); -} - -#if defined(OD_FLOAT_PVQ) -#define OD_DEFAULT_MAG 1.0 -#else -#define OD_DEFAULT_MAG OD_QM_SCALE -#endif - -/* Initialize the quantization matrix. 
*/ -// Note: When hybrid transform and corresponding scan order is used by PVQ, -// we don't need seperate qm and qm_inv for each transform type, -// because AOM does not do magnitude compensation (i.e. simplay x16 for all coeffs). -void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) { - int i; - int j; - int16_t y[OD_TXSIZE_MAX*OD_TXSIZE_MAX]; - int16_t y_inv[OD_TXSIZE_MAX*OD_TXSIZE_MAX]; - int16_t *x1; - int16_t *x1_inv; - int off; - int bs; - int xydec; - for (bs = 0; bs < OD_TXSIZES; bs++) { - for (xydec = 0; xydec < 2; xydec++) { - off = od_qm_offset(bs, xydec); - x1 = x + off; - x1_inv = x_inv + off; - for (i = 0; i < 4 << bs; i++) { - for (j = 0; j < 4 << bs; j++) { - /*This will ultimately be clamped to fit in 16 bits.*/ - od_val32 mag; - int16_t ytmp; - mag = OD_DEFAULT_MAG; - if (i != 0 || j != 0) { -#if defined(OD_FLOAT_PVQ) - mag /= 0.0625*qm[(i << 1 >> bs)*8 + (j << 1 >> bs)]; -#else - int qmv; - qmv = qm[(i << 1 >> bs)*8 + (j << 1 >> bs)]; - mag *= 16; - mag = (mag + (qmv >> 1))/qmv; -#endif - OD_ASSERT(mag > 0.0); - } - /*Convert to fit in 16 bits.*/ -#if defined(OD_FLOAT_PVQ) - y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX, - (int32_t)floor(.5 + mag*OD_QM_SCALE)); - y_inv[i*(4 << bs) + j] = (int16_t)floor(.5 - + OD_QM_SCALE*OD_QM_INV_SCALE/(double)y[i*(4 << bs) + j]); -#else - y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX, mag); - ytmp = y[i*(4 << bs) + j]; - y_inv[i*(4 << bs) + j] = (int16_t)((OD_QM_SCALE*OD_QM_INV_SCALE - + (ytmp >> 1))/ytmp); -#endif - } - } - od_raster_to_coding_order_16(x1, 4 << bs, y, 4 << bs); - od_raster_to_coding_order_16(x1_inv, 4 << bs, y_inv, 4 << bs); - } - } -} - -/* Maps each possible size (n) in the split k-tokenizer to a different value. 
- Possible values of n are: - 2, 3, 4, 7, 8, 14, 15, 16, 31, 32, 63, 64, 127, 128 - Since we don't care about the order (even in the bit-stream) the simplest - ordering (implemented here) is: - 14, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 */ -int od_pvq_size_ctx(int n) { - int logn; - int odd; - logn = OD_ILOG(n - 1); - odd = n & 1; - return 2*logn - 1 - odd - 7*(n == 14); -} - -/* Maps a length n to a context for the (k=1, n<=16) coder, with a special - case when n is the original length (orig_length=1) of the vector (i.e. we - haven't split it yet). For orig_length=0, we use the same mapping as - od_pvq_size_ctx() up to n=16. When orig_length=1, we map lengths - 7, 8, 14, 15 to contexts 8 to 11. */ -int od_pvq_k1_ctx(int n, int orig_length) { - if (orig_length) return 8 + 2*(n > 8) + (n & 1); - else return od_pvq_size_ctx(n); -} - -/* Indexing for the packed quantization matrices. */ -int od_qm_get_index(int bs, int band) { - /* The -band/3 term is due to the fact that we force corresponding horizontal - and vertical bands to have the same quantization. */ - OD_ASSERT(bs >= 0 && bs < OD_TXSIZES); - return bs*(bs + 1) + band - band/3; -} - -#if !defined(OD_FLOAT_PVQ) -/*See celt/mathops.c in Opus and tools/cos_search.c.*/ -static int16_t od_pvq_cos_pi_2(int16_t x) -{ - int16_t x2; - x2 = OD_MULT16_16_Q15(x, x); - return OD_MINI(32767, (1073758164 - x*x + x2*(-7654 + OD_MULT16_16_Q16(x2, - 16573 + OD_MULT16_16_Q16(-2529, x2)))) >> 15); -} -#endif - -/*Approximates cos(x) for -pi < x < pi. 
- Input is in OD_THETA_SCALE.*/ -od_val16 od_pvq_cos(od_val32 x) { -#if defined(OD_FLOAT_PVQ) - return cos(x); -#else - /*Wrap x around by masking, since cos is periodic.*/ - x = x & 0x0001ffff; - if (x > (1 << 16)) { - x = (1 << 17) - x; - } - if (x & 0x00007fff) { - if (x < (1 << 15)) { - return od_pvq_cos_pi_2((int16_t)x); - } - else { - return -od_pvq_cos_pi_2((int16_t)(65536 - x)); - } - } - else { - if (x & 0x0000ffff) { - return 0; - } - else if (x & 0x0001ffff) { - return -32767; - } - else { - return 32767; - } - } -#endif -} - -/*Approximates sin(x) for 0 <= x < pi. - Input is in OD_THETA_SCALE.*/ -od_val16 od_pvq_sin(od_val32 x) { -#if defined(OD_FLOAT_PVQ) - return sin(x); -#else - return od_pvq_cos(32768 - x); -#endif -} - -#if !defined(OD_FLOAT_PVQ) -/* Computes an upper-bound on the number of bits required to store the L2 norm - of a vector (excluding sign). */ -int od_vector_log_mag(const od_coeff *x, int n) { - int i; - int32_t sum; - sum = 0; - for (i = 0; i < n; i++) { - int16_t tmp; - tmp = x[i] >> 8; - sum += tmp*(int32_t)tmp; - } - /* We add one full bit (instead of rounding OD_ILOG() up) for safety because - the >> 8 above causes the sum to be slightly underestimated. */ - return 8 + 1 + OD_ILOG(n + sum)/2; -} -#endif - -/** Computes Householder reflection that aligns the reference r to the - * dimension in r with the greatest absolute value. The reflection - * vector is returned in r. - * - * @param [in,out] r reference vector to be reflected, reflection - * also returned in r - * @param [in] n number of dimensions in r - * @param [in] gr gain of reference vector - * @param [out] sign sign of reflection - * @return dimension number to which reflection aligns - **/ -int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign, - int shift) { - int m; - int i; - int s; - od_val16 maxr; - OD_UNUSED(shift); - /* Pick component with largest magnitude. 
Not strictly - * necessary, but it helps numerical stability */ - m = 0; - maxr = 0; - for (i = 0; i < n; i++) { - if (OD_ABS(r[i]) > maxr) { - maxr = OD_ABS(r[i]); - m = i; - } - } - s = r[m] > 0 ? 1 : -1; - /* This turns r into a Householder reflection vector that would reflect - * the original r[] to e_m */ - r[m] += OD_SHR_ROUND(gr*s, shift); - *sign = s; - return m; -} - -#if !defined(OD_FLOAT_PVQ) -#define OD_RCP_INSHIFT 15 -#define OD_RCP_OUTSHIFT 14 -static od_val16 od_rcp(od_val16 x) -{ - int i; - od_val16 n; - od_val16 r; - i = OD_ILOG(x) - 1; - /*n is Q15 with range [0,1).*/ - n = OD_VSHR_ROUND(x, i - OD_RCP_INSHIFT) - (1 << OD_RCP_INSHIFT); - /*Start with a linear approximation: - r = 1.8823529411764706-0.9411764705882353*n. - The coefficients and the result are Q14 in the range [15420,30840].*/ - r = 30840 + OD_MULT16_16_Q15(-15420, n); - /*Perform two Newton iterations: - r -= r*((r*n)-1.Q15) - = r*((r*n)+(r-1.Q15)).*/ - r = r - OD_MULT16_16_Q15(r, (OD_MULT16_16_Q15(r, n) + r - 32768)); - /*We subtract an extra 1 in the second iteration to avoid overflow; it also - neatly compensates for truncation error in the rest of the process.*/ - r = r - (1 + OD_MULT16_16_Q15(r, OD_MULT16_16_Q15(r, n) + r - 32768)); - /*r is now the Q15 solution to 2/(n+1), with a maximum relative error - of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute - error of 1.24665/32768.*/ - return OD_VSHR_ROUND(r, i - OD_RCP_OUTSHIFT); -} -#endif - -/** Applies Householder reflection from compute_householder(). The - * reflection is its own inverse. 
- * - * @param [out] out reflected vector - * @param [in] x vector to be reflected - * @param [in] r reflection - * @param [in] n number of dimensions in x,r - */ -void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r, - int n) { - int i; - od_val32 proj; - od_val16 proj_1; - od_val32 l2r; -#if !defined(OD_FLOAT_PVQ) - od_val16 proj_norm; - od_val16 l2r_norm; - od_val16 rcp; - int proj_shift; - int l2r_shift; - int outshift; -#endif - /*FIXME: Can we get l2r and/or l2r_shift from an earlier computation?*/ - l2r = 0; - for (i = 0; i < n; i++) { - l2r += OD_MULT16_16(r[i], r[i]); - } - /* Apply Householder reflection */ - proj = 0; - for (i = 0; i < n; i++) { - proj += OD_MULT16_16(r[i], x[i]); - } -#if defined(OD_FLOAT_PVQ) - proj_1 = proj*2./(1e-100 + l2r); - for (i = 0; i < n; i++) { - out[i] = x[i] - r[i]*proj_1; - } -#else - /*l2r_norm is [0.5, 1.0[ in Q15.*/ - l2r_shift = (OD_ILOG(l2r) - 1) - 14; - l2r_norm = OD_VSHR_ROUND(l2r, l2r_shift); - rcp = od_rcp(l2r_norm); - proj_shift = (OD_ILOG(abs(proj)) - 1) - 14; - /*proj_norm is [0.5, 1.0[ in Q15.*/ - proj_norm = OD_VSHR_ROUND(proj, proj_shift); - proj_1 = OD_MULT16_16_Q15(proj_norm, rcp); - /*The proj*2. in the float code becomes -1 in the final outshift. - The sign of l2r_shift is positive since we're taking the reciprocal of - l2r_norm and this is a right shift.*/ - outshift = OD_MINI(30, OD_RCP_OUTSHIFT - proj_shift - 1 + l2r_shift); - if (outshift >= 0) { - for (i = 0; i < n; i++) { - int32_t tmp; - tmp = OD_MULT16_16(r[i], proj_1); - tmp = OD_SHR_ROUND(tmp, outshift); - out[i] = x[i] - tmp; - } - } - else { - /*FIXME: Can we make this case impossible? 
- Right now, if r[] is all zeros except for 1, 2, or 3 ones, and - if x[] is all zeros except for large values at the same position as the - ones in r[], then we can end up with a shift of -1.*/ - for (i = 0; i < n; i++) { - int32_t tmp; - tmp = OD_MULT16_16(r[i], proj_1); - tmp = OD_SHL(tmp, -outshift); - out[i] = x[i] - tmp; - } - } -#endif -} - -#if !defined(OD_FLOAT_PVQ) -static od_val16 od_beta_rcp(od_val16 beta){ - if (beta == OD_BETA(1.)) - return OD_BETA(1.); - else if (beta == OD_BETA(1.5)) - return OD_BETA(1./1.5); - else { - od_val16 rcp_beta; - /*Shift by 1 less, transposing beta to range [.5, .75] and thus < 32768.*/ - rcp_beta = od_rcp(beta << (OD_RCP_INSHIFT - 1 - OD_BETA_SHIFT)); - return OD_SHR_ROUND(rcp_beta, OD_RCP_OUTSHIFT + 1 - OD_BETA_SHIFT); - } -} - -#define OD_EXP2_INSHIFT 15 -#define OD_EXP2_FRACSHIFT 15 -#define OD_EXP2_OUTSHIFT 15 -static const int32_t OD_EXP2_C[5] = {32768, 22709, 7913, 1704, 443}; -/*Output is [1.0, 2.0) in Q(OD_EXP2_FRACSHIFT). - It does not include the integer offset, which is added in od_exp2 after the - final shift).*/ -static int32_t od_exp2_frac(int32_t x) -{ - return OD_MULT16_16_Q15(x, (OD_EXP2_C[1] + OD_MULT16_16_Q15(x, - (OD_EXP2_C[2] + OD_MULT16_16_Q15(x, (OD_EXP2_C[3] - + OD_MULT16_16_Q15(x, OD_EXP2_C[4]))))))); -} - -/** Base-2 exponential approximation (2^x) with Q15 input and output.*/ -static int32_t od_exp2(int32_t x) -{ - int integer; - int32_t frac; - integer = x >> OD_EXP2_INSHIFT; - if (integer > 14) - return 0x7f000000; - else if (integer < -15) - return 0; - frac = od_exp2_frac(x - OD_SHL(integer, OD_EXP2_INSHIFT)); - return OD_VSHR_ROUND(OD_EXP2_C[0] + frac, -integer) + 1; -} - -#define OD_LOG2_INSHIFT 15 -#define OD_LOG2_OUTSHIFT 15 -#define OD_LOG2_INSCALE_1 (1./(1 << OD_LOG2_INSHIFT)) -#define OD_LOG2_OUTSCALE (1 << OD_LOG2_OUTSHIFT) -static int16_t od_log2(int16_t x) -{ - return x + OD_MULT16_16_Q15(x, (14482 + OD_MULT16_16_Q15(x, (-23234 - + OD_MULT16_16_Q15(x, (13643 + OD_MULT16_16_Q15(x, 
(-6403 - + OD_MULT16_16_Q15(x, 1515))))))))); -} - -static int32_t od_pow(int32_t x, od_val16 beta) -{ - int16_t t; - int xshift; - int log2_x; - od_val32 logr; - /*FIXME: this conditional is to avoid doing log2(0).*/ - if (x == 0) - return 0; - log2_x = (OD_ILOG(x) - 1); - xshift = log2_x - OD_LOG2_INSHIFT; - /*t should be in range [0.0, 1.0[ in Q(OD_LOG2_INSHIFT).*/ - t = OD_VSHR(x, xshift) - (1 << OD_LOG2_INSHIFT); - /*log2(g/OD_COMPAND_SCALE) = log2(x) - OD_COMPAND_SHIFT in - Q(OD_LOG2_OUTSHIFT).*/ - logr = od_log2(t) + (log2_x - OD_COMPAND_SHIFT)*OD_LOG2_OUTSCALE; - logr = (od_val32)OD_MULT16_32_QBETA(beta, logr); - return od_exp2(logr); -} -#endif - -/** Gain companding: raises gain to the power 1/beta for activity masking. - * - * @param [in] g real (uncompanded) gain - * @param [in] q0 uncompanded quality parameter - * @param [in] beta activity masking beta param (exponent) - * @return g^(1/beta) - */ -static od_val32 od_gain_compand(od_val32 g, int q0, od_val16 beta) { -#if defined(OD_FLOAT_PVQ) - if (beta == 1) return OD_CGAIN_SCALE*g/(double)q0; - else { - return OD_CGAIN_SCALE*OD_COMPAND_SCALE*pow(g*OD_COMPAND_SCALE_1, - 1./beta)/(double)q0; - } -#else - if (beta == OD_BETA(1)) return (OD_CGAIN_SCALE*g + (q0 >> 1))/q0; - else { - int32_t expr; - expr = od_pow(g, od_beta_rcp(beta)); - expr <<= OD_CGAIN_SHIFT + OD_COMPAND_SHIFT - OD_EXP2_OUTSHIFT; - return (expr + (q0 >> 1))/q0; - } -#endif -} - -#if !defined(OD_FLOAT_PVQ) -#define OD_SQRT_INSHIFT 16 -#define OD_SQRT_OUTSHIFT 15 -static int16_t od_rsqrt_norm(int16_t x); - -static int16_t od_sqrt_norm(int32_t x) -{ - OD_ASSERT(x < 65536); - return OD_MINI(OD_SHR_ROUND(x*od_rsqrt_norm(x), OD_SQRT_OUTSHIFT), 32767); -} - -static int16_t od_sqrt(int32_t x, int *sqrt_shift) -{ - int k; - int s; - int32_t t; - if (x == 0) { - *sqrt_shift = 0; - return 0; - } - OD_ASSERT(x < (1 << 30)); - k = ((OD_ILOG(x) - 1) >> 1); - /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s). 
- Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/ - s = 2*k - (OD_SQRT_INSHIFT - 2); - t = OD_VSHR(x, s); - /*We want to express od_sqrt() in terms of od_sqrt_norm(), which is - defined as (2^OUTSHIFT)*sqrt(t*(2^-INSHIFT)) with t=x*(2^-s). - This simplifies to 2^(OUTSHIFT-(INSHIFT/2)-(s/2))*sqrt(x), so the caller - needs to shift right by OUTSHIFT - INSHIFT/2 - s/2.*/ - *sqrt_shift = OD_SQRT_OUTSHIFT - ((s + OD_SQRT_INSHIFT) >> 1); - return od_sqrt_norm(t); -} -#endif - -/** Gain expanding: raises gain to the power beta for activity masking. - * - * @param [in] cg companded gain - * @param [in] q0 uncompanded quality parameter - * @param [in] beta activity masking beta param (exponent) - * @return g^beta - */ -od_val32 od_gain_expand(od_val32 cg0, int q0, od_val16 beta) { - if (beta == OD_BETA(1)) { - /*The multiply fits into 28 bits because the expanded gain has a range from - 0 to 2^20.*/ - return OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT); - } - else if (beta == OD_BETA(1.5)) { -#if defined(OD_FLOAT_PVQ) - double cg; - cg = cg0*OD_CGAIN_SCALE_1; - cg *= q0*OD_COMPAND_SCALE_1; - return OD_COMPAND_SCALE*cg*sqrt(cg); -#else - int32_t irt; - int64_t tmp; - int sqrt_inshift; - int sqrt_outshift; - /*cg0 is in Q(OD_CGAIN_SHIFT) and we need to divide it by - 2^OD_COMPAND_SHIFT.*/ - irt = od_sqrt(cg0*q0, &sqrt_outshift); - sqrt_inshift = (OD_CGAIN_SHIFT + OD_COMPAND_SHIFT) >> 1; - /*tmp is in Q(OD_CGAIN_SHIFT + OD_COMPAND_SHIFT).*/ - tmp = cg0*q0*(int64_t)irt; - /*Expanded gain must be in Q(OD_COMPAND_SHIFT), thus OD_COMPAND_SHIFT is - not included here.*/ - return OD_MAXI(1, - OD_VSHR_ROUND(tmp, OD_CGAIN_SHIFT + sqrt_outshift + sqrt_inshift)); -#endif - } - else { -#if defined(OD_FLOAT_PVQ) - /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the multiply by - OD_COMPAND_SCALE.*/ - double cg; - cg = cg0*OD_CGAIN_SCALE_1; - return OD_COMPAND_SCALE*pow(cg*q0*OD_COMPAND_SCALE_1, beta); -#else - int32_t expr; - int32_t cg; - cg = 
OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT); - expr = od_pow(cg, beta); - /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the subtraction by - OD_COMPAND_SHIFT.*/ - return OD_MAXI(1, OD_SHR_ROUND(expr, OD_EXP2_OUTSHIFT - OD_COMPAND_SHIFT)); -#endif - } -} - -/** Computes the raw and quantized/companded gain of a given input - * vector - * - * @param [in] x vector of input data - * @param [in] n number of elements in vector x - * @param [in] q0 quantizer - * @param [out] g raw gain - * @param [in] beta activity masking beta param - * @param [in] bshift shift to be applied to raw gain - * @return quantized/companded gain - */ -od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g, - od_val16 beta, int bshift) { - int i; - od_val32 acc; -#if !defined(OD_FLOAT_PVQ) - od_val32 irt; - int sqrt_shift; -#else - OD_UNUSED(bshift); -#endif - acc = 0; - for (i = 0; i < n; i++) { - acc += x[i]*(od_val32)x[i]; - } -#if defined(OD_FLOAT_PVQ) - *g = sqrt(acc); -#else - irt = od_sqrt(acc, &sqrt_shift); - *g = OD_VSHR_ROUND(irt, sqrt_shift - bshift); -#endif - /* Normalize gain by quantization step size and apply companding - (if ACTIVITY != 1). 
*/ - return od_gain_compand(*g, q0, beta); -} - -/** Compute theta quantization range from quantized/companded gain - * - * @param [in] qcg quantized companded gain value - * @param [in] beta activity masking beta param - * @return max theta value - */ -int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta){ - /* Set angular resolution (in ra) to match the encoded gain */ -#if defined(OD_FLOAT_PVQ) - int ts = (int)floor(.5 + qcg*OD_CGAIN_SCALE_1*M_PI/(2*beta)); -#else - int ts = OD_SHR_ROUND(qcg*OD_MULT16_16_QBETA(OD_QCONST32(M_PI/2, - OD_CGAIN_SHIFT), od_beta_rcp(beta)), OD_CGAIN_SHIFT*2); -#endif - /* Special case for low gains -- will need to be tuned anyway */ - if (qcg < OD_QCONST32(1.4, OD_CGAIN_SHIFT)) ts = 1; - return ts; -} - -/** Decode quantized theta value from coded value - * - * @param [in] t quantized companded gain value - * @param [in] max_theta maximum theta value - * @return decoded theta value - */ -od_val32 od_pvq_compute_theta(int t, int max_theta) { - if (max_theta != 0) { -#if defined(OD_FLOAT_PVQ) - return OD_MINI(t, max_theta - 1)*.5*M_PI/max_theta; -#else - return (OD_MAX_THETA_SCALE*OD_MINI(t, max_theta - 1) - + (max_theta >> 1))/max_theta; -#endif - } - else return 0; -} - -#define OD_SQRT_TBL_SHIFT (10) - -#define OD_ITHETA_SHIFT 15 -/** Compute the number of pulses used for PVQ encoding a vector from - * available metrics (encode and decode side) - * - * @param [in] qcg quantized companded gain value - * @param [in] itheta quantized PVQ error angle theta - * @param [in] noref indicates present or lack of reference - * (prediction) - * @param [in] n number of elements to be coded - * @param [in] beta activity masking beta param - * @return number of pulses to use for coding - */ -int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n, - od_val16 beta) { -#if !defined(OD_FLOAT_PVQ) - /*Lookup table for sqrt(n+3/2) and sqrt(n+2/2) in Q10. - Real max values are 32792 and 32784, but clamped to stay within 16 bits. 
- Update with tools/gen_sqrt_tbl if needed.*/ - static const od_val16 od_sqrt_table[2][13] = { - {0, 0, 0, 0, 2290, 2985, 4222, 0, 8256, 0, 16416, 0, 32767}, - {0, 0, 0, 0, 2401, 3072, 4284, 0, 8287, 0, 16432, 0, 32767}}; -#endif - if (noref) { - if (qcg == 0) return 0; - if (n == 15 && qcg == OD_CGAIN_SCALE && beta > OD_BETA(1.25)) { - return 1; - } - else { -#if defined(OD_FLOAT_PVQ) - return OD_MAXI(1, (int)floor(.5 + (qcg*OD_CGAIN_SCALE_1 - .2)* - sqrt((n + 3)/2)/beta)); -#else - od_val16 rt; - OD_ASSERT(OD_ILOG(n + 1) < 13); - rt = od_sqrt_table[1][OD_ILOG(n + 1)]; - /*FIXME: get rid of 64-bit mul.*/ - return OD_MAXI(1, OD_SHR_ROUND((int64_t)((qcg - - (int64_t)OD_QCONST32(.2, OD_CGAIN_SHIFT))* - OD_MULT16_16_QBETA(od_beta_rcp(beta), rt)), OD_CGAIN_SHIFT - + OD_SQRT_TBL_SHIFT)); -#endif - } - } - else { - if (itheta == 0) return 0; - /* Sets K according to gain and theta, based on the high-rate - PVQ distortion curves (see PVQ document). Low-rate will have to be - perceptually tuned anyway. We subtract 0.2 from the radius as an - approximation for the fact that the coefficients aren't identically - distributed within a band so at low gain the number of dimensions that - are likely to have a pulse is less than n. */ -#if defined(OD_FLOAT_PVQ) - return OD_MAXI(1, (int)floor(.5 + (itheta - .2)*sqrt((n + 2)/2))); -#else - od_val16 rt; - OD_ASSERT(OD_ILOG(n + 1) < 13); - rt = od_sqrt_table[0][OD_ILOG(n + 1)]; - /*FIXME: get rid of 64-bit mul.*/ - return OD_MAXI(1, OD_VSHR_ROUND(((OD_SHL(itheta, OD_ITHETA_SHIFT) - - OD_QCONST32(.2, OD_ITHETA_SHIFT)))*(int64_t)rt, - OD_SQRT_TBL_SHIFT + OD_ITHETA_SHIFT)); -#endif - } -} - -#if !defined(OD_FLOAT_PVQ) -#define OD_RSQRT_INSHIFT 16 -#define OD_RSQRT_OUTSHIFT 14 -/** Reciprocal sqrt approximation where the input is in the range [0.25,1) in - Q16 and the output is in the range (1.0, 2.0] in Q14). 
- Error is always within +/1 of round(1/sqrt(t))*/ -static int16_t od_rsqrt_norm(int16_t t) -{ - int16_t n; - int32_t r; - int32_t r2; - int32_t ry; - int32_t y; - int32_t ret; - /* Range of n is [-16384,32767] ([-0.5,1) in Q15).*/ - n = t - 32768; - OD_ASSERT(n >= -16384); - /*Get a rough initial guess for the root. - The optimal minimax quadratic approximation (using relative error) is - r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485). - Coefficients here, and the final result r, are Q14.*/ - r = (23565 + OD_MULT16_16_Q15(n, (-13481 + OD_MULT16_16_Q15(n, 6711)))); - /*We want y = t*r*r-1 in Q15, but t is 32-bit Q16 and r is Q14. - We can compute the result from n and r using Q15 multiplies with some - adjustment, carefully done to avoid overflow.*/ - r2 = r*r; - y = (((r2 >> 15)*n + r2) >> 12) - 131077; - ry = r*y; - /*Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5). - This yields the Q14 reciprocal square root of the Q16 t, with a maximum - relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a peak - absolute error of 2.26591/16384.*/ - ret = r + ((((ry >> 16)*(3*y) >> 3) - ry) >> 18); - OD_ASSERT(ret >= 16384 && ret < 32768); - return (int16_t)ret; -} - -static int16_t od_rsqrt(int32_t x, int *rsqrt_shift) -{ - int k; - int s; - int16_t t; - k = (OD_ILOG(x) - 1) >> 1; - /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s). - Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/ - s = 2*k - (OD_RSQRT_INSHIFT - 2); - t = OD_VSHR(x, s); - /*We want to express od_rsqrt() in terms of od_rsqrt_norm(), which is - defined as (2^OUTSHIFT)/sqrt(t*(2^-INSHIFT)) with t=x*(2^-s). - This simplifies to 2^(OUTSHIFT+(INSHIFT/2)+(s/2))/sqrt(x), so the caller - needs to shift right by OUTSHIFT + INSHIFT/2 + s/2.*/ - *rsqrt_shift = OD_RSQRT_OUTSHIFT + ((s + OD_RSQRT_INSHIFT) >> 1); - return od_rsqrt_norm(t); -} -#endif - -/** Synthesizes one parition of coefficient values from a PVQ-encoded - * vector. 
This 'partial' version is called by the encode loop where - * the Householder reflection has already been computed and there's no - * need to recompute it. - * - * @param [out] xcoeff output coefficient partition (x in math doc) - * @param [in] ypulse PVQ-encoded values (y in the math doc); in - * the noref case, this vector has n entries, - * in the reference case it contains n-1 entries - * (the m-th entry is not included) - * @param [in] r reference vector (prediction) - * @param [in] n number of elements in this partition - * @param [in] noref indicates presence or lack of prediction - * @param [in] g decoded quantized vector gain - * @param [in] theta decoded theta (prediction error) - * @param [in] m alignment dimension of Householder reflection - * @param [in] s sign of Householder reflection - * @param [in] qm_inv inverse of the QM with magnitude compensation - */ -void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse, - const od_val16 *r16, int n, int noref, od_val32 g, od_val32 theta, int m, int s, - const int16_t *qm_inv) { - int i; - int yy; - od_val32 scale; - int nn; -#if !defined(OD_FLOAT_PVQ) - int gshift; - int qshift; -#endif - OD_ASSERT(g != 0); - nn = n-(!noref); /* when noref==0, vector in is sized n-1 */ - yy = 0; - for (i = 0; i < nn; i++) - yy += ypulse[i]*(int32_t)ypulse[i]; -#if !defined(OD_FLOAT_PVQ) - /* Shift required for the magnitude of the pre-qm synthesis to be guaranteed - to fit in 16 bits. In practice, the range will be 8192-16384 after scaling - most of the time. 
*/ - gshift = OD_MAXI(0, OD_ILOG(g) - 14); -#endif - /*scale is g/sqrt(yy) in Q(16-gshift) so that x[]*scale has a norm that fits - in 16 bits.*/ - if (yy == 0) scale = 0; -#if defined(OD_FLOAT_PVQ) - else { - scale = g/sqrt(yy); - } -#else - else { - int rsqrt_shift; - int16_t rsqrt; - /*FIXME: should be < int64_t*/ - int64_t tmp; - rsqrt = od_rsqrt(yy, &rsqrt_shift); - tmp = rsqrt*(int64_t)g; - scale = OD_VSHR_ROUND(tmp, rsqrt_shift + gshift - 16); - } - /* Shift to apply after multiplying by the inverse QM, taking into account - gshift. */ - qshift = OD_QM_INV_SHIFT - gshift; -#endif - if (noref) { - for (i = 0; i < n; i++) { - od_val32 x; - /* This multiply doesn't round, so it introduces some bias. - It would be nice (but not critical) to fix this. */ - x = (od_val32)OD_MULT16_32_Q16(ypulse[i], scale); -#if defined(OD_FLOAT_PVQ) - xcoeff[i] = (od_coeff)floor(.5 - + x*(qm_inv[i]*OD_QM_INV_SCALE_1)); -#else - xcoeff[i] = OD_SHR_ROUND(x*qm_inv[i], qshift); -#endif - } - } - else{ - od_val16 x[MAXN]; - scale = OD_ROUND32(scale*OD_TRIG_SCALE_1*od_pvq_sin(theta)); - /* The following multiply doesn't round, but it's probably OK since - the Householder reflection is likely to undo most of the resulting - bias. */ - for (i = 0; i < m; i++) - x[i] = OD_MULT16_32_Q16(ypulse[i], scale); - x[m] = OD_ROUND16(-s*(OD_SHR_ROUND(g, gshift))*OD_TRIG_SCALE_1* - od_pvq_cos(theta)); - for (i = m; i < nn; i++) - x[i+1] = OD_MULT16_32_Q16(ypulse[i], scale); - od_apply_householder(x, x, r16, n); - for (i = 0; i < n; i++) { -#if defined(OD_FLOAT_PVQ) - xcoeff[i] = (od_coeff)floor(.5 + (x[i]*(qm_inv[i]*OD_QM_INV_SCALE_1))); -#else - xcoeff[i] = OD_SHR_ROUND(x[i]*qm_inv[i], qshift); -#endif - } - } -} diff --git a/third_party/aom/av1/common/pvq.h b/third_party/aom/av1/common/pvq.h deleted file mode 100644 index 4adf22f02..000000000 --- a/third_party/aom/av1/common/pvq.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#if !defined(_pvq_H) -# define _pvq_H (1) -# include "generic_code.h" -# include "odintrin.h" - -extern const uint16_t EXP_CDF_TABLE[][16]; -extern const uint16_t LAPLACE_OFFSET[]; - -#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0) - -# define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1)) - -# define OD_NOREF_ADAPT_SPEED (4) -/* Normalized lambda for PVQ quantizer. Since we normalize the gain by q, the - distortion is normalized by q^2 and lambda does not need the q^2 factor. - At high rate, this would be log(2)/6, but we're using a slightly more - aggressive value, closer to: - Li, Xiang, et al. "Laplace distribution based Lagrangian rate distortion - optimization for hybrid video coding." Circuits and Systems for Video - Technology, IEEE Transactions on 19.2 (2009): 193-205. - */ -# define OD_PVQ_LAMBDA (.1146) - -#define OD_PVQ_SKIP_ZERO 1 -#define OD_PVQ_SKIP_COPY 2 - -/* Maximum size for coding a PVQ band. 
*/ -#define OD_MAX_PVQ_SIZE (1024) - -#if defined(OD_FLOAT_PVQ) -#define OD_QM_SHIFT (15) -#else -#define OD_QM_SHIFT (11) -#endif -#define OD_QM_SCALE (1 << OD_QM_SHIFT) -#if defined(OD_FLOAT_PVQ) -#define OD_QM_SCALE_1 (1./OD_QM_SCALE) -#endif -#define OD_QM_SCALE_MAX 32767 -#define OD_QM_INV_SHIFT (12) -#define OD_QM_INV_SCALE (1 << OD_QM_INV_SHIFT) -#if defined(OD_FLOAT_PVQ) -#define OD_QM_INV_SCALE_1 (1./OD_QM_INV_SCALE) -#endif -#define OD_QM_OFFSET(bs) ((((1 << 2*bs) - 1) << 2*OD_LOG_BSIZE0)/3) -#define OD_QM_STRIDE (OD_QM_OFFSET(OD_TXSIZES)) -#define OD_QM_BUFFER_SIZE (2*OD_QM_STRIDE) - -#if !defined(OD_FLOAT_PVQ) -#define OD_THETA_SHIFT (15) -#define OD_THETA_SCALE ((1 << OD_THETA_SHIFT)*2./M_PI) -#define OD_MAX_THETA_SCALE (1 << OD_THETA_SHIFT) -#define OD_TRIG_SCALE (32768) -#define OD_BETA_SHIFT (12) -#define OD_BETA_SCALE_1 (1./(1 << OD_BETA_SHIFT)) -/*Multiplies 16-bit a by 32-bit b and keeps bits [16:64-OD_BETA_SHIFT-1].*/ -#define OD_MULT16_32_QBETA(a, b) \ - ((int16_t)(a)*(int64_t)(int32_t)(b) >> OD_BETA_SHIFT) -# define OD_MULT16_16_QBETA(a, b) \ - ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> OD_BETA_SHIFT) -#define OD_CGAIN_SHIFT (8) -#define OD_CGAIN_SCALE (1 << OD_CGAIN_SHIFT) -#else -#define OD_BETA_SCALE_1 (1.) -#define OD_THETA_SCALE (1) -#define OD_TRIG_SCALE (1) -#define OD_CGAIN_SCALE (1) -#endif -#define OD_THETA_SCALE_1 (1./OD_THETA_SCALE) -#define OD_TRIG_SCALE_1 (1./OD_TRIG_SCALE) -#define OD_CGAIN_SCALE_1 (1./OD_CGAIN_SCALE) -#define OD_CGAIN_SCALE_2 (OD_CGAIN_SCALE_1*OD_CGAIN_SCALE_1) - -/* Largest PVQ partition is half the coefficients of largest block size. 
*/ -#define MAXN (OD_TXSIZE_MAX*OD_TXSIZE_MAX/2) - -#define OD_COMPAND_SHIFT (8 + OD_COEFF_SHIFT) -#define OD_COMPAND_SCALE (1 << OD_COMPAND_SHIFT) -#define OD_COMPAND_SCALE_1 (1./OD_COMPAND_SCALE) - -#define OD_QM_SIZE (OD_TXSIZES*(OD_TXSIZES + 1)) - -#define OD_FLAT_QM 0 -#define OD_HVS_QM 1 - -# define OD_NSB_ADAPT_CTXS (4) - -# define OD_ADAPT_K_Q8 0 -# define OD_ADAPT_SUM_EX_Q8 1 -# define OD_ADAPT_COUNT_Q8 2 -# define OD_ADAPT_COUNT_EX_Q8 3 - -# define OD_ADAPT_NO_VALUE (-2147483647-1) - -typedef enum { - PVQ_SKIP = 0x0, - DC_CODED = 0x1, - AC_CODED = 0x2, - AC_DC_CODED = 0x3, -} PVQ_SKIP_TYPE; - -typedef struct od_pvq_adapt_ctx od_pvq_adapt_ctx; -typedef struct od_pvq_codeword_ctx od_pvq_codeword_ctx; - -struct od_pvq_codeword_ctx { - int pvq_adapt[2*OD_TXSIZES*OD_NSB_ADAPT_CTXS]; - /* CDFs are size 16 despite the fact that we're using less than that. */ - uint16_t pvq_k1_cdf[12][CDF_SIZE(16)]; - uint16_t pvq_split_cdf[22*7][CDF_SIZE(8)]; -}; - -struct od_pvq_adapt_ctx { - od_pvq_codeword_ctx pvq_codeword_ctx; - generic_encoder pvq_param_model[3]; - int pvq_ext[OD_TXSIZES*PVQ_MAX_PARTITIONS]; - int pvq_exg[OD_NPLANES_MAX][OD_TXSIZES][PVQ_MAX_PARTITIONS]; - uint16_t pvq_gaintheta_cdf[2*OD_TXSIZES*PVQ_MAX_PARTITIONS][CDF_SIZE(16)]; - uint16_t pvq_skip_dir_cdf[2*(OD_TXSIZES-1)][CDF_SIZE(7)]; -}; - -typedef struct od_qm_entry { - int interp_q; - int scale_q8; - const unsigned char *qm_q4; -} od_qm_entry; - -extern const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX]; - -void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe); -int od_pvq_size_ctx(int n); -int od_pvq_k1_ctx(int n, int orig_size); - -od_val16 od_pvq_sin(od_val32 x); -od_val16 od_pvq_cos(od_val32 x); -#if !defined(OD_FLOAT_PVQ) -int od_vector_log_mag(const od_coeff *x, int n); -#endif - -void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1, - const od_qm_entry *entry2); - -int od_qm_get_index(int bs, int band); - -extern const od_val16 *const 
OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1]; - -void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm); -int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign, - int shift); -void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r, - int n); -void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse, - const od_val16 *r, int n, - int noref, od_val32 g, - od_val32 theta, int m, int s, - const int16_t *qm_inv); -od_val32 od_gain_expand(od_val32 cg, int q0, od_val16 beta); -od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g, - od_val16 beta, int bshift); -int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta); -od_val32 od_pvq_compute_theta(int t, int max_theta); -int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n, od_val16 beta); - -int od_vector_is_null(const od_coeff *x, int len); -int od_qm_offset(int bs, int xydec); - -#endif diff --git a/third_party/aom/av1/common/pvq_state.c b/third_party/aom/av1/common/pvq_state.c deleted file mode 100644 index 197b9b3a8..000000000 --- a/third_party/aom/av1/common/pvq_state.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "av1/common/pvq_state.h" -#include "av1/common/odintrin.h" - -void od_adapt_ctx_reset(od_adapt_ctx *adapt, int is_keyframe) { - int pli; - od_adapt_pvq_ctx_reset(&adapt->pvq, is_keyframe); - OD_CDFS_INIT_Q15(adapt->skip_cdf); - for (pli = 0; pli < OD_NPLANES_MAX; pli++) { - int i; - OD_CDFS_INIT_DYNAMIC(adapt->model_dc[pli].cdf); - for (i = 0; i < OD_TXSIZES; i++) { - int j; - adapt->ex_g[pli][i] = 8; - for (j = 0; j < 3; j++) { - adapt->ex_dc[pli][i][j] = pli > 0 ? 8 : 32768; - } - } - } -} - -void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe, int bo, - int n, int w) { - int i; - int j; - if (is_keyframe) { - for (i = 0; i < n; i++) { - for (j = 0; j < n; j++) { - /* skip DC */ - if (i || j) d[bo + i * w + j] = 0; - } - } - } else { - for (i = 0; i < n; i++) { - for (j = 0; j < n; j++) { - d[bo + i * w + j] = pred[i * n + j]; - } - } - } -} diff --git a/third_party/aom/av1/common/pvq_state.h b/third_party/aom/av1/common/pvq_state.h deleted file mode 100644 index 84d454e70..000000000 --- a/third_party/aom/av1/common/pvq_state.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_state_H) -# define _state_H (1) - -typedef struct od_state od_state; -typedef struct od_adapt_ctx od_adapt_ctx; - -# include "generic_code.h" -# include "odintrin.h" -# include "pvq.h" - -/*Adaptation speed of scalar Laplace encoding.*/ -# define OD_SCALAR_ADAPT_SPEED (4) - -struct od_adapt_ctx { - /* Support for PVQ encode/decode */ - od_pvq_adapt_ctx pvq; - - generic_encoder model_dc[OD_NPLANES_MAX]; - - int ex_dc[OD_NPLANES_MAX][OD_TXSIZES][3]; - int ex_g[OD_NPLANES_MAX][OD_TXSIZES]; - - /* Joint skip flag for DC and AC */ - uint16_t skip_cdf[OD_TXSIZES*2][CDF_SIZE(4)]; -}; - -struct od_state { - od_adapt_ctx *adapt; - unsigned char pvq_qm_q4[OD_NPLANES_MAX][OD_QM_SIZE]; - /* Quantization matrices and their inverses. */ - int16_t qm[OD_QM_BUFFER_SIZE]; - int16_t qm_inv[OD_QM_BUFFER_SIZE]; -}; - -void od_adapt_ctx_reset(od_adapt_ctx *state, int is_keyframe); -void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe, - int bo, int n, int w); - -#endif diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c index ea7140cdc..84575d74b 100644 --- a/third_party/aom/av1/common/quant_common.c +++ b/third_party/aom/av1/common/quant_common.c @@ -16,111 +16,7 @@ #include "av1/common/seg_common.h" #include "av1/common/blockd.h" -#if CONFIG_NEW_QUANT -// Bin widths expressed as a fraction over 128 of the quant stepsize, -// for the quantization bins 0-4. -// So a value x indicates the bin is actually factor x/128 of the -// nominal quantization step. For the zero bin, the width is only -// for one side of zero, so the actual width is twice that. 
-// -// Functions with nuq correspond to "non uniform quantization" -// TODO(sarahparker, debargha): Optimize these tables - -typedef struct { - uint8_t knots[NUQ_KNOTS]; // offsets - uint8_t doff; // dequantization -} qprofile_type; - -static const qprofile_type nuq[QUANT_PROFILES][COEF_BANDS] = { - { - // lossless - { { 64, 128, 128 }, 0 }, // dc, band 0 - { { 64, 128, 128 }, 0 }, // band 1 - { { 64, 128, 128 }, 0 }, // band 2 - { { 64, 128, 128 }, 0 }, // band 3 - { { 64, 128, 128 }, 0 }, // band 4 - { { 64, 128, 128 }, 0 }, // band 5 - }, - { - { { 64, 128, 128 }, 4 }, // dc, band 0 - { { 64, 128, 128 }, 6 }, // band 1 - { { 64, 128, 128 }, 8 }, // band 2 - { { 64, 128, 128 }, 10 }, // band 3 - { { 72, 128, 128 }, 12 }, // band 4 - { { 80, 128, 128 }, 14 } // band 5 - }, - { - { { 64, 128, 128 }, 6 }, // dc, band 0 - { { 64, 128, 128 }, 8 }, // band 1 - { { 64, 128, 128 }, 10 }, // band 2 - { { 64, 128, 128 }, 12 }, // band 3 - { { 72, 128, 128 }, 14 }, // band 4 - { { 80, 128, 128 }, 16 } // band 5 - }, - { - { { 64, 128, 128 }, 8 }, // dc, band 0 - { { 64, 128, 128 }, 10 }, // band 1 - { { 64, 128, 128 }, 12 }, // band 2 - { { 72, 128, 128 }, 14 }, // band 3 - { { 76, 128, 128 }, 16 }, // band 4 - { { 80, 128, 128 }, 18 } // band 5 - } -}; - -static const uint8_t *get_nuq_knots(int band, int q_profile) { - return nuq[q_profile][band].knots; -} - -static INLINE int16_t quant_to_doff_fixed(int band, int q_profile) { - return nuq[q_profile][band].doff; -} - -// get cumulative bins -static INLINE void get_cuml_bins_nuq(int q, int band, tran_low_t *cuml_bins, - int q_profile) { - const uint8_t *knots = get_nuq_knots(band, q_profile); - int16_t cuml_knots[NUQ_KNOTS]; - int i; - cuml_knots[0] = knots[0]; - for (i = 1; i < NUQ_KNOTS; ++i) cuml_knots[i] = cuml_knots[i - 1] + knots[i]; - for (i = 0; i < NUQ_KNOTS; ++i) - cuml_bins[i] = ROUND_POWER_OF_TWO(cuml_knots[i] * q, 7); -} - -void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq, - tran_low_t *cuml_bins, 
int q_profile) { - const uint8_t *knots = get_nuq_knots(band, q_profile); - tran_low_t cuml_bins_[NUQ_KNOTS], *cuml_bins_ptr; - tran_low_t doff; - int i; - cuml_bins_ptr = (cuml_bins ? cuml_bins : cuml_bins_); - get_cuml_bins_nuq(q, band, cuml_bins_ptr, q_profile); - dq[0] = 0; - for (i = 1; i < NUQ_KNOTS; ++i) { - doff = quant_to_doff_fixed(band, q_profile); - doff = ROUND_POWER_OF_TWO(doff * knots[i], 7); - dq[i] = - cuml_bins_ptr[i - 1] + ROUND_POWER_OF_TWO((knots[i] - doff * 2) * q, 8); - } - doff = quant_to_doff_fixed(band, q_profile); - dq[NUQ_KNOTS] = - cuml_bins_ptr[NUQ_KNOTS - 1] + ROUND_POWER_OF_TWO((64 - doff) * q, 7); -} - -tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) { - if (v <= NUQ_KNOTS) - return dq[v]; - else - return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q; -} - -tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) { - tran_low_t dqmag = av1_dequant_abscoeff_nuq(abs(v), q, dq); - return (v < 0 ? -dqmag : dqmag); -} -#endif // CONFIG_NEW_QUANT - -static const int16_t dc_qlookup[QINDEX_RANGE] = { +static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, @@ -142,8 +38,7 @@ static const int16_t dc_qlookup[QINDEX_RANGE] = { 1184, 1232, 1282, 1336, }; -#if CONFIG_HIGHBITDEPTH -static const int16_t dc_qlookup_10[QINDEX_RANGE] = { +static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = { 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, @@ -166,7 +61,7 @@ static const int16_t dc_qlookup_10[QINDEX_RANGE] = { 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, }; -static const int16_t dc_qlookup_12[QINDEX_RANGE] = { +static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = { 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 
194, 208, 222, 237, 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, @@ -192,9 +87,8 @@ static const int16_t dc_qlookup_12[QINDEX_RANGE] = { 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, }; -#endif -static const int16_t ac_qlookup[QINDEX_RANGE] = { +static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = { 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, @@ -217,8 +111,7 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = { 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -#if CONFIG_HIGHBITDEPTH -static const int16_t ac_qlookup_10[QINDEX_RANGE] = { +static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = { 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, @@ -241,7 +134,7 @@ static const int16_t ac_qlookup_10[QINDEX_RANGE] = { 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, }; -static const int16_t ac_qlookup_12[QINDEX_RANGE] = { +static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = { 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, @@ -267,64 +160,88 @@ static const int16_t ac_qlookup_12[QINDEX_RANGE] = { 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, }; -#endif -int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth) { -#if CONFIG_HIGHBITDEPTH +// Coefficient scaling and quantization with AV1 TX are tailored to +// the AV1 TX transforms. Regardless of the bit-depth of the input, +// the transform stages scale the coefficient values up by a factor of +// 8 (3 bits) over the scale of the pixel values. 
Thus, for 8-bit +// input, the coefficients have effectively 11 bits of scale depth +// (8+3), 10-bit input pixels result in 13-bit coefficient depth +// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. +// All quantizers are built using this invariant of x8, 3-bit scaling, +// thus the Q3 suffix. + +// A partial exception to this rule is large transforms; to avoid +// overflow, TX blocks with > 256 pels (>16x16) are scaled only +// 4-times unity (2 bits) over the pixel depth, and TX blocks with +// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). +// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 +// and 32x32 transforms actually return Q2 coefficients, and 32x64, +// 64x32 and 64x64 transforms return Q1 coefficients. However, the +// quantizers are de-scaled down on-the-fly by the same amount +// (av1_tx_get_scale()) during quantization, and as such the +// dequantized/decoded coefficients, even for large TX blocks, are always +// effectively Q3. Meanwhile, quantized/coded coefficients are Q0 +// because Qn quantizers are applied to Qn tx coefficients. + +// Note that encoder decision making (which uses the quantizer to +// generate several bespoke lamdas for RDO and other heuristics) +// expects quantizers to be larger for higher-bitdepth input. In +// addition, the minimum allowable quantizer is 4; smaller values will +// underflow to 0 in the actual quantization routines. 
+ +int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { switch (bit_depth) { - case AOM_BITS_8: return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; -#endif } -int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth) { -#if CONFIG_HIGHBITDEPTH +int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { switch (bit_depth) { - case AOM_BITS_8: return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; -#endif } -int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth) { +// In AV1 TX, the coefficients are always scaled up a factor of 8 (3 +// bits), so QTX == Q3. 
+ +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + return av1_dc_quant_Q3(qindex, delta, bit_depth); +} + +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + return av1_ac_quant_Q3(qindex, delta, bit_depth); +} + +int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) { int i; - const int16_t *tab = ac_qlookup; - ac *= 4; -#if CONFIG_HIGHBITDEPTH + const int16_t *tab = ac_qlookup_Q3; switch (bit_depth) { case AOM_BITS_10: { - tab = ac_qlookup_10; - ac *= 4; + tab = ac_qlookup_10_Q3; break; } case AOM_BITS_12: { - tab = ac_qlookup_12; - ac *= 16; + tab = ac_qlookup_12_Q3; break; } default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#endif (void)bit_depth; for (i = 0; i < QINDEX_RANGE; i++) { - if (ac <= tab[i]) return i; + if (ac_Q3 <= tab[i]) return i; } return QINDEX_RANGE - 1; } @@ -333,55 +250,47 @@ int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - const int seg_qindex = - seg->abs_delta == SEGMENT_ABSDATA ? 
data : base_qindex + data; + const int seg_qindex = base_qindex + data; return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } } -#if CONFIG_AOM_QM -qm_val_t *aom_iqmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma, - TX_SIZE tx_size, int is_intra) { - return &cm->giqmatrix[qmlevel][!!is_chroma][!!is_intra][tx_size][0]; +const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane, + TX_SIZE tx_size) { + return &cm->giqmatrix[qmlevel][plane][tx_size][0]; } -qm_val_t *aom_qmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma, - TX_SIZE tx_size, int is_intra) { - return &cm->gqmatrix[qmlevel][!!is_chroma][!!is_intra][tx_size][0]; +const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane, + TX_SIZE tx_size) { + return &cm->gqmatrix[qmlevel][plane][tx_size][0]; } -#if CONFIG_CHROMA_2X2 -#define QM_TOTAL_SIZE 3348 -#else #define QM_TOTAL_SIZE 3344 -#endif -static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; -static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; -void aom_qm_init(AV1_COMMON *cm) { - int q, c, f, t; +void av1_qm_init(AV1_COMMON *cm) { + const int num_planes = av1_num_planes(cm); + int q, c, t; int current; for (q = 0; q < NUM_QM_LEVELS; ++q) { - for (c = 0; c < 2; ++c) { - for (f = 0; f < 2; ++f) { - current = 0; - for (t = 0; t < TX_SIZES_ALL; ++t) { - const int size = tx_size_2d[t]; - // Don't use QM for sizes > 32x32 - if (q == NUM_QM_LEVELS - 1 || size > 1024) { - cm->gqmatrix[q][c][f][t] = NULL; - cm->giqmatrix[q][c][f][t] = NULL; - } else { - assert(current + size <= QM_TOTAL_SIZE); - cm->gqmatrix[q][c][f][t] = &wt_matrix_ref[AOMMIN( - NUM_QM_LEVELS - 1, f == 0 ? q + DEFAULT_QM_INTER_OFFSET : q)][c] - [current]; - cm->giqmatrix[q][c][f][t] = &iwt_matrix_ref[AOMMIN( - NUM_QM_LEVELS - 1, f == 0 ? 
q + DEFAULT_QM_INTER_OFFSET : q)][c] - [current]; - current += size; - } + for (c = 0; c < num_planes; ++c) { + current = 0; + for (t = 0; t < TX_SIZES_ALL; ++t) { + const int size = tx_size_2d[t]; + const int qm_tx_size = av1_get_adjusted_tx_size(t); + if (q == NUM_QM_LEVELS - 1) { + cm->gqmatrix[q][c][t] = NULL; + cm->giqmatrix[q][c][t] = NULL; + } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' + cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size]; + cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size]; + } else { + assert(current + size <= QM_TOTAL_SIZE); + cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current]; + current += size; } } } @@ -399,13 +308,9 @@ void aom_qm_init(AV1_COMMON *cm) { frequency domain according to different nominal viewing distances. */ -static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 43, 86, 86, 166, -#endif /* Size 4x4 */ 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, /* Size 8x8 */ @@ -632,10 +537,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 50, 62, 62, 100, -#endif /* Size 4x4 */ 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, /* Size 8x8 */ @@ -848,10 +749,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 39, 82, 82, 155, -#endif /* Size 4x4 */ 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184, /* Size 8x8 */ @@ -1076,10 +973,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 
138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 48, 60, 60, 97, -#endif /* Size 4x4 */ 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, /* Size 8x8 */ @@ -1291,10 +1184,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 39, 76, 76, 140, -#endif /* Size 4x4 */ 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, /* Size 8x8 */ @@ -1515,10 +1404,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 48, 58, 58, 91, -#endif /* Size 4x4 */ 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, /* Size 8x8 */ @@ -1730,10 +1615,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 36, 71, 71, 134, -#endif /* Size 4x4 */ 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, /* Size 8x8 */ @@ -1953,10 +1834,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 47, 55, 55, 89, -#endif /* Size 4x4 */ 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, /* Size 8x8 */ @@ -2168,10 +2045,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 35, 63, 63, 117, -#endif /* Size 4x4 */ 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, /* Size 8x8 */ @@ -2387,10 +2260,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 
47, 52, 52, 82, -#endif /* Size 4x4 */ 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, /* Size 8x8 */ @@ -2601,10 +2470,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 35, 58, 58, 105, -#endif /* Size 4x4 */ 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, /* Size 8x8 */ @@ -2817,10 +2682,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 47, 50, 50, 76, -#endif /* Size 4x4 */ 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, /* Size 8x8 */ @@ -3031,10 +2892,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 34, 52, 52, 89, -#endif /* Size 4x4 */ 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, /* Size 8x8 */ @@ -3246,10 +3103,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 41, 48, 48, 69, -#endif /* Size 4x4 */ 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, /* Size 8x8 */ @@ -3460,10 +3313,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 47, 47, 75, -#endif /* Size 4x4 */ 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, /* Size 8x8 */ @@ -3673,10 +3522,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 39, 47, 47, 63, -#endif /* Size 4x4 */ 31, 41, 46, 51, 41, 48, 48, 
51, 46, 48, 58, 62, 51, 51, 62, 71, /* Size 8x8 */ @@ -3887,10 +3732,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 42, 42, 64, -#endif /* Size 4x4 */ 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, /* Size 8x8 */ @@ -4099,10 +3940,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 38, 45, 45, 59, -#endif /* Size 4x4 */ 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, /* Size 8x8 */ @@ -4313,10 +4150,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 38, 38, 54, -#endif /* Size 4x4 */ 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, /* Size 8x8 */ @@ -4525,10 +4358,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 45, 45, 54, -#endif /* Size 4x4 */ 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, /* Size 8x8 */ @@ -4739,10 +4568,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 34, 34, 48, -#endif /* Size 4x4 */ 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, /* Size 8x8 */ @@ -4951,10 +4776,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 46, 46, 53, -#endif /* Size 4x4 */ 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, /* Size 8x8 */ @@ -5165,10 +4986,6 @@ static uint16_t 
iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 33, 33, 39, -#endif /* Size 4x4 */ 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, /* Size 8x8 */ @@ -5377,10 +5194,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 42, 42, 48, -#endif /* Size 4x4 */ 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, /* Size 8x8 */ @@ -5591,10 +5404,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 35, -#endif /* Size 4x4 */ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, /* Size 8x8 */ @@ -5803,10 +5612,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 38, 38, 46, -#endif /* Size 4x4 */ 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, /* Size 8x8 */ @@ -6017,10 +5822,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 33, -#endif /* Size 4x4 */ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, /* Size 8x8 */ @@ -6229,10 +6030,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 33, 33, 36, -#endif /* Size 4x4 */ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, /* Size 8x8 */ @@ -6443,10 +6240,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 31, 31, 
32, -#endif /* Size 4x4 */ 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, /* Size 8x8 */ @@ -6655,10 +6448,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 31, 31, 31, -#endif /* Size 4x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, /* Size 8x8 */ @@ -6869,10 +6658,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 32, -#endif /* Size 4x4 */ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 8x8 */ @@ -7081,10 +6866,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 32, -#endif /* Size 4x4 */ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 8x8 */ @@ -7295,13 +7076,9 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, }; -static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 24, 12, 12, 6, -#endif /* Size 4x4 */ 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, /* Size 8x8 */ @@ -7494,10 +7271,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 20, 17, 17, 10, -#endif /* Size 4x4 */ 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9, /* Size 8x8 */ @@ -7708,10 +7481,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 26, 12, 12, 
7, -#endif /* Size 4x4 */ 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, /* Size 8x8 */ @@ -7907,10 +7676,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 21, 17, 17, 11, -#endif /* Size 4x4 */ 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, /* Size 8x8 */ @@ -8121,10 +7886,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 26, 13, 13, 7, -#endif /* Size 4x4 */ 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, /* Size 8x8 */ @@ -8321,10 +8082,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 21, 18, 18, 11, -#endif /* Size 4x4 */ 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, /* Size 8x8 */ @@ -8535,10 +8292,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 28, 14, 14, 8, -#endif /* Size 4x4 */ 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, /* Size 8x8 */ @@ -8735,10 +8488,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 22, 19, 19, 12, -#endif /* Size 4x4 */ 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, /* Size 8x8 */ @@ -8949,10 +8698,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 29, 16, 16, 9, -#endif /* Size 4x4 */ 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, /* Size 8x8 */ 
@@ -9152,10 +8897,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 22, 20, 20, 12, -#endif /* Size 4x4 */ 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, /* Size 8x8 */ @@ -9366,10 +9107,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 29, 18, 18, 10, -#endif /* Size 4x4 */ 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, /* Size 8x8 */ @@ -9571,10 +9308,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 22, 20, 20, 13, -#endif /* Size 4x4 */ 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, /* Size 8x8 */ @@ -9785,10 +9518,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 30, 20, 20, 12, -#endif /* Size 4x4 */ 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, /* Size 8x8 */ @@ -9997,10 +9726,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 25, 21, 21, 15, -#endif /* Size 4x4 */ 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13, /* Size 8x8 */ @@ -10211,10 +9936,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 22, 22, 14, -#endif /* Size 4x4 */ 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, /* Size 8x8 */ @@ -10423,10 +10144,6 @@ static uint16_t 
wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 26, 22, 22, 16, -#endif /* Size 4x4 */ 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, /* Size 8x8 */ @@ -10637,10 +10354,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 24, 24, 16, -#endif /* Size 4x4 */ 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, /* Size 8x8 */ @@ -10849,10 +10562,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 27, 23, 23, 17, -#endif /* Size 4x4 */ 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, /* Size 8x8 */ @@ -11063,10 +10772,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 27, 27, 19, -#endif /* Size 4x4 */ 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, /* Size 8x8 */ @@ -11275,10 +10980,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 31, 23, 23, 19, -#endif /* Size 4x4 */ 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, /* Size 8x8 */ @@ -11489,10 +11190,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 30, 30, 21, -#endif /* Size 4x4 */ 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, /* Size 8x8 */ @@ -11701,10 +11398,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 
18, 18, 17, 16, 16 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 22, 22, 19, -#endif /* Size 4x4 */ 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, /* Size 8x8 */ @@ -11915,10 +11608,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 31, 31, 26, -#endif /* Size 4x4 */ 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, /* Size 8x8 */ @@ -12127,10 +11816,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 24, 24, 21, -#endif /* Size 4x4 */ 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, /* Size 8x8 */ @@ -12341,10 +12026,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 29, -#endif /* Size 4x4 */ 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, /* Size 8x8 */ @@ -12553,10 +12234,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 27, 27, 22, -#endif /* Size 4x4 */ 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, /* Size 8x8 */ @@ -12767,10 +12444,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 31, -#endif /* Size 4x4 */ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, /* Size 8x8 */ @@ -12979,10 +12652,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 31, 31, 28, -#endif /* Size 4x4 */ 
33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, /* Size 8x8 */ @@ -13193,10 +12862,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 33, 33, 32, -#endif /* Size 4x4 */ 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, /* Size 8x8 */ @@ -13405,10 +13070,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 33, 33, 33, 33, -#endif /* Size 4x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, /* Size 8x8 */ @@ -13619,10 +13280,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { }, { { /* Luma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 32, -#endif /* Size 4x4 */ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 8x8 */ @@ -13831,10 +13488,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ -#if CONFIG_CHROMA_2X2 - /* Size 2x2 */ - 32, 32, 32, 32, -#endif /* Size 4x4 */ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 8x8 */ @@ -14044,63 +13697,3 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 32, 32, 32, 32 }, }, }; -#endif - -#if CONFIG_PVQ -/* Quantization matrices for 8x8. For other block sizes, we currently just do - resampling. */ -/* Flat quantization, i.e. optimize for PSNR. */ -const int OD_QM8_Q4_FLAT[] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16 }; -#if 0 -/* M1: MPEG2 matrix for inter (which has a dead zone). 
*/ -const int OD_QM8_Q4[] = { - 16, 17, 18, 19, 20, 21, 22, 23, - 17, 18, 19, 20, 21, 22, 23, 24, - 18, 19, 20, 21, 22, 23, 24, 25, - 19, 20, 21, 22, 23, 24, 26, 27, - 20, 21, 22, 23, 25, 26, 27, 28, - 21, 22, 23, 24, 26, 27, 28, 30, - 22, 23, 24, 26, 27, 28, 30, 31, - 23, 24, 25, 27, 28, 30, 31, 33}; -#endif -#if 0 -/* M2: MPEG2 matrix for intra (no dead zone). */ -const int OD_QM8_Q4[] = { - 16, 16, 19, 22, 22, 26, 26, 27, - 16, 16, 22, 22, 26, 27, 27, 29, - 19, 22, 26, 26, 27, 29, 29, 35, - 22, 24, 27, 27, 29, 32, 34, 38, - 26, 27, 29, 29, 32, 35, 38, 46, - 27, 29, 34, 34, 35, 40, 46, 56, - 29, 34, 34, 37, 40, 48, 56, 69, - 34, 37, 38, 40, 48, 58, 69, 83 -}; -#endif -#if 0 -/* M3: Taken from dump_psnrhvs. */ -const int OD_QM8_Q4[] = { - 16, 16, 17, 20, 24, 29, 36, 42, - 16, 17, 17, 19, 22, 26, 31, 37, - 17, 17, 21, 23, 26, 30, 34, 40, - 20, 19, 23, 28, 31, 35, 39, 45, - 24, 22, 26, 31, 36, 41, 46, 51, - 29, 26, 30, 35, 41, 47, 52, 58, - 36, 31, 34, 39, 46, 52, 59, 66, - 42, 37, 40, 45, 51, 58, 66, 73 -}; -#endif -#if 1 -/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */ -const int OD_QM8_Q4_HVS[] = { 16, 16, 18, 21, 24, 28, 32, 36, 16, 17, 20, - 21, 24, 27, 31, 35, 18, 20, 24, 25, 27, 31, - 33, 38, 21, 21, 25, 28, 30, 34, 37, 42, 24, - 24, 27, 30, 34, 38, 43, 49, 28, 27, 31, 34, - 38, 44, 50, 58, 32, 31, 33, 37, 43, 50, 58, - 68, 36, 35, 38, 42, 49, 58, 68, 78 }; -#endif -#endif diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h index 92843fe4d..f9681036d 100644 --- a/third_party/aom/av1/common/quant_common.h +++ b/third_party/aom/av1/common/quant_common.h @@ -25,82 +25,37 @@ extern "C" { #define MAXQ 255 #define QINDEX_RANGE (MAXQ - MINQ + 1) #define QINDEX_BITS 8 -#if CONFIG_AOM_QM // Total number of QM sets stored #define QM_LEVEL_BITS 4 #define NUM_QM_LEVELS (1 << QM_LEVEL_BITS) /* Range of QMS is between first and last value, with offset applied to inter * blocks*/ +#define DEFAULT_QM_Y 10 
+#define DEFAULT_QM_U 11 +#define DEFAULT_QM_V 12 #define DEFAULT_QM_FIRST 5 #define DEFAULT_QM_LAST 9 -#define DEFAULT_QM_INTER_OFFSET 0 -#endif struct AV1Common; -int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth); -int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth); -int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth); +int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); +int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); +int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth); int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); -#if CONFIG_AOM_QM // Reduce the large number of quantizers to a smaller number of levels for which // different matrices may be defined static INLINE int aom_get_qmlevel(int qindex, int first, int last) { return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } -void aom_qm_init(struct AV1Common *cm); -qm_val_t *aom_iqmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size, int is_intra); -qm_val_t *aom_qmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size, int is_intra); -#endif - -#if CONFIG_NEW_QUANT - -#define QUANT_PROFILES 4 -#define QUANT_RANGES 2 -#define NUQ_KNOTS 3 - -typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1]; -typedef tran_low_t cuml_bins_type_nuq[NUQ_KNOTS]; -void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq, - tran_low_t *cuml_bins, int dq_off_index); -tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq); -tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq); - -static INLINE int qindex_to_qrange(int qindex) { - return (qindex < 140 ? 
1 : 0); -} - -static INLINE int get_dq_profile_from_ctx(int qindex, int q_ctx, int is_inter, - PLANE_TYPE plane_type) { - // intra/inter, Y/UV, ctx, qrange - static const int - def_dq_profile_lookup[REF_TYPES][PLANE_TYPES][COEFF_CONTEXTS0] - [QUANT_RANGES] = { - { - // intra - { { 2, 1 }, { 2, 1 }, { 2, 1 } }, // Y - { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV - }, - { - // inter - { { 3, 1 }, { 2, 1 }, { 2, 1 } }, // Y - { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV - }, - }; - if (!qindex) return 0; // lossless - return def_dq_profile_lookup[is_inter][plane_type][q_ctx] - [qindex_to_qrange(qindex)]; -} -#endif // CONFIG_NEW_QUANT - -#if CONFIG_PVQ -extern const int OD_QM8_Q4_FLAT[]; -extern const int OD_QM8_Q4_HVS[]; -#endif +void av1_qm_init(struct AV1Common *cm); +const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp, + TX_SIZE tx_size); +const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, + TX_SIZE tx_size); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c index a1a22a0af..b6ac436fb 100644 --- a/third_party/aom/av1/common/reconinter.c +++ b/third_party/aom/av1/common/reconinter.c @@ -13,208 +13,157 @@ #include #include -#include "./aom_scale_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" -#if CONFIG_MOTION_VAR #include "av1/common/onyxc_int.h" #include "av1/common/obmc.h" -#endif // CONFIG_MOTION_VAR -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION +#define USE_PRECOMPUTED_WEDGE_MASK 1 +#define USE_PRECOMPUTED_WEDGE_SIGN 1 + // This function will determine whether or not to create a warped -// prediction and return the appropriate motion model 
depending -// on the configuration. Behavior will change with different -// combinations of GLOBAL_MOTION, WARPED_MOTION and MOTION_VAR. -static INLINE int allow_warp(const MODE_INFO *const mi, - const WarpTypesAllowed *const warp_types, -#if CONFIG_GLOBAL_MOTION - const WarpedMotionParams *const gm_params, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_MOTION_VAR - int build_for_obmc, -#endif // CONFIG_MOTION_VAR - WarpedMotionParams *final_warp_params) { - const MB_MODE_INFO *const mbmi = &mi->mbmi; - *final_warp_params = default_warp_params; - -// Only global motion configured -#if CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR - (void)mbmi; - if (warp_types->global_warp_allowed) { - memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); - return 1; - } -#endif // CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR +// prediction. +int av1_allow_warp(const MB_MODE_INFO *const mbmi, + const WarpTypesAllowed *const warp_types, + const WarpedMotionParams *const gm_params, + int build_for_obmc, int x_scale, int y_scale, + WarpedMotionParams *final_warp_params) { + if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS) + return 0; -// Only warped motion configured -#if CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR - if (warp_types->local_warp_allowed) { - memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params)); - return 1; - } -#endif // CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR - -// Warped and global motion configured -#if CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR - // When both are enabled, warped will take priority. The global parameters - // will only be used to compute projection samples to find the warped model. - // Note that when a block chooses global, it will not be possible to - // select WARPED_CAUSAL. 
- if (warp_types->local_warp_allowed) { - memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params)); - return 1; - } else if (warp_types->global_warp_allowed) { - memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); - return 1; - } -#endif // CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR - -// Motion var and global motion configured -#if CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION - // We warp if either case is true: - // 1.) We are predicting a block which uses global motion - // 2.) We are predicting a neighboring block of a block using OBMC, - // the neighboring block uses global motion, and we have enabled - // WARP_GM_NEIGHBORS_WITH_OBMC - (void)mbmi; - if (warp_types->global_warp_allowed && - (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) { - memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); - return 1; - } -#endif // CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION - -// Motion var and warped motion configured -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION - // We warp if either case is true: - // 1.) We are predicting a block with motion mode WARPED_CAUSAL - // 2.) 
We are predicting a neighboring block of a block using OBMC, - // the neighboring block has mode WARPED_CAUSAL, and we have enabled - // WARP_WM_NEIGHBORS_WITH_OBMC - if (warp_types->local_warp_allowed) { - if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) { - memcpy(final_warp_params, &mbmi->wm_params[0], - sizeof(*final_warp_params)); - return 1; - } - } -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION + if (final_warp_params != NULL) *final_warp_params = default_warp_params; -// Motion var, warped motion and global motion all configured -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION - if (warp_types->local_warp_allowed) { - if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) { + if (build_for_obmc) return 0; + + if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) { + if (final_warp_params != NULL) memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params)); - return 1; - } - } else if (warp_types->global_warp_allowed && - (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) { - memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); + return 1; + } else if (warp_types->global_warp_allowed && !gm_params->invalid) { + if (final_warp_params != NULL) + memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); return 1; } -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION return 0; } -#endif // CONFIG_GLOBAL_MOTION ||CONFIG_WARPED_MOTION - -static INLINE void av1_make_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int subpel_x, const int subpel_y, const struct scale_factors *sf, - int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane, - int ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if 
CONFIG_MOTION_VAR - const MODE_INFO *mi, int build_for_obmc, -#endif - int xs, int ys, const MACROBLOCKD *xd) { - (void)xd; -#if !CONFIG_MOTION_VAR - const MODE_INFO *mi = xd->mi[0]; - (void)mi; -#endif // CONFIG_MOTION_VAR - -// Make sure the selected motion mode is valid for this configuration -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - assert_motion_mode_valid(mi->mbmi.motion_mode, -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); -#endif // CONFIG MOTION_VAR || CONFIG_WARPED_MOTION +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const SubpelParams *subpel_params, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + const MB_MODE_INFO *mi, int build_for_obmc, + const MACROBLOCKD *xd, int can_use_previous) { + // Make sure the selected motion mode is valid for this configuration + assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi, + can_use_previous); + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); -#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION WarpedMotionParams final_warp_params; - const int do_warp = allow_warp( - mi, warp_types, -#if CONFIG_GLOBAL_MOTION -#if CONFIG_COMPOUND_SINGLEREF - // TODO(zoeliu): To further check the single - // ref comp mode to work together with - // global motion. - has_second_ref(&mi->mbmi) ? 
&xd->global_motion[mi->mbmi.ref_frame[ref]] - : &xd->global_motion[mi->mbmi.ref_frame[0]], -#else // !(CONFIG_COMPOUND_SINGLEREF) - &xd->global_motion[mi->mbmi.ref_frame[ref]], -#endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_MOTION_VAR - build_for_obmc, -#endif // CONFIG_MOTION_VAR - &final_warp_params); - if (do_warp -#if CONFIG_AMVR - && xd->cur_frame_mv_precision_level == 0 -#endif - ) { + const int do_warp = + (w >= 8 && h >= 8 && + av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], + build_for_obmc, subpel_params->xs, subpel_params->ys, + &final_warp_params)); + if (do_warp && xd->cur_frame_force_integer_mv == 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const pre_buf = &pd->pre[ref]; av1_warp_plane(&final_warp_params, -#if CONFIG_HIGHBITDEPTH xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH pre_buf->buf0, pre_buf->width, pre_buf->height, pre_buf->stride, dst, p_col, p_row, w, h, dst_stride, - pd->subsampling_x, pd->subsampling_y, xs, ys, conv_params); - return; - } -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, - sf, w, h, conv_params, interp_filters, xs, ys, - xd->bd); - return; + pd->subsampling_x, pd->subsampling_y, conv_params); + } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, + w, h, conv_params, interp_filters, xd->bd); + } else { + inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, + conv_params, interp_filters); } -#endif // CONFIG_HIGHBITDEPTH - inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf, w, - h, conv_params, interp_filters, xs, ys); } -#define NSMOOTHERS 1 +#if USE_PRECOMPUTED_WEDGE_MASK +static const uint8_t 
wedge_master_oblique_odd[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, + 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, + 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, + 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { + if (shift >= 0) { + memcpy(dst + shift, src, width - shift); + memset(dst, src[0], shift); + } else { + shift = -shift; + memcpy(dst, src + shift, width - shift); + memset(dst + width - shift, src[width - 1], shift); + } +} +#endif // USE_PRECOMPUTED_WEDGE_MASK -// [smoother][negative][direction] +#if USE_PRECOMPUTED_WEDGE_SIGN +/* clang-format off */ DECLARE_ALIGNED(16, static uint8_t, - wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS] - [MASK_MASTER_SIZE * MASK_MASTER_SIZE]); - + wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 
1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used +}; +/* clang-format on */ +#else DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]); +#endif // USE_PRECOMPUTED_WEDGE_SIGN + +// [negative][direction] +DECLARE_ALIGNED( + 16, static uint8_t, + wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]); // 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound // on the sum of all mask sizes up to an including MAX_WEDGE_SQUARE. @@ -223,88 +172,6 @@ DECLARE_ALIGNED(16, static uint8_t, static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; -// Some unused wedge codebooks left temporarily to facilitate experiments. -// To be removed when settled. 
-/* -static wedge_code_type wedge_codebook_8_hgtw[8] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, - { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, -}; - -static wedge_code_type wedge_codebook_8_hltw[8] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, - { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, -}; - -static wedge_code_type wedge_codebook_8_heqw[8] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, - { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, -}; - -static const wedge_code_type wedge_codebook_32_hgtw[32] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, - { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, - { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 }, - { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 }, - { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 }, - { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 }, - { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 }, - { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 }, - { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 }, - { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 }, - { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 }, - { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, - { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 }, - { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 }, -}; - -static const wedge_code_type wedge_codebook_32_hltw[32] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { 
WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, - { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, - { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 }, - { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 }, - { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 }, - { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 }, - { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 }, - { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 }, - { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 }, - { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 }, - { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 }, - { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, - { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 }, - { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 }, -}; - -static const wedge_code_type wedge_codebook_32_heqw[32] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, - { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, - { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, - { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 }, - { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 }, - { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 }, - { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 }, - { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 }, - { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 }, - { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 }, - { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 }, - { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 }, - { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, - { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 }, - { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 }, -}; -*/ - static const wedge_code_type wedge_codebook_16_hgtw[16] = { { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, @@ -339,78 +206,37 @@ 
static const wedge_code_type wedge_codebook_16_heqw[16] = { }; const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#if CONFIG_WEDGE - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0, - wedge_masks[BLOCK_8X8] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0, - wedge_masks[BLOCK_8X16] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0, - wedge_masks[BLOCK_16X8] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0, - wedge_masks[BLOCK_16X16] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0, - wedge_masks[BLOCK_16X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0, - wedge_masks[BLOCK_32X16] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0, - wedge_masks[BLOCK_32X32] }, -#else - { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], wedge_masks[BLOCK_8X8] }, - { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0, + { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], wedge_masks[BLOCK_8X16] }, - { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0, + { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], wedge_masks[BLOCK_16X8] }, - { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0, + { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], wedge_masks[BLOCK_16X16] }, - { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0, + { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 
wedge_masks[BLOCK_16X32] }, - { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0, + { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], wedge_masks[BLOCK_32X16] }, - { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0, + { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], wedge_masks[BLOCK_32X32] }, -#endif // CONFIG_WEDGE - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#if CONFIG_EXT_PARTITION - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#endif // CONFIG_EXT_PARTITION -#if CONFIG_WEDGE - { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_4X16], 0, - wedge_masks[BLOCK_4X16] }, - { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X4], 0, - wedge_masks[BLOCK_16X4] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], 0, - wedge_masks[BLOCK_8X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 0, - wedge_masks[BLOCK_32X8] }, -#else - { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_4X16], 0, - wedge_masks[BLOCK_4X16] }, - { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X4], 0, - wedge_masks[BLOCK_16X4] }, - { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], 0, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], wedge_masks[BLOCK_8X32] }, - { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 0, + { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], wedge_masks[BLOCK_32X8] }, -#endif // CONFIG_WEDGE - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#if CONFIG_EXT_PARTITION - { 0, NULL, NULL, 0, NULL }, - { 0, NULL, NULL, 0, NULL }, -#endif // CONFIG_EXT_PARTITION + { 0, NULL, NULL, NULL 
}, + { 0, NULL, NULL, NULL }, }; static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, @@ -420,7 +246,6 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, const int bw = block_size_wide[sb_type]; const wedge_code_type *a = wedge_params_lookup[sb_type].codebook + wedge_index; - const int smoother = wedge_params_lookup[sb_type].smoother; int woff, hoff; const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index]; @@ -428,339 +253,231 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, wedge_index < (1 << get_wedge_bits_lookup(sb_type))); woff = (a->x_offset * bw) >> 3; hoff = (a->y_offset * bh) >> 3; - master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] + + master = wedge_mask_obl[neg ^ wsignflip][a->direction] + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + MASK_MASTER_SIZE / 2 - woff; return master; } -const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign, - BLOCK_SIZE sb_type, int offset_x, - int offset_y) { - const uint8_t *mask = - get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type); - if (mask) mask -= (offset_x + offset_y * MASK_MASTER_STRIDE); - return mask; -} - -#if CONFIG_COMPOUND_SEGMENT -static uint8_t *invert_mask(uint8_t *mask_inv_buffer, const uint8_t *const mask, - int h, int w, int stride) { - int i, j; - - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - mask_inv_buffer[i * stride + j] = - AOM_BLEND_A64_MAX_ALPHA - mask[i * stride + j]; - } - return mask_inv_buffer; -} -#endif // CONFIG_COMPOUND_SEGMENT - -const uint8_t *av1_get_compound_type_mask_inverse( - const INTERINTER_COMPOUND_DATA *const comp_data, -#if CONFIG_COMPOUND_SEGMENT - uint8_t *mask_buffer, int h, int w, int stride, -#endif - BLOCK_SIZE sb_type) { - assert(is_masked_compound_type(comp_data->interinter_compound_type)); - (void)sb_type; - switch (comp_data->interinter_compound_type) { -#if CONFIG_WEDGE - case COMPOUND_WEDGE: - return 
av1_get_contiguous_soft_mask(comp_data->wedge_index, - !comp_data->wedge_sign, sb_type); -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - return invert_mask(mask_buffer, comp_data->seg_mask, h, w, stride); -#endif // CONFIG_COMPOUND_SEGMENT - default: assert(0); return NULL; - } -} - const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { - assert(is_masked_compound_type(comp_data->interinter_compound_type)); + assert(is_masked_compound_type(comp_data->type)); (void)sb_type; - switch (comp_data->interinter_compound_type) { -#if CONFIG_WEDGE + switch (comp_data->type) { case COMPOUND_WEDGE: return av1_get_contiguous_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, sb_type); -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: return comp_data->seg_mask; -#endif // CONFIG_COMPOUND_SEGMENT + case COMPOUND_DIFFWTD: return comp_data->seg_mask; default: assert(0); return NULL; } } -#if CONFIG_COMPOUND_SEGMENT -#if COMPOUND_SEGMENT_TYPE == 0 -static void uniform_mask(uint8_t *mask, int which_inverse, BLOCK_SIZE sb_type, - int h, int w, int mask_val) { - int i, j; - int block_stride = block_size_wide[sb_type]; - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - mask[i * block_stride + j] = - which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - mask_val : mask_val; - } -} - -void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w) { - (void)src0; - (void)src1; - (void)src0_stride; - (void)src1_stride; - switch (mask_type) { - case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break; - case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break; - default: assert(0); - } -} - -#if CONFIG_HIGHBITDEPTH -void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, int bd) { - (void)src0; - (void)src1; - (void)src0_stride; - (void)src1_stride; - (void)bd; - switch (mask_type) { - case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break; - case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break; - default: assert(0); - } -} -#endif // CONFIG_HIGHBITDEPTH - -#elif COMPOUND_SEGMENT_TYPE == 1 -#define DIFF_FACTOR 16 - -#if CONFIG_CONVOLVE_ROUND -static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base, - const int32_t *src0, int src0_stride, - const int32_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, - ConvolveParams *conv_params, int bd) { +static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base, + const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, + int w, ConvolveParams *conv_params, int bd) { int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); int i, j, m, diff; - int block_stride = block_size_wide[sb_type]; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); diff = ROUND_POWER_OF_TWO(diff, round); m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); - mask[i * block_stride 
+ j] = - which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; + mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; } } } -static void build_compound_seg_mask_d32(uint8_t *mask, SEG_MASK_TYPE mask_type, - const int32_t *src0, int src0_stride, - const int32_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, - ConvolveParams *conv_params, int bd) { +void av1_build_compound_diffwtd_mask_d16_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { switch (mask_type) { case DIFFWTD_38: - diffwtd_mask_d32(mask, 0, 38, src0, src0_stride, src1, src1_stride, - sb_type, h, w, conv_params, bd); + diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); break; case DIFFWTD_38_INV: - diffwtd_mask_d32(mask, 1, 38, src0, src0_stride, src1, src1_stride, - sb_type, h, w, conv_params, bd); + diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); break; default: assert(0); } } -#endif static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w) { + const uint8_t *src1, int src1_stride, int h, int w) { int i, j, m, diff; - int block_stride = block_size_wide[sb_type]; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { diff = abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); - mask[i * block_stride + j] = - which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; } } } -void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w) { +void av1_build_compound_diffwtd_mask_c(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { switch (mask_type) { case DIFFWTD_38: - diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, sb_type, - h, w); + diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); break; case DIFFWTD_38_INV: - diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, sb_type, - h, w); + diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); break; default: assert(0); } } -#if CONFIG_HIGHBITDEPTH -static void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base, - const uint16_t *src0, int src0_stride, - const uint16_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, int bd) { - int i, j, m, diff; - int block_stride = block_size_wide[sb_type]; - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - diff = abs((int)src0[i * src0_stride + j] - - (int)src1[i * src1_stride + j]) >> - (bd - 8); - m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); - mask[i * block_stride + j] = - which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; +static AOM_FORCE_INLINE void diffwtd_mask_highbd( + uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, + int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, + const unsigned int bd) { + assert(bd >= 8); + if (bd == 8) { + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } else { + const unsigned int bd_shift = bd - 8; + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } } } } -void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, int bd) { +void av1_build_compound_diffwtd_mask_highbd_c( + uint8_t *mask, 
DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { switch (mask_type) { case DIFFWTD_38: diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, - CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w, - bd); + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); break; case DIFFWTD_38_INV: diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, - CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w, - bd); + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); break; default: assert(0); } } -#endif // CONFIG_HIGHBITDEPTH -#endif // COMPOUND_SEGMENT_TYPE -#endif // CONFIG_COMPOUND_SEGMENT - -#if MASK_MASTER_SIZE == 64 -static const uint8_t wedge_master_oblique_odd[NSMOOTHERS][MASK_MASTER_SIZE] = { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, - 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - } -}; -static const uint8_t wedge_master_oblique_even[NSMOOTHERS][MASK_MASTER_SIZE] = { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, - 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - } -}; -static const uint8_t wedge_master_vertical[NSMOOTHERS][MASK_MASTER_SIZE] = { { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, - 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -} }; - -static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { - if (shift >= 0) { - memcpy(dst + shift, src, width - shift); - memset(dst, src[0], shift); - } else { - shift = -shift; - memcpy(dst, src + shift, width - shift); - memset(dst + width - shift, 
src[width - 1], shift); - } -} -#else -static const double smoother_param[NSMOOTHERS] = { 3.0 }; -#endif // MASK_MASTER_SIZE == 64 static void init_wedge_master_masks() { - int i, j, s; + int i, j; const int w = MASK_MASTER_SIZE; const int h = MASK_MASTER_SIZE; const int stride = MASK_MASTER_STRIDE; - for (s = 0; s < NSMOOTHERS; s++) { // Note: index [0] stores the masters, and [1] its complement. -#if MASK_MASTER_SIZE == 64 - // Generate prototype by shifting the masters - int shift = h / 4; - for (i = 0; i < h; i += 2) { - shift_copy(wedge_master_oblique_even[s], - &wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride], shift, - MASK_MASTER_SIZE); - shift--; - shift_copy(wedge_master_oblique_odd[s], - &wedge_mask_obl[s][0][WEDGE_OBLIQUE63][(i + 1) * stride], - shift, MASK_MASTER_SIZE); - memcpy(&wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride], - wedge_master_vertical[s], - MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0])); - memcpy(&wedge_mask_obl[s][0][WEDGE_VERTICAL][(i + 1) * stride], - wedge_master_vertical[s], - MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0])); - } +#if USE_PRECOMPUTED_WEDGE_MASK + // Generate prototype by shifting the masters + int shift = h / 4; + for (i = 0; i < h; i += 2) { + shift_copy(wedge_master_oblique_even, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, + MASK_MASTER_SIZE); + shift--; + shift_copy(wedge_master_oblique_odd, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, + MASK_MASTER_SIZE); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + } #else - const int a[2] = { 2, 1 }; - const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]); - for (i = 0; i < h; i++) { - for (j = 0; j < w; ++j) { - int x = (2 * j + 1 - w); - int y = (2 * i + 1 - h); - double d = (a[0] * x + 
a[1] * y) / asqrt; - const int msk = (int)rint((1.0 + tanh(d / smoother_param[s])) * 32); - wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] = msk; - const int mskx = (int)rint((1.0 + tanh(x / smoother_param[s])) * 32); - wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] = mskx; - } + static const double smoother_param = 2.85; + const int a[2] = { 2, 1 }; + const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]); + for (i = 0; i < h; i++) { + for (j = 0; j < w; ++j) { + int x = (2 * j + 1 - w); + int y = (2 * i + 1 - h); + double d = (a[0] * x + a[1] * y) / asqrt; + const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32); + wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk; + const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32); + wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx; } -#endif // MASK_MASTER_SIZE == 64 - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int msk = wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j]; - wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] = msk; - wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = - wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = - (1 << WEDGE_WEIGHT_BITS) - msk; - wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] = - wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] = - (1 << WEDGE_WEIGHT_BITS) - msk; - wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = - wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = - msk; - const int mskx = wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j]; - wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] = mskx; - wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] = - wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] = - (1 << WEDGE_WEIGHT_BITS) - mskx; - } + } +#endif // USE_PRECOMPUTED_WEDGE_MASK + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; + 
wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; + wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = + wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; + const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; + wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; + wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = + wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - mskx; } } } +#if !USE_PRECOMPUTED_WEDGE_SIGN // If the signs for the wedges for various blocksizes are // inconsistent flip the sign flag. Do it only once for every // wedge codebook. @@ -774,28 +491,29 @@ static void init_wedge_signs() { const int wbits = wedge_params.bits; const int wtypes = 1 << wbits; int i, w; - if (wbits == 0) continue; - for (w = 0; w < wtypes; ++w) { - // Get the mask master, i.e. index [0] - const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type); - int avg = 0; - for (i = 0; i < bw; ++i) avg += mask[i]; - for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE]; - avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1); - // Default sign of this wedge is 1 if the average < 32, 0 otherwise. - // If default sign is 1: - // If sign requested is 0, we need to flip the sign and return - // the complement i.e. index [1] instead. If sign requested is 1 - // we need to flip the sign and return index [0] instead. - // If default sign is 0: - // If sign requested is 0, we need to return index [0] the master - // if sign requested is 1, we need to return the complement index [1] - // instead. 
- wedge_params.signflip[w] = (avg < 32); - // printf("%d[%d] = %d\n", sb_type, w, wedge_params.signflip[w]); + if (wbits) { + for (w = 0; w < wtypes; ++w) { + // Get the mask master, i.e. index [0] + const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type); + int avg = 0; + for (i = 0; i < bw; ++i) avg += mask[i]; + for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE]; + avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1); + // Default sign of this wedge is 1 if the average < 32, 0 otherwise. + // If default sign is 1: + // If sign requested is 0, we need to flip the sign and return + // the complement i.e. index [1] instead. If sign requested is 1 + // we need to flip the sign and return index [0] instead. + // If default sign is 0: + // If sign requested is 0, we need to return index [0] the master + // if sign requested is 1, we need to return the complement index [1] + // instead. + wedge_params.signflip[w] = (avg < 32); + } } } } +#endif // !USE_PRECOMPUTED_WEDGE_SIGN static void init_wedge_masks() { uint8_t *dst = wedge_mask_buf; @@ -830,83 +548,32 @@ static void init_wedge_masks() { // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 void av1_init_wedge_masks() { init_wedge_master_masks(); +#if !USE_PRECOMPUTED_WEDGE_SIGN init_wedge_signs(); +#endif // !USE_PRECOMPUTED_WEDGE_SIGN init_wedge_masks(); } -#if CONFIG_SUPERTX -static void build_masked_compound_wedge_extend( - uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, - int wedge_offset_x, int wedge_offset_y, int h, int w) { - const int subh = (2 << b_height_log2_lookup[sb_type]) == h; - const int subw = (2 << b_width_log2_lookup[sb_type]) == w; - const uint8_t *mask; - size_t mask_stride; - switch (comp_data->interinter_compound_type) { - case COMPOUND_WEDGE: - mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, - sb_type, 
wedge_offset_x, wedge_offset_y); - mask_stride = MASK_MASTER_STRIDE; - break; -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - mask = comp_data->seg_mask; - mask_stride = block_size_wide[sb_type]; - break; -#endif - default: assert(0); return; - } - aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, (int)mask_stride, h, w, subh, subw); -} - -#if CONFIG_HIGHBITDEPTH -static void build_masked_compound_wedge_extend_highbd( - uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, - const uint8_t *src1_8, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, - int wedge_offset_x, int wedge_offset_y, int h, int w, int bd) { - const int subh = (2 << b_height_log2_lookup[sb_type]) == h; - const int subw = (2 << b_width_log2_lookup[sb_type]) == w; - const uint8_t *mask; - size_t mask_stride; - switch (comp_data->interinter_compound_type) { - case COMPOUND_WEDGE: - mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, - sb_type, wedge_offset_x, wedge_offset_y); - mask_stride = MASK_MASTER_STRIDE; - break; -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - mask = comp_data->seg_mask; - mask_stride = block_size_wide[sb_type]; - break; -#endif - default: assert(0); return; - } - aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, (int)mask_stride, h, w, subh, - subw, bd); -} -#endif // CONFIG_HIGHBITDEPTH -#else -#if CONFIG_CONVOLVE_ROUND static void build_masked_compound_no_round( - CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0, - int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, + uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w) { + int w, ConvolveParams *conv_params, MACROBLOCKD *xd) { // Derive subsampling from h and w passed in. 
May be refactored to // pass in subsampling factors directly. - const int subh = (2 << b_height_log2_lookup[sb_type]) == h; - const int subw = (2 << b_width_log2_lookup[sb_type]) == w; + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, block_size_wide[sb_type], h, w, subh, subw); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, block_size_wide[sb_type], + w, h, subw, subh, conv_params, xd->bd); + else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, block_size_wide[sb_type], w, + h, subw, subh, conv_params); } -#endif // CONFIG_CONVOLVE_ROUND + static void build_masked_compound( uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, @@ -914,14 +581,13 @@ static void build_masked_compound( int w) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. 
- const int subh = (2 << b_height_log2_lookup[sb_type]) == h; - const int subw = (2 << b_width_log2_lookup[sb_type]) == w; + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, block_size_wide[sb_type], h, w, subh, subw); + mask, block_size_wide[sb_type], w, h, subw, subh); } -#if CONFIG_HIGHBITDEPTH static void build_masked_compound_highbd( uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, const uint8_t *src1_8, int src1_stride, @@ -929,501 +595,329 @@ static void build_masked_compound_highbd( int w, int bd) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. - const int subh = (2 << b_height_log2_lookup[sb_type]) == h; - const int subw = (2 << b_width_log2_lookup[sb_type]) == w; + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); // const uint8_t *mask = // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, block_size_wide[sb_type], h, w, - subh, subw, bd); + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_SUPERTX void av1_make_masked_inter_predictor( const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const int subpel_x, const int subpel_y, const struct scale_factors *sf, - int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters, - int xs, int ys, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - int plane, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + const SubpelParams 
*subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MACROBLOCKD *xd) { - const MODE_INFO *mi = xd->mi[0]; - - const INTERINTER_COMPOUND_DATA comp_data = { -#if CONFIG_WEDGE - mi->mbmi.wedge_index, - mi->mbmi.wedge_sign, -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - mi->mbmi.mask_type, - xd->seg_mask, -#endif // CONFIG_COMPOUND_SEGMENT - mi->mbmi.interinter_compound_type - }; + MACROBLOCKD *xd, int can_use_previous) { + MB_MODE_INFO *mi = xd->mi[0]; + (void)dst; + (void)dst_stride; + mi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp; // We're going to call av1_make_inter_predictor to generate a prediction into // a temporary buffer, then will blend that temporary buffer with that from // the other reference. // -// With CONFIG_CONVOLVE_ROUND, if the rounding mode is CONVOLVE_OPT_NO_ROUND -// then the predictions are at 32-bits, so we'll need 32 bits per -// pixel. Otherwise, we'll need up to 16 bits per pixel if -// CONFIG_HIGHBITDEPTH or just 8 otherwise. -#if CONFIG_CONVOLVE_ROUND -#define INTER_PRED_BYTES_PER_PIXEL 4 -#elif CONFIG_HIGHBITDEPTH #define INTER_PRED_BYTES_PER_PIXEL 2 -#else -#define INTER_PRED_BYTES_PER_PIXEL 1 -#endif - DECLARE_ALIGNED(16, uint8_t, + + DECLARE_ALIGNED(32, uint8_t, tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]); #undef INTER_PRED_BYTES_PER_PIXEL -#if CONFIG_HIGHBITDEPTH uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; - const int bd = xd->bd; -#else - uint8_t *tmp_dst = tmp_buf; - const int bd = 8; -#endif -#if CONFIG_CONVOLVE_ROUND const int tmp_buf_stride = MAX_SB_SIZE; - const int is_conv_no_round = conv_params->round == CONVOLVE_OPT_NO_ROUND; CONV_BUF_TYPE *org_dst = conv_params->dst; int org_dst_stride = conv_params->dst_stride; - CONV_BUF_TYPE *tmp_buf32 = (CONV_BUF_TYPE *)tmp_buf; - if (is_conv_no_round) { - conv_params->dst = tmp_buf32; - conv_params->dst_stride = tmp_buf_stride; - assert(conv_params->do_average == 0); - } -#endif // CONFIG_CONVOLVE_ROUND + CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; + conv_params->dst = tmp_buf16; + conv_params->dst_stride = tmp_buf_stride; + assert(conv_params->do_average == 0); // This will generate a prediction in tmp_buf for the second reference - av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x, - subpel_y, sf, w, h, conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - warp_types, p_col, p_row, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - mi, 0, -#endif - xs, ys, xd); - -#if CONFIG_COMPOUND_SEGMENT - if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) { -#if CONFIG_CONVOLVE_ROUND - if (is_conv_no_round) { - build_compound_seg_mask_d32( - comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride, - tmp_buf32, tmp_buf_stride, mi->mbmi.sb_type, h, w, conv_params, bd); - } else { -#endif // CONFIG_CONVOLVE_ROUND -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type, - dst, dst_stride, tmp_dst, MAX_SB_SIZE, - mi->mbmi.sb_type, h, w, bd); - } else { -#endif - build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst, - dst_stride, tmp_dst, MAX_SB_SIZE, - mi->mbmi.sb_type, h, w); -#if CONFIG_HIGHBITDEPTH - } -#endif -#if CONFIG_CONVOLVE_ROUND - } -#endif - } -#endif 
// CONFIG_COMPOUND_SEGMENT - -#if CONFIG_SUPERTX -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_wedge_extend_highbd( - dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data, - mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - build_masked_compound_wedge_extend( - dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data, - mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); -#else -#if CONFIG_CONVOLVE_ROUND - if (is_conv_no_round) { - build_masked_compound_no_round(org_dst, org_dst_stride, org_dst, - org_dst_stride, tmp_buf32, tmp_buf_stride, - &comp_data, mi->mbmi.sb_type, h, w); - - const int convolve_rounding_bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - av1_highbd_convolve_rounding(org_dst, org_dst_stride, dst, dst_stride, w, - h, convolve_rounding_bits, xd->bd); - else -#endif - av1_convolve_rounding(org_dst, org_dst_stride, dst, dst_stride, w, h, - convolve_rounding_bits); - - conv_params->do_post_rounding = 0; - } else { -#endif // CONFIG_CONVOLVE_ROUND + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params, + sf, w, h, conv_params, interp_filters, warp_types, + p_col, p_row, plane, ref, mi, 0, xd, + can_use_previous); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst, - MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, - w, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst, - MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w); -#if CONFIG_CONVOLVE_ROUND + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + av1_build_compound_diffwtd_mask_d16( + comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, h, 
w, conv_params, xd->bd); } -#endif // CONFIG_CONVOLVE_ROUND -#endif // CONFIG_SUPERTX - -#if CONFIG_COMPOUND_SEGMENT - (void)plane; -#endif // CONFIG_COMPOUND_SEGMENT + build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, comp_data, + mi->sb_type, h, w, conv_params, xd); } // TODO(sarahparker) av1_highbd_build_inter_predictor and // av1_build_inter_predictor should be combined with // av1_make_inter_predictor -#if CONFIG_HIGHBITDEPTH void av1_highbd_build_inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, - InterpFilters interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - const WarpTypesAllowed *warp_types, int p_col, int p_row, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd) { + InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? 
src_mv->col : src_mv->col * 2 }; MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); mv.col += SCALE_EXTRA_OFF; mv.row += SCALE_EXTRA_OFF; - const int subpel_x = mv.col & SCALE_SUBPEL_MASK; - const int subpel_y = mv.row & SCALE_SUBPEL_MASK; - ConvolveParams conv_params = get_conv_params(ref, ref, plane); + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + mv.col & SCALE_SUBPEL_MASK, + mv.row & SCALE_SUBPEL_MASK }; + ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + (mv.col >> SCALE_SUBPEL_BITS); - av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, - sf, w, h, &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - warp_types, p_col, p_row, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif - sf->x_step_q4, sf->y_step_q4, xd); + av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, + w, h, &conv_params, interp_filters, warp_types, + p_col, p_row, plane, ref, xd->mi[0], 0, xd, + can_use_previous); } -#endif // CONFIG_HIGHBITDEPTH void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane, int ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd) { + const MACROBLOCKD *xd, int can_use_previous) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? 
src_mv->col : src_mv->col * 2 }; MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); mv.col += SCALE_EXTRA_OFF; mv.row += SCALE_EXTRA_OFF; - const int subpel_x = mv.col & SCALE_SUBPEL_MASK; - const int subpel_y = mv.row & SCALE_SUBPEL_MASK; + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + mv.col & SCALE_SUBPEL_MASK, + mv.row & SCALE_SUBPEL_MASK }; src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + (mv.col >> SCALE_SUBPEL_BITS); - av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, - sf, w, h, conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - warp_types, p_col, p_row, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif - sf->x_step_q4, sf->y_step_q4, xd); + av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, + w, h, conv_params, interp_filters, warp_types, p_col, + p_row, plane, ref, xd->mi[0], 0, xd, + can_use_previous); } -typedef struct SubpelParams { - int xs; - int ys; - int subpel_x; - int subpel_y; -} SubpelParams; - -static INLINE void build_inter_predictors( - const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, -#if CONFIG_MOTION_VAR - const MODE_INFO *mi, int build_for_obmc, -#endif // CONFIG_MOTION_VAR - int block, int bw, int bh, int x, int y, int w, int h, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - int mi_x, int mi_y) { - struct macroblockd_plane *const pd = &xd->plane[plane]; -#if !CONFIG_MOTION_VAR - const MODE_INFO *mi = xd->mi[0]; -#endif // CONFIG_MOTION_VAR - int is_compound = has_second_ref(&mi->mbmi); -#if CONFIG_COMPOUND_SINGLEREF - int is_comp_mode_pred = - is_compound || is_inter_singleref_comp_mode(mi->mbmi.mode); -#endif // CONFIG_COMPOUND_SINGLEREF - int ref; -#if CONFIG_INTRABC - const int is_intrabc = is_intrabc_block(&mi->mbmi); - assert(IMPLIES(is_intrabc, !is_compound)); -#endif // CONFIG_INTRABC -#if CONFIG_GLOBAL_MOTION - int 
is_global[2] = { 0, 0 }; - for (ref = 0; ref < 1 + is_compound; ++ref) { - WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]]; - is_global[ref] = is_global_mv_block(mi, block, wm->wmtype); +void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, + int order_idx, int *fwd_offset, int *bck_offset, + int *use_jnt_comp_avg, int is_compound) { + assert(fwd_offset != NULL && bck_offset != NULL); + if (!is_compound || mbmi->compound_idx) { + *use_jnt_comp_avg = 0; + return; } -#if CONFIG_COMPOUND_SINGLEREF - if (!is_compound && is_comp_mode_pred) is_global[1] = is_global[0]; -#endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_GLOBAL_MOTION - -#if CONFIG_CB4X4 - (void)block; - (void)cm; -#endif -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int ss_x = pd->subsampling_x; - const int ss_y = pd->subsampling_y; - int sub8x8_inter = bsize < BLOCK_8X8 && (ss_x || ss_y); + *use_jnt_comp_avg = 1; + const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; + const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; + const int cur_frame_index = cm->cur_frame->cur_frame_offset; + int bck_frame_index = 0, fwd_frame_index = 0; -#if CONFIG_INTRABC - if (is_intrabc) { - sub8x8_inter = 0; + if (bck_idx >= 0) { + bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; } -#endif -#if CONFIG_MOTION_VAR - sub8x8_inter = sub8x8_inter && !build_for_obmc; -#endif // CONFIG_MOTION_VAR - const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; - const int col_start = (block_size_wide[bsize] == 4) && ss_x ? 
-1 : 0; - - if (sub8x8_inter) { - for (int row = row_start; row <= 0 && sub8x8_inter; ++row) - for (int col = col_start; col <= 0; ++col) - if (!is_inter_block(&xd->mi[row * xd->mi_stride + col]->mbmi)) - sub8x8_inter = 0; + if (fwd_idx >= 0) { + fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; } - if (sub8x8_inter) { - // block size - const int b4_w = block_size_wide[bsize] >> ss_x; - const int b4_h = block_size_high[bsize] >> ss_y; - const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); - const int b8_w = block_size_wide[plane_bsize] >> ss_x; - const int b8_h = block_size_high[plane_bsize] >> ss_y; - int idx, idy; + int d0 = clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)), + 0, MAX_FRAME_DISTANCE); + int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)), + 0, MAX_FRAME_DISTANCE); - const int x_base = x; - const int y_base = y; + const int order = d0 <= d1; - const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; + if (d0 == 0 || d1 == 0) { + *fwd_offset = quant_dist_lookup_table[order_idx][3][order]; + *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order]; + return; + } - int row = row_start; - for (idy = 0; idy < b8_h; idy += b4_h) { - int col = col_start; - for (idx = 0; idx < b8_w; idx += b4_w) { - MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi; - is_compound = has_second_ref(this_mbmi); -#if CONFIG_CONVOLVE_ROUND - DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]); + int i; + for (i = 0; i < 3; ++i) { + int c0 = quant_dist_weight[i][order]; + int c1 = quant_dist_weight[i][!order]; + int d0_c0 = d0 * c0; + int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + *fwd_offset = quant_dist_lookup_table[order_idx][i][order]; + *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; +} + +static INLINE void calc_subpel_params( + MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, + 
int plane, const int pre_x, const int pre_y, int x, int y, + struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params, + int bw, int bh) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = pd->subsampling_x; + int ssy = pd->subsampling_y; + int orig_pos_y = (pre_y + y) << SUBPEL_BITS; + orig_pos_y += mv.row * (1 << (1 - ssy)); + int orig_pos_x = (pre_x + x) << SUBPEL_BITS; + orig_pos_x += mv.col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + } else { + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + + (x + (mv_q4.col >> SUBPEL_BITS)); + } +} + +static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + 
int is_compound = has_second_ref(mi); + int ref; + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + int is_global[2] = { 0, 0 }; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || + (block_size_high[bsize] < 8 && ss_y); + + if (is_intrabc) sub8x8_inter = 0; + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? 
-1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + sub8x8_inter = sub8x8_inter && !build_for_obmc; + if (sub8x8_inter) { + for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; + if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; + } + } + } + + if (sub8x8_inter) { + // block size + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize] >> ss_x; + const int b8_h = block_size_high[plane_bsize] >> ss_y; + assert(!is_compound); + + const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + is_compound = has_second_ref(this_mbmi); + DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]); int tmp_dst_stride = 8; - assert(w <= 8 && h <= 8); -#endif // CONFIG_CONVOLVE_ROUND -#if CONFIG_CONVOLVE_ROUND - ConvolveParams conv_params = - get_conv_params_no_round(0, 0, plane, tmp_dst, tmp_dst_stride); -#else - ConvolveParams conv_params = get_conv_params(0, 0, plane); -#endif + assert(bw < 8 || bh < 8); + ConvolveParams conv_params = get_conv_params_no_round( + 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd); + conv_params.use_jnt_comp_avg = 0; struct buf_2d *const dst_buf = &pd->dst; - x = x_base + idx; - y = y_base + idy; uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; - // TODO(zoeliu): If single ref comp modes are considered here, a - // mismatch was caused. Need a further investigation. 
- for (ref = 0; ref < 1 + is_compound; ++ref) { - const RefBuffer *ref_buf = - &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; - - const int c_offset = (mi_x + MI_SIZE * col_start) >> ss_x; - const int r_offset = (mi_y + MI_SIZE * row_start) >> ss_y; - pd->pre[ref].buf0 = - (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; - pd->pre[ref].buf = - pd->pre[ref].buf0 + scaled_buffer_offset(c_offset, r_offset, - ref_buf->buf->uv_stride, - &ref_buf->sf); - pd->pre[ref].width = ref_buf->buf->uv_crop_width; - pd->pre[ref].height = ref_buf->buf->uv_crop_height; - pd->pre[ref].stride = ref_buf->buf->uv_stride; - -#if CONFIG_INTRABC - const struct scale_factors *const sf = - is_intrabc ? &xd->sf_identity : &ref_buf->sf; - struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; -#else - const struct scale_factors *const sf = &ref_buf->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; -#endif // CONFIG_INTRABC - - const MV mv = this_mbmi->mv[ref].as_mv; - - uint8_t *pre; - int xs, ys, subpel_x, subpel_y; - const int is_scaled = av1_is_scaled(sf); -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - WarpTypesAllowed warp_types; -#if CONFIG_GLOBAL_MOTION - warp_types.global_warp_allowed = is_global[ref]; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - warp_types.local_warp_allowed = - this_mbmi->motion_mode == WARPED_CAUSAL; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - - if (is_scaled) { - int ssx = pd->subsampling_x; - int ssy = pd->subsampling_y; - int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS); - orig_pos_y += mv.row * (1 << (1 - ssy)); - int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS); - orig_pos_x += mv.col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -((AOM_INTERP_EXTEND + bh) << 
SCALE_SUBPEL_BITS); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS); - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - pre = pre_buf->buf0 + - (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - subpel_x = pos_x & SCALE_SUBPEL_MASK; - subpel_y = pos_y & SCALE_SUBPEL_MASK; - xs = sf->x_step_q4; - ys = sf->y_step_q4; - } else { - const MV mv_q4 = clamp_mv_to_umv_border_sb( - xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); - xs = ys = SCALE_SUBPEL_SHIFTS; - subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; - subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - pre = pre_buf->buf + - (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + - (x + (mv_q4.col >> SUBPEL_BITS)); - } - - conv_params.ref = ref; - conv_params.do_average = ref; - if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) { - // masked compound type has its own average mechanism - conv_params.do_average = 0; -#if CONFIG_CONVOLVE_ROUND && CONFIG_COMPOUND_SEGMENT && CONFIG_SUPERTX - // TODO(angiebird): convolve_round does not support compound_segment - // when supertx is on - conv_params = get_conv_params(ref, 0, plane); -#endif - } - if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type)) - av1_make_masked_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y, - sf, b4_w, b4_h, &conv_params, mi->mbmi.interp_filters, xs, ys, -#if CONFIG_SUPERTX - wedge_offset_x, wedge_offset_y, -#endif // CONFIG_SUPERTX - plane, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - xd); - else - av1_make_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, 
subpel_x, subpel_y, - sf, b4_w, b4_h, &conv_params, this_mbmi->interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - mi, build_for_obmc, -#endif // CONFIG_MOTION_VAR - xs, ys, xd); - } // for (ref = 0; ref < 1 + is_compound; ++ref) -#if CONFIG_CONVOLVE_ROUND - if (conv_params.do_post_rounding) { -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - av1_highbd_convolve_rounding( - tmp_dst, tmp_dst_stride, dst, dst_buf->stride, b4_w, b4_h, - FILTER_BITS * 2 + is_compound - conv_params.round_0 - - conv_params.round_1, - xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_COMPOUND_SINGLEREF - av1_convolve_rounding( - tmp_dst, tmp_dst_stride, dst, dst_buf->stride, b4_w, b4_h, - FILTER_BITS * 2 + is_comp_mode_pred - conv_params.round_0 - - conv_params.round_1); -#else // !(CONFIG_COMPOUND_SINGLEREF) - av1_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_buf->stride, - b4_w, b4_h, - FILTER_BITS * 2 + is_compound - - conv_params.round_0 - conv_params.round_1); -#endif // CONFIG_COMPOUND_SINGLEREF + ref = 0; + const RefBuffer *ref_buf = + &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; + + pd->pre[ref].buf0 = + (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; + pd->pre[ref].buf = + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf->uv_stride, + &ref_buf->sf); + pd->pre[ref].width = ref_buf->buf->uv_crop_width; + pd->pre[ref].height = ref_buf->buf->uv_crop_height; + pd->pre[ref].stride = ref_buf->buf->uv_stride; + + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &ref_buf->sf; + struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; + + const MV mv = this_mbmi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; + + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); + + conv_params.ref = ref; + conv_params.do_average = ref; + if (is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; } -#endif // CONFIG_CONVOLVE_ROUND + + av1_make_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, + b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, + (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, + plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); + ++col; } ++row; @@ -1432,194 +926,50 @@ static INLINE void build_inter_predictors( for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; return; } -#else - (void)cm; -#endif // CONFIG_CHROMA_SUB8X8 { + DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); + ConvolveParams conv_params = get_conv_params_no_round( + 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd); + av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, + &conv_params.bck_offset, + &conv_params.use_jnt_comp_avg, is_compound); + struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - uint8_t *pre[2]; - SubpelParams subpel_params[2]; -#if CONFIG_CONVOLVE_ROUND - DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); -#endif // CONFIG_CONVOLVE_ROUND - -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref) -#else - for (ref = 0; ref < 1 + is_compound; ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { -#if CONFIG_INTRABC + uint8_t *const dst = dst_buf->buf; + for (ref = 0; ref < 1 + 
is_compound; ++ref) { const struct scale_factors *const sf = - is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf; + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; -#else - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; -#endif // CONFIG_INTRABC -#if CONFIG_CB4X4 - const MV mv = mi->mbmi.mv[ref].as_mv; -#else - const MV mv = -#if CONFIG_MOTION_VAR - (mi->mbmi.sb_type < BLOCK_8X8 && !build_for_obmc) - ? -#else - mi->mbmi.sb_type < BLOCK_8X8 ? -#endif - average_split_mvs(pd, mi, ref, block) - : mi->mbmi.mv[ref].as_mv; -#endif - - const int is_scaled = av1_is_scaled(sf); - if (is_scaled) { - // Note: The various inputs here have different units: - // * mi_x/mi_y are in units of luma pixels - // * mv is in units of 1/8 luma pixels - // * x/y are in units of pixels *in the current plane* - // Here we unify these into a q4-format position within the current - // plane, then project into the reference frame - int ssx = pd->subsampling_x; - int ssy = pd->subsampling_y; - int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS); - orig_pos_y += mv.row * (1 << (1 - ssy)); - int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS); - orig_pos_x += mv.col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - // Clamp against the reference frame borders, with enough extension - // that we don't force the reference block to be partially onscreen. 
- const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS); - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - pre[ref] = pre_buf->buf0 + - (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - subpel_params[ref].subpel_x = pos_x & SCALE_SUBPEL_MASK; - subpel_params[ref].subpel_y = pos_y & SCALE_SUBPEL_MASK; - subpel_params[ref].xs = sf->x_step_q4; - subpel_params[ref].ys = sf->y_step_q4; - } else { - const MV mv_q4 = clamp_mv_to_umv_border_sb( - xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); - subpel_params[ref].subpel_x = (mv_q4.col & SUBPEL_MASK) - << SCALE_EXTRA_BITS; - subpel_params[ref].subpel_y = (mv_q4.row & SUBPEL_MASK) - << SCALE_EXTRA_BITS; - subpel_params[ref].xs = SCALE_SUBPEL_SHIFTS; - subpel_params[ref].ys = SCALE_SUBPEL_SHIFTS; - pre[ref] = pre_buf->buf + - (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + - (x + (mv_q4.col >> SUBPEL_BITS)); - } - } + const MV mv = mi->mv[ref].as_mv; -#if CONFIG_CONVOLVE_ROUND - ConvolveParams conv_params = - get_conv_params_no_round(ref, ref, plane, tmp_dst, MAX_SB_SIZE); -#else - ConvolveParams conv_params = get_conv_params(ref, ref, plane); -#endif // CONFIG_CONVOLVE_ROUND + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre, + &subpel_params, bw, bh); -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref) -#else - for (ref = 0; ref < 1 + is_compound; ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { -#if CONFIG_INTRABC - const struct scale_factors *const sf = - is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; -#else - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; -#endif // CONFIG_INTRABC -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION WarpTypesAllowed warp_types; -#if CONFIG_GLOBAL_MOTION warp_types.global_warp_allowed = is_global[ref]; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; conv_params.ref = ref; - conv_params.do_average = ref; - if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) { + + if (ref && is_masked_compound_type(mi->interinter_comp.type)) { // masked compound type has its own average mechanism conv_params.do_average = 0; -#if CONFIG_CONVOLVE_ROUND && CONFIG_COMPOUND_SEGMENT && CONFIG_SUPERTX - // TODO(angiebird): convolve_round does not support compound_segment - // when supertx is on - conv_params = get_conv_params(ref, 0, plane); -#endif - } - - if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type)) av1_make_masked_inter_predictor( - pre[ref], pre_buf->stride, dst, dst_buf->stride, - subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h, - &conv_params, mi->mbmi.interp_filters, subpel_params[ref].xs, - subpel_params[ref].ys, -#if CONFIG_SUPERTX - wedge_offset_x, wedge_offset_y, -#endif // CONFIG_SUPERTX - plane, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - xd); - else + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, plane, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd, + cm->allow_warped_motion); + } else { + conv_params.do_average = ref; 
av1_make_inter_predictor( - pre[ref], pre_buf->stride, dst, dst_buf->stride, - subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h, - &conv_params, mi->mbmi.interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - mi, build_for_obmc, -#endif // CONFIG_MOTION_VAR - subpel_params[ref].xs, subpel_params[ref].ys, xd); - } - -#if CONFIG_CONVOLVE_ROUND - // TODO(angiebird): This part needs optimization - if (conv_params.do_post_rounding) { -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - av1_highbd_convolve_rounding( - tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h, - FILTER_BITS * 2 + is_compound - conv_params.round_0 - - conv_params.round_1, - xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_COMPOUND_SINGLEREF - av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h, - FILTER_BITS * 2 + is_comp_mode_pred - - conv_params.round_0 - conv_params.round_1); -#else // !(CONFIG_COMPOUND_SINGLEREF) - av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h, - FILTER_BITS * 2 + is_compound - - conv_params.round_0 - conv_params.round_1); -#endif // CONFIG_COMPOUND_SINGLEREF + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref, + mi, build_for_obmc, xd, cm->allow_warped_motion); + } } -#endif // CONFIG_CONVOLVE_ROUND } } @@ -1630,56 +980,16 @@ static void build_inter_predictors_for_planes(const AV1_COMMON *cm, int plane; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif for (plane = plane_from; plane <= plane_to; ++plane) { const struct macroblockd_plane *pd = 
&xd->plane[plane]; const int bw = pd->width; const int bh = pd->height; -#if CONFIG_CB4X4 if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, pd->subsampling_y)) continue; -#endif - if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !unify_bsize) { - const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type; - const int have_vsplit = bp != PARTITION_HORZ; - const int have_hsplit = bp != PARTITION_VERT; - const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x); - const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y); - const int pw = 8 >> (have_vsplit | pd->subsampling_x); - const int ph = 8 >> (have_hsplit | pd->subsampling_y); - int x, y; - assert(bp != PARTITION_NONE && bp < PARTITION_TYPES); - assert(bsize == BLOCK_8X8); - assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh); - for (y = 0; y < num_4x4_h; ++y) - for (x = 0; x < num_4x4_w; ++x) - build_inter_predictors(cm, xd, plane, -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif // CONFIG_MOTION_VAR - y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } else { - build_inter_predictors(cm, xd, plane, -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif // CONFIG_MOTION_VAR - 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } + build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); } } @@ -1687,17 +997,14 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BUFFER_SET *ctx, BLOCK_SIZE bsize) { build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0); -#if CONFIG_INTERINTRA - if (is_interintra_pred(&xd->mi[0]->mbmi)) { + + if (is_interintra_pred(xd->mi[0])) { BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, { xd->plane[0].dst.stride, 0, 0 } }; if (!ctx) ctx = &default_ctx; av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride, ctx, bsize); } -#else - (void)ctx; 
-#endif // CONFIG_INTERINTRA } void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, @@ -1705,8 +1012,8 @@ void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize) { build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1, MAX_MB_PLANE - 1); -#if CONFIG_INTERINTRA - if (is_interintra_pred(&xd->mi[0]->mbmi)) { + + if (is_interintra_pred(xd->mi[0])) { BUFFER_SET default_ctx = { { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride } @@ -1716,247 +1023,49 @@ void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize); } -#else - (void)ctx; -#endif // CONFIG_INTERINTRA } void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BUFFER_SET *ctx, BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); } void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, - const YV12_BUFFER_CONFIG *src, int mi_row, - int mi_col) { - const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, - src->uv_stride }; - int i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { struct macroblockd_plane *const pd = &planes[i]; - setup_pred_plane(&pd->dst, bsize, src->buffers[i], widths[i], heights[i], - strides[i], mi_row, mi_col, NULL, pd->subsampling_x, - pd->subsampling_y); + const int is_uv = i > 0; + setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, + mi_col, NULL, pd->subsampling_x, pd->subsampling_y); } } void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, - const struct scale_factors *sf) { + const struct scale_factors *sf, + const int num_planes) { if (src != NULL) { - int i; - uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, - src->uv_stride }; - for (i = 0; i < MAX_MB_PLANE; ++i) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; - setup_pred_plane(&pd->pre[idx], xd->mi[0]->mbmi.sb_type, buffers[i], - widths[i], heights[i], strides[i], mi_row, mi_col, sf, + const int is_uv = i > 0; + setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); } } } -#if CONFIG_SUPERTX -#if CONFIG_CB4X4 -static const uint8_t mask_4[4] = { 64, 52, 12, 0 }; -static const uint8_t mask_4_uv[4] = { 64, 52, 12, 0 }; -#endif // CONFIG_CB4X4 -static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 }; - -static const uint8_t mask_16[16] = { 63, 62, 60, 58, 55, 50, 43, 36, - 28, 21, 14, 9, 6, 4, 2, 1 }; - -static const uint8_t mask_32[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, - 61, 57, 52, 45, 36, 28, 19, 12, 7, 3, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - -static const uint8_t mask_8_uv[8] = { 64, 64, 62, 52, 12, 2, 0, 0 }; - -static const uint8_t mask_16_uv[16] = { 64, 64, 64, 64, 61, 53, 45, 36, - 28, 19, 11, 3, 0, 0, 0, 0 }; - -static const uint8_t mask_32_uv[32] = { 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 60, 54, 46, 36, - 28, 18, 10, 4, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; - -static const uint8_t *get_supertx_mask(int length, int plane) { - switch (length) { -#if CONFIG_CB4X4 - case 4: return plane ? mask_4_uv : mask_4; -#endif // CONFIG_CB4X4 - case 8: return plane ? mask_8_uv : mask_8; - case 16: return plane ? mask_16_uv : mask_16; - case 32: return plane ? 
mask_32_uv : mask_32; - default: assert(0); - } - return NULL; -} - -void av1_build_masked_inter_predictor_complex( - MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre, - int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori, - BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition, - int plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - const int top_w = block_size_wide[top_bsize] >> ssx; - const int top_h = block_size_high[top_bsize] >> ssy; - const int w = block_size_wide[bsize] >> ssx; - const int h = block_size_high[bsize] >> ssy; - const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx; - const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy; - - int w_remain, h_remain; - -#if CONFIG_HIGHBITDEPTH - const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#endif // CONFIG_HIGHBITDEPTH - - assert(bsize <= BLOCK_32X32); - assert(IMPLIES(plane == 0, ssx == 0)); - assert(IMPLIES(plane == 0, ssy == 0)); - - switch (partition) { - case PARTITION_HORZ: { - const uint8_t *const mask = get_supertx_mask(h, ssy); - - w_remain = top_w; - h_remain = top_h - h_offset - h; - dst += h_offset * dst_stride; - pre += h_offset * pre_stride; - -#if CONFIG_HIGHBITDEPTH - if (is_hdb) - aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, - pre_stride, mask, h, top_w, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, pre_stride, - mask, h, top_w); - - dst += h * dst_stride; - pre += h * pre_stride; - break; - } - case PARTITION_VERT: { - const uint8_t *const mask = get_supertx_mask(w, ssx); - - w_remain = top_w - w_offset - w; - h_remain = top_h; - dst += w_offset; - pre += w_offset; - -#if CONFIG_HIGHBITDEPTH - if (is_hdb) - aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, - pre_stride, mask, top_h, w, xd->bd); - else -#endif // 
CONFIG_HIGHBITDEPTH - aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, pre_stride, - mask, top_h, w); - - dst += w; - pre += w; - break; - } - default: { - assert(0); - return; - } - } - - if (w_remain == 0 || h_remain == 0) { - return; - } - -#if CONFIG_HIGHBITDEPTH - if (is_hdb) { - dst = (uint8_t *)CONVERT_TO_SHORTPTR(dst); - pre = (const uint8_t *)CONVERT_TO_SHORTPTR(pre); - dst_stride *= 2; - pre_stride *= 2; - w_remain *= 2; - } -#endif // CONFIG_HIGHBITDEPTH - - do { - memcpy(dst, pre, w_remain * sizeof(uint8_t)); - dst += dst_stride; - pre += pre_stride; - } while (--h_remain); -} - -void av1_build_inter_predictor_sb_sub8x8_extend(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_row_ori, - int mi_col_ori, int mi_row, - int mi_col, int plane, - BLOCK_SIZE bsize, int block) { - // Prediction function used in supertx: - // Use the mv at current block (which is less than 8x8) - // to get prediction of a block located at (mi_row, mi_col) at size of bsize - // bsize can be larger than 8x8. - // block (0-3): the sub8x8 location of current block - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE; - const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE; - - // For sub8x8 uv: - // Skip uv prediction in supertx except the first block (block = 0) - int max_plane = block ? 
1 : MAX_MB_PLANE; - if (plane >= max_plane) return; - - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int bw = 4 * num_4x4_w; - const int bh = 4 * num_4x4_h; - - build_inter_predictors(cm, xd, plane, -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif // CONFIG_MOTION_VAR - block, bw, bh, 0, 0, bw, bh, wedge_offset_x, - wedge_offset_y, mi_x, mi_y); -} - -void av1_build_inter_predictor_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int plane, - BLOCK_SIZE bsize) { - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE; - const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - - build_inter_predictors(cm, xd, plane, -#if CONFIG_MOTION_VAR - xd->mi[0], 0, -#endif // CONFIG_MOTION_VAR - 0, bw, bh, 0, 0, bw, bh, wedge_offset_x, - wedge_offset_y, mi_x, mi_y); -} -#endif // CONFIG_SUPERTX - -#if CONFIG_MOTION_VAR // obmc_mask_N[overlap_position] static const uint8_t obmc_mask_1[1] = { 64 }; @@ -1974,14 +1083,12 @@ static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64 }; -#if CONFIG_EXT_PARTITION static const uint8_t obmc_mask_64[64] = { 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; -#endif // CONFIG_EXT_PARTITION const uint8_t *av1_get_obmc_mask(int length) { switch (length) { @@ -1991,69 +1098,25 @@ 
const uint8_t *av1_get_obmc_mask(int length) { case 8: return obmc_mask_8; case 16: return obmc_mask_16; case 32: return obmc_mask_32; -#if CONFIG_EXT_PARTITION case 64: return obmc_mask_64; -#endif // CONFIG_EXT_PARTITION default: assert(0); return NULL; } } -#if CONFIG_NCOBMC -// obmc_mask_flipN[overlap_position] -static const uint8_t obmc_mask_flip1[1] = { 55 }; - -static const uint8_t obmc_mask_flip2[2] = { 62, 45 }; - -static const uint8_t obmc_mask_flip4[4] = { 64, 59, 50, 39 }; - -static const uint8_t obmc_mask_flip8[8] = { 64, 63, 61, 57, 53, 48, 42, 36 }; - -static const uint8_t obmc_mask_flip16[16] = { 64, 64, 64, 63, 61, 60, 58, 56, - 54, 52, 49, 46, 43, 40, 37, 34 }; - -static const uint8_t obmc_mask_flip32[32] = { 64, 64, 64, 64, 64, 63, 63, 62, - 62, 61, 60, 60, 59, 58, 57, 56, - 55, 53, 52, 51, 50, 48, 47, 45, - 44, 43, 41, 40, 38, 36, 35, 33 }; - -#if CONFIG_EXT_PARTITION -static const uint8_t obmc_mask_flip64[64] = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62, - 62, 62, 62, 61, 60, 60, 60, 60, 60, 59, 58, 58, 57, 57, 56, 56, - 56, 55, 54, 53, 52, 52, 51, 51, 51, 50, 49, 48, 47, 47, 46, 45, - 44, 44, 44, 43, 42, 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, 33, -}; -#endif // CONFIG_EXT_PARTITION - -const uint8_t *av1_get_obmc_mask_flipped(int length) { - switch (length) { - case 1: return obmc_mask_flip1; - case 2: return obmc_mask_flip2; - case 4: return obmc_mask_flip4; - case 8: return obmc_mask_flip8; - case 16: return obmc_mask_flip16; - case 32: return obmc_mask_flip32; -#if CONFIG_EXT_PARTITION - case 64: return obmc_mask_flip64; -#endif // CONFIG_EXT_PARTITION - default: assert(0); return NULL; - } -} -#endif // CONFIG_NCOBMC - static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc, - uint8_t mi_hw, MODE_INFO *mi, - void *fun_ctxt) { + uint8_t mi_hw, MB_MODE_INFO *mi, + void *fun_ctxt, const int num_planes) { (void)xd; (void)rel_mi_rc; (void)mi_hw; (void)mi; ++*(int *)fun_ctxt; + (void)num_planes; } void 
av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->overlappable_neighbors[0] = 0; mbmi->overlappable_neighbors[1] = 0; @@ -2066,21 +1129,17 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, &mbmi->overlappable_neighbors[1]); } -// HW does not support < 4x4 prediction. To limit the bandwidth requirement, for -// small blocks, only blend with neighbors from one side. If block-size of -// current plane is 4x4 or 8x4, the above neighbor (dir = 0) will be skipped. If -// it is 4x8, the left neighbor (dir = 1) will be skipped. +// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if +// block-size of current plane is smaller than 8x8, always only blend with the +// left neighbor(s) (skip blending with the above side). #define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable -int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, - int dir) { +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir) { assert(is_motion_variation_allowed_bsize(bsize)); - BLOCK_SIZE bsize_plane = - ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - if (bsize_plane < BLOCK_4X4) return 1; -#endif + const BLOCK_SIZE bsize_plane = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); switch (bsize_plane) { #if DISABLE_CHROMA_U8X8_OBMC case BLOCK_4X4: @@ -2095,6 +1154,13 @@ int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, } } +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + return; +} + struct obmc_inter_pred_ctxt { uint8_t **adjacent; int *adjacent_stride; @@ -2102,24 +1168,23 @@ struct obmc_inter_pred_ctxt { static INLINE void 
build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, - MODE_INFO *above_mi, - void *fun_ctxt) { + MB_MODE_INFO *above_mi, + void *fun_ctxt, + const int num_planes) { (void)above_mi; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; -#if CONFIG_HIGHBITDEPTH + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#endif // CONFIG_HIGHBITDEPTH const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; const int bh = overlap >> pd->subsampling_y; const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; - if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; const int dst_stride = pd->dst.stride; uint8_t *const dst = &pd->dst.buf[plane_col]; @@ -2127,37 +1192,34 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; const uint8_t *const mask = av1_get_obmc_mask(bh); -#if CONFIG_HIGHBITDEPTH if (is_hbd) aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, - tmp_stride, mask, bh, bw, xd->bd); + tmp_stride, mask, bw, bh, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, - mask, bh, bw); + mask, bw, bh); } } static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, - MODE_INFO *left_mi, - void *fun_ctxt) { + MB_MODE_INFO *left_mi, + void *fun_ctxt, + const int num_planes) { (void)left_mi; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; - const 
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; -#if CONFIG_HIGHBITDEPTH const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#endif // CONFIG_HIGHBITDEPTH - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = overlap >> pd->subsampling_x; const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y; const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; - if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; const int dst_stride = pd->dst.stride; uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; @@ -2165,14 +1227,12 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; const uint8_t *const mask = av1_get_obmc_mask(bw); -#if CONFIG_HIGHBITDEPTH if (is_hbd) aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, - tmp_stride, mask, bh, bw, xd->bd); + tmp_stride, mask, bw, bh, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, - mask, bh, bw); + mask, bw, bh); } } @@ -2186,86 +1246,41 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]) { - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; // handle above row struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; foreach_overlappable_nb_above(cm, xd, mi_col, - max_neighbor_obmc[b_width_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_wide_log2[bsize]], build_obmc_inter_pred_above, &ctxt_above); // handle left column 
struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; foreach_overlappable_nb_left(cm, xd, mi_row, - max_neighbor_obmc[b_height_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_high_log2[bsize]], build_obmc_inter_pred_left, &ctxt_left); } -void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { - if (is_interintra_pred(mbmi)) { - mbmi->ref_frame[1] = NONE_FRAME; - } else if (has_second_ref(mbmi) && - is_masked_compound_type(mbmi->interinter_compound_type)) { - mbmi->interinter_compound_type = COMPOUND_AVERAGE; - mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_COMPOUND_SINGLEREF - } else if (!has_second_ref(mbmi) && - is_inter_singleref_comp_mode(mbmi->mode)) { - // mbmi->mode = compound_ref0_mode(mbmi->mode); - mbmi->mode = compound_ref1_mode(mbmi->mode); - assert(is_inter_singleref_mode(mbmi->mode)); - mbmi->mv[0].as_int = mbmi->mv[1].as_int; -#endif // CONFIG_COMPOUND_SINGLEREF - } - if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME; - return; -} - -struct build_prediction_ctxt { - const AV1_COMMON *cm; - int mi_row; - int mi_col; - uint8_t **tmp_buf; - int *tmp_width; - int *tmp_height; - int *tmp_stride; - int mb_to_far_edge; -}; - -static INLINE void build_prediction_by_above_pred(MACROBLOCKD *xd, - int rel_mi_col, - uint8_t above_mi_width, - MODE_INFO *above_mi, - void *fun_ctxt) { - MB_MODE_INFO *above_mbmi = &above_mi->mbmi; +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes) { const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type); - struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; const int above_mi_col = ctxt->mi_col + rel_mi_col; - MB_MODE_INFO backup_mbmi = *above_mbmi; - modify_neighbor_predictor_for_obmc(above_mbmi); + av1_modify_neighbor_predictor_for_obmc(above_mbmi); - for (int j = 0; j < MAX_MB_PLANE; ++j) { + for (int j = 0; j < num_planes; ++j) 
{ struct macroblockd_plane *const pd = &xd->plane[j]; setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, NULL, pd->subsampling_x, pd->subsampling_y); } -#if CONFIG_COMPOUND_SINGLEREF - const int num_refs = 1 + is_inter_anyref_comp_mode(above_mbmi->mode); -#else const int num_refs = 1 + has_second_ref(above_mbmi); -#endif for (int ref = 0; ref < num_refs; ++ref) { -#if CONFIG_COMPOUND_SINGLEREF - const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi) - ? above_mbmi->ref_frame[ref] - : above_mbmi->ref_frame[0]; -#else const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; @@ -2274,31 +1289,37 @@ static INLINE void build_prediction_by_above_pred(MACROBLOCKD *xd, aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col, - &ref_buf->sf); + &ref_buf->sf, num_planes); } xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); xd->mb_to_right_edge = ctxt->mb_to_far_edge + (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; +} + +static INLINE void build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = ctxt->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; - int mi_x = above_mi_col << MI_SIZE_LOG2; - int mi_y = ctxt->mi_row << MI_SIZE_LOG2; + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, + above_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = ctxt->mi_row << MI_SIZE_LOG2; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = 
xd->mi[0]->sb_type; - for (int j = 0; j < MAX_MB_PLANE; ++j) { + for (int j = 0; j < num_planes; ++j) { const struct macroblockd_plane *pd = &xd->plane[j]; int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); - if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; - build_inter_predictors(ctxt->cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw, - bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y); } *above_mbmi = backup_mbmi; } @@ -2322,9 +1343,9 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, mi_col, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge }; - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; foreach_overlappable_nb_above(cm, xd, mi_col, - max_neighbor_obmc[b_width_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_wide_log2[bsize]], build_prediction_by_above_pred, &ctxt); xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -2332,40 +1353,27 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; } -static INLINE void build_prediction_by_left_pred(MACROBLOCKD *xd, - int rel_mi_row, - uint8_t left_mi_height, - MODE_INFO *left_mi, - void *fun_ctxt) { - MB_MODE_INFO *left_mbmi = &left_mi->mbmi; +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type); - struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; const int left_mi_row = ctxt->mi_row + rel_mi_row; - MB_MODE_INFO backup_mbmi = 
*left_mbmi; - modify_neighbor_predictor_for_obmc(left_mbmi); + av1_modify_neighbor_predictor_for_obmc(left_mbmi); - for (int j = 0; j < MAX_MB_PLANE; ++j) { + for (int j = 0; j < num_planes; ++j) { struct macroblockd_plane *const pd = &xd->plane[j]; setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, NULL, pd->subsampling_x, pd->subsampling_y); } -#if CONFIG_COMPOUND_SINGLEREF - const int num_refs = 1 + is_inter_anyref_comp_mode(left_mbmi->mode); -#else const int num_refs = 1 + has_second_ref(left_mbmi); -#endif for (int ref = 0; ref < num_refs; ++ref) { -#if CONFIG_COMPOUND_SINGLEREF - const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi) - ? left_mbmi->ref_frame[ref] - : left_mbmi->ref_frame[0]; -#else const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; @@ -2374,31 +1382,37 @@ static INLINE void build_prediction_by_left_pred(MACROBLOCKD *xd, aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col, - &ref_buf->sf); + &ref_buf->sf, num_planes); } xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; +} - int mi_x = ctxt->mi_col << MI_SIZE_LOG2; - int mi_y = left_mi_row << MI_SIZE_LOG2; +static INLINE void build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = ctxt->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + av1_setup_build_prediction_by_left_pred(xd, 
rel_mi_row, left_mi_height, + left_mbmi, ctxt, num_planes); + mi_x = ctxt->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - for (int j = 0; j < MAX_MB_PLANE; ++j) { + for (int j = 0; j < num_planes; ++j) { const struct macroblockd_plane *pd = &xd->plane[j]; int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; - if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; - build_inter_predictors(ctxt->cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y); } *left_mbmi = backup_mbmi; } @@ -2422,9 +1436,9 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, mi_col, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge }; - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; foreach_overlappable_nb_left(cm, xd, mi_row, - max_neighbor_obmc[b_height_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_high_log2[bsize]], build_prediction_by_left_pred, &ctxt); xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); @@ -2434,13 +1448,9 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { -#if CONFIG_HIGHBITDEPTH + const int num_planes = av1_num_planes(cm); DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH uint8_t 
*dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; @@ -2449,7 +1459,6 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); @@ -2459,434 +1468,25 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); } else { -#endif // CONFIG_HIGHBITDEPTH dst_buf1[0] = tmp_buf1; dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; dst_buf2[0] = tmp_buf2; dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, dst_width1, dst_height1, dst_stride1); av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, dst_width2, dst_height2, dst_stride2); - av1_setup_dst_planes(xd->plane, xd->mi[0]->mbmi.sb_type, - get_frame_new_buffer(cm), mi_row, mi_col); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), + mi_row, mi_col, 0, num_planes); av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, dst_buf2, dst_stride2); } -#if CONFIG_NCOBMC -void av1_build_prediction_by_bottom_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; -#if 
CONFIG_DEBUG - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; -#endif - int i, j, mi_step, ref; - const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); - int mb_to_right_edge_base = xd->mb_to_right_edge; - - if (mi_row + xd->n8_h >= tile->mi_row_end || - (mi_row + xd->n8_h) % MI_SIZE == 0 || (mi_row + xd->n8_h) >= cm->mi_rows) - return; - assert(bsize >= BLOCK_8X8); - - xd->mb_to_top_edge -= xd->n8_h * 32; - for (i = 0; i < ilimit; i += mi_step) { - int mi_row_offset = xd->n8_h; - int mi_col_offset = i; - int mi_x, mi_y, bw, bh; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; - MB_MODE_INFO backup_mbmi; - - mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]); - - if (!is_neighbor_overlappable(mbmi)) continue; - - backup_mbmi = *mbmi; - modify_neighbor_predictor_for_obmc(mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j], - tmp_width[j], tmp_height[j], tmp_stride[j], - (xd->n8_h >> 1), i, NULL, pd->subsampling_x, - pd->subsampling_y); - } - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + (xd->n8_h >> 1), - mi_col + i, &ref_buf->sf); - } - - xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8); - xd->mb_to_right_edge = - mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64; - mi_x = (mi_col + i) << MI_SIZE_LOG2; - mi_y = (mi_row << MI_SIZE_LOG2) + xd->n8_h * (MI_SIZE >> 1); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - bw = (mi_step << MI_SIZE_LOG2) >> 
pd->subsampling_x; - bh = (xd->n8_h << (MI_SIZE_LOG2 - 1)) >> pd->subsampling_y; - - if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) { - const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type; - const int have_vsplit = bp != PARTITION_HORZ; - const int have_hsplit = bp != PARTITION_VERT; - const int num_4x4_w = 2 >> (!have_vsplit); - const int num_4x4_h = 2 >> (!have_hsplit); - const int pw = 8 >> (have_vsplit + pd->subsampling_x); - int x, y; - - for (y = 0; y < num_4x4_h; ++y) - for (x = 0; x < num_4x4_w; ++x) { - if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT) && y != 0) - continue; - - build_inter_predictors(cm, xd, j, mi, 1, y * 2 + x, bw, bh, - (4 * x) >> pd->subsampling_x, - xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, - pw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - } else { - build_inter_predictors(cm, xd, j, mi, 1, 0, bw, bh, 0, - xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, bw, - bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - } - *mbmi = backup_mbmi; - } - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = mb_to_right_edge_base; - xd->mb_to_top_edge += xd->n8_h * 32; -} - -void av1_build_prediction_by_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - const int tmp_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; -#if CONFIG_DEBUG - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; -#endif - int i, j, mi_step, ref; - const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); - int mb_to_bottom_edge_base = xd->mb_to_bottom_edge; - - if (mi_col + xd->n8_w >= tile->mi_col_end || - (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols) - return; - - assert(bsize >= BLOCK_8X8); - - xd->mb_to_left_edge -= xd->n8_w / 2 * MI_SIZE * 8; - for (i = 0; i < ilimit; i += mi_step) { - int mi_row_offset = i; - int 
mi_col_offset = xd->n8_w; - int mi_x, mi_y, bw, bh; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; - MB_MODE_INFO backup_mbmi; - - mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]); - - if (!is_neighbor_overlappable(mbmi)) continue; - - backup_mbmi = *mbmi; - modify_neighbor_predictor_for_obmc(mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j], - tmp_width[j], tmp_height[j], tmp_stride[j], i, - xd->n8_w >> 1, NULL, pd->subsampling_x, - pd->subsampling_y); - } - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, - mi_col + (xd->n8_w >> 1), &ref_buf->sf); - } - - xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8); - xd->mb_to_bottom_edge = - mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * MI_SIZE * 8; - mi_x = (mi_col << MI_SIZE_LOG2) + xd->n8_w * (MI_SIZE >> 1); - mi_y = (mi_row + i) << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - bw = (xd->n8_w << (MI_SIZE_LOG2 - 1)) >> pd->subsampling_x; - bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y; - - if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) { - const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type; - const int have_vsplit = bp != PARTITION_HORZ; - const int have_hsplit = bp != PARTITION_VERT; - const int num_4x4_w = 2 >> (!have_vsplit); - const int num_4x4_h = 2 >> (!have_hsplit); - const int ph = 8 >> (have_hsplit + pd->subsampling_y); - int x, y; - - for (y = 0; y < num_4x4_h; ++y) - 
for (x = 0; x < num_4x4_w; ++x) { - if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT) && x != 0) - continue; - - build_inter_predictors(cm, xd, j, mi, 1, y * 2 + x, bw, bh, - xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0, - (4 * y) >> pd->subsampling_y, bw, ph, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - } else { - build_inter_predictors(cm, xd, j, mi, 1, 0, bw, bh, - xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0, 0, - bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - } - *mbmi = backup_mbmi; - } - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = mb_to_bottom_edge_base; - xd->mb_to_left_edge += xd->n8_w / 2 * MI_SIZE * 8; -} - -// This function combines motion compensated predictions that is generated by -// bottom/right neighboring blocks' inter predictors with prediction in dst -// buffer. -void av1_merge_dst_bottom_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *bottom[MAX_MB_PLANE], - const int bottom_stride[MAX_MB_PLANE], - uint8_t *right[MAX_MB_PLANE], - const int right_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - int plane, i, mi_step; - const int bottom_available = mi_row + xd->n8_h < tile->mi_row_end && - (mi_row + xd->n8_h) % MI_SIZE != 0 && - (mi_row + xd->n8_h) < cm->mi_rows; -#if CONFIG_HIGHBITDEPTH - int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; -#endif // CONFIG_HIGHBITDEPTH - - // handle bottom row - for (i = 0; bottom_available && i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); - i += mi_step) { - int mi_row_offset = xd->n8_h; - int mi_col_offset = i; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; - int overlap; - - mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]); - - if (!is_neighbor_overlappable(mbmi)) continue; - - overlap = num_4x4_blocks_high_lookup[bsize] << 1; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x; - const int bh = overlap >> pd->subsampling_y; - const int dst_stride = pd->dst.stride; - uint8_t *dst = - &pd->dst.buf[((i * MI_SIZE) >> pd->subsampling_x) + - (((xd->n8_h * MI_SIZE - overlap) * dst_stride) >> - pd->subsampling_y)]; - const int tmp_stride = bottom_stride[plane]; - const uint8_t *const tmp = - &bottom[plane][((i * MI_SIZE) >> pd->subsampling_x) + - (((xd->n8_h * MI_SIZE - overlap) * tmp_stride) >> - pd->subsampling_y)]; - const uint8_t *const mask = av1_get_obmc_mask_flipped(bh); - -#if CONFIG_HIGHBITDEPTH - if (is_hbd) - aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, - tmp_stride, mask, bh, bw, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, - mask, bh, bw); - } - } // each mi in the bottom row - - // handle right column - if (mi_col + xd->n8_w >= tile->mi_col_end || - (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols) - return; - - for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) { - int mi_row_offset = i; - int mi_col_offset = xd->n8_w; - int overlap; - MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - MB_MODE_INFO *mbmi = &mi->mbmi; - - mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]); - - if 
(!is_neighbor_overlappable(mbmi)) continue; - - overlap = num_4x4_blocks_wide_lookup[bsize] << 1; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = overlap >> pd->subsampling_x; - const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y; - const int dst_stride = pd->dst.stride; - uint8_t *dst = - &pd->dst.buf[((i * MI_SIZE * dst_stride) >> pd->subsampling_y) + - ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)]; - const int tmp_stride = right_stride[plane]; - const uint8_t *const tmp = - &right[plane][((i * MI_SIZE * tmp_stride) >> pd->subsampling_y) + - ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)]; - const uint8_t *const mask = av1_get_obmc_mask_flipped(bw); - -#if CONFIG_HIGHBITDEPTH - if (is_hbd) - aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, - tmp_stride, mask, bh, bw, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, - mask, bh, bw); - } - } // each mi in the right column -} - -// This function generates 4 sided obmc. (1) Prediction blocks generated by -// bottom and right motion vectors are calculated. (2) Combine them with the -// original prediction block (which should be pre-stored in xd->plane[].dst.buf -// before calling this function). The results is updated in xd->plane[].dst.buf -// (3) Call causal obmc prediction function, which will generate left and above -// preds, and then merge them and xd->plane[].dst.buf. 
-void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; - int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); - } else { -#endif // CONFIG_HIGHBITDEPTH - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; - dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; - dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - // TODO(zoeliu): COMPOUND_SINGLEREF has not worked with NCOBMC yet. 
- av1_build_prediction_by_bottom_preds(cm, xd, mi_row, mi_col, dst_buf1, - dst_width1, dst_height1, dst_stride1); - av1_build_prediction_by_right_preds(cm, xd, mi_row, mi_col, dst_buf2, - dst_width2, dst_height2, dst_stride2); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - av1_merge_dst_bottom_right_preds(cm, xd, mi_row, mi_col, dst_buf1, - dst_stride1, dst_buf2, dst_stride2); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); -} -#endif // CONFIG_NCOBMC - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -void reset_xd_boundary(MACROBLOCKD *xd, int mi_row, int bh, int mi_col, int bw, - int mi_rows, int mi_cols) { - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; -} -void set_sb_mi_boundaries(const AV1_COMMON *const cm, MACROBLOCKD *const xd, - const int mi_row, const int mi_col) { - const BLOCK_SIZE sb = cm->sb_size; - const int num_mi_w = mi_size_wide[sb]; - const int num_mi_h = mi_size_high[sb]; - - xd->sb_mi_bd.mi_col_begin = mi_col; - xd->sb_mi_bd.mi_row_begin = mi_row; - // points to the last mi - xd->sb_mi_bd.mi_col_end = - mi_col + num_mi_w > cm->mi_cols ? cm->mi_cols - 1 : mi_col + num_mi_w - 1; - xd->sb_mi_bd.mi_row_end = - mi_row + num_mi_h > cm->mi_rows ? 
cm->mi_rows - 1 : mi_row + num_mi_h - 1; -} -#endif - -#endif // CONFIG_MOTION_VAR - /* clang-format off */ -#if CONFIG_INTERINTRA -#if CONFIG_EXT_PARTITION -static const int ii_weights1d[MAX_SB_SIZE] = { +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, @@ -2895,103 +1495,82 @@ static const int ii_weights1d[MAX_SB_SIZE] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; -static int ii_size_scales[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 32, 32, 32, -#endif +static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { 32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1, - 16, 16, 8, 8, 4, 4, 2, 2 -}; -#else -static const int ii_weights1d[MAX_SB_SIZE] = { - 60, 56, 52, 48, 45, 42, 39, 37, 34, 32, 30, 28, 26, 24, 22, 21, - 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, - 6, 6, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, - 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; -static int ii_size_scales[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 16, 16, 16, -#endif - 16, 8, 8, 8, 4, 4, 4, - 2, 2, 2, 1, 1, 1, - 8, 8, 4, 4, 2, 2, + 8, 8, 4, 4, 2, 2 }; /* clang-format on */ -#endif // CONFIG_EXT_PARTITION -static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, - int wedge_index, int wedge_sign, - BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, - uint8_t *comppred, int compstride, - const uint8_t *interpred, int interstride, - const uint8_t *intrapred, int intrastride) { +static void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const int size_scale = 
ii_size_scales[plane_bsize]; - int i, j; - - if (use_wedge_interintra) { - if (is_interintra_wedge_used(bsize)) { - const uint8_t *mask = - av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; - const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; - aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, - interpred, interstride, mask, block_size_wide[bsize], - bh, bw, subh, subw); - } - return; - } switch (mode) { case II_V_PRED: for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[i * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; } break; case II_H_PRED: for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[j * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; } break; case II_SMOOTH_PRED: for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[(i < j ? i : j) * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? 
i : j) * size_scale]; + mask += stride; } break; case II_DC_PRED: default: for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - comppred[i * compstride + j] = AOM_BLEND_AVG( - intrapred[i * intrastride + j], interpred[i * interstride + j]); - } + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; } break; } } -#if CONFIG_HIGHBITDEPTH +static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, + int wedge_index, int wedge_sign, + BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, + const uint8_t *interpred, int interstride, + const uint8_t *intrapred, int intrastride) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (is_interintra_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subw = 2 * mi_size_wide[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, + interpred, interstride, mask, block_size_wide[bsize], + bw, bh, subw, subh); + } + return; + } + + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, + interstride, mask, bw, bw, bh, 0, 0); +} + static void combine_interintra_highbd( INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index, int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, @@ -2999,72 +1578,26 @@ static void combine_interintra_highbd( int interstride, const uint8_t *intrapred8, int intrastride, int bd) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; - const int size_scale = ii_size_scales[plane_bsize]; - int i, j; - - uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8); - const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8); - const uint16_t *intrapred = 
CONVERT_TO_SHORTPTR(intrapred8); if (use_wedge_interintra) { if (is_interintra_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; - const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + const int subw = 2 * mi_size_wide[bsize] == bw; aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, interpred8, interstride, mask, - block_size_wide[bsize], bh, bw, subh, subw, bd); + block_size_wide[bsize], bw, bh, subw, subh, bd); } return; } - switch (mode) { - case II_V_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[i * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } - } - break; - - case II_H_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[j * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } - } - break; - - case II_SMOOTH_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - int scale = ii_weights1d[(i < j ? 
i : j) * size_scale]; - comppred[i * compstride + j] = - AOM_BLEND_A64(scale, intrapred[i * intrastride + j], - interpred[i * interstride + j]); - } - } - break; - - case II_DC_PRED: - default: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) { - comppred[i * compstride + j] = AOM_BLEND_AVG( - interpred[i * interstride + j], intrapred[i * intrastride + j]); - } - } - break; - } + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, bw, bw, bh, 0, 0, + bd); } -#endif // CONFIG_HIGHBITDEPTH void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, @@ -3072,42 +1605,46 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, BUFFER_SET *ctx, uint8_t *dst, int dst_stride) { struct macroblockd_plane *const pd = &xd->plane[plane]; - BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]); - PREDICTION_MODE mode = - interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode]; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); + PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; + xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0; + xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0; + xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0; + xd->mi[0]->use_intrabc = 0; - av1_predict_intra_block(cm, xd, pd->width, pd->height, plane_bsize, mode, - ctx->plane[plane], ctx->stride[plane], dst, - dst_stride, 0, 0, plane); + av1_predict_intra_block(cm, xd, pd->width, pd->height, + max_txsize_rect_lookup[plane_bsize], mode, 0, 0, + FILTER_INTRA_MODES, ctx->plane[plane], + ctx->stride[plane], dst, dst_stride, 0, 0, plane); } void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const 
uint8_t *intra_pred, int intra_stride) { - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]); -#if CONFIG_HIGHBITDEPTH + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { combine_interintra_highbd( - xd->mi[0]->mbmi.interintra_mode, xd->mi[0]->mbmi.use_wedge_interintra, - xd->mi[0]->mbmi.interintra_wedge_index, - xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize, - xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred, - inter_stride, intra_pred, intra_stride, xd->bd); + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, + bsize, plane_bsize, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, + intra_stride, xd->bd); return; } -#endif // CONFIG_HIGHBITDEPTH - combine_interintra(xd->mi[0]->mbmi.interintra_mode, - xd->mi[0]->mbmi.use_wedge_interintra, - xd->mi[0]->mbmi.interintra_wedge_index, - xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize, - xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, - inter_pred, inter_stride, intra_pred, intra_stride); + combine_interintra( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, + bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride); } void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *ypred, int ystride, BUFFER_SET *ctx, BLOCK_SIZE bsize) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( @@ -3116,7 +1653,6 @@ void 
av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); return; } -#endif // CONFIG_HIGHBITDEPTH { DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx, @@ -3130,7 +1666,6 @@ void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, int ustride, BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( @@ -3138,10 +1673,7 @@ void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd, MAX_SB_SIZE); av1_combine_interintra(xd, bsize, plane, upred, ustride, CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE); - return; - } -#endif // CONFIG_HIGHBITDEPTH - { + } else { DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, uintrapredictor, MAX_SB_SIZE); @@ -3167,966 +1699,119 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride, ctx, bsize); } -#endif // CONFIG_INTERINTRA // Builds the inter-predictor for the single ref case // for use in the encoder to search the wedges efficiently. 
static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, - int block, int bw, int bh, int x, - int y, int w, int h, int mi_x, - int mi_y, int ref, - uint8_t *const ext_dst, - int ext_dst_stride) { + int bw, int bh, int x, int y, + int w, int h, int mi_x, int mi_y, + int ref, uint8_t *const ext_dst, + int ext_dst_stride, + int can_use_previous) { struct macroblockd_plane *const pd = &xd->plane[plane]; - const MODE_INFO *mi = xd->mi[0]; + const MB_MODE_INFO *mi = xd->mi[0]; const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; -#if CONFIG_HIGHBITDEPTH + const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; uint8_t *const dst = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? CONVERT_TO_BYTEPTR(ext_dst) - : ext_dst) + - ext_dst_stride * y + x; -#else - uint8_t *const dst = ext_dst + ext_dst_stride * y + x; -#endif - const MV mv = mi->mbmi.sb_type < BLOCK_8X8 - ? average_split_mvs(pd, mi, ref, block) - : mi->mbmi.mv[ref].as_mv; + (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x; + const MV mv = mi->mv[ref].as_mv; - uint8_t *pre; - int xs, ys, subpel_x, subpel_y; - const int is_scaled = av1_is_scaled(sf); - ConvolveParams conv_params = get_conv_params(ref, 0, plane); -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); WarpTypesAllowed warp_types; -#if CONFIG_GLOBAL_MOTION -#if CONFIG_COMPOUND_SINGLEREF - WarpedMotionParams *const wm = - mi->mbmi.ref_frame[ref] > 0 ? 
&xd->global_motion[mi->mbmi.ref_frame[ref]] - : &xd->global_motion[mi->mbmi.ref_frame[0]]; -#else // !(CONFIG_COMPOUND_SINGLEREF) - WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]]; -#endif // CONFIG_COMPOUND_SINGLEREF - warp_types.global_warp_allowed = is_global_mv_block(mi, block, wm->wmtype); -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - - if (is_scaled) { - int ssx = pd->subsampling_x; - int ssy = pd->subsampling_y; - int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS); - orig_pos_y += mv.row * (1 << (1 - ssy)); - int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS); - orig_pos_x += mv.col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS); - const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - subpel_x = pos_x & SCALE_SUBPEL_MASK; - subpel_y = pos_y & SCALE_SUBPEL_MASK; - xs = sf->x_step_q4; - ys = sf->y_step_q4; - } else { - const MV mv_q4 = clamp_mv_to_umv_border_sb( - xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); - xs = ys = SCALE_SUBPEL_SHIFTS; - subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; - subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + - (x + (mv_q4.col >> SUBPEL_BITS)); - } + 
const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + const int pre_x = (mi_x) >> pd->subsampling_x; + const int pre_y = (mi_y) >> pd->subsampling_y; + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); - av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x, - subpel_y, sf, w, h, &conv_params, - mi->mbmi.interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, plane, ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR - mi, 0, -#endif - xs, ys, xd); + av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, + &subpel_params, sf, w, h, &conv_params, + mi->interp_filters, &warp_types, pre_x + x, + pre_y + y, plane, ref, mi, 0, xd, can_use_previous); } void av1_build_inter_predictors_for_planes_single_buf( MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]) { + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous) { int plane; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, &xd->plane[plane]); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; - - if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) { - int x, y; - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = 
num_4x4_blocks_high_lookup[plane_bsize]; - assert(bsize == BLOCK_8X8); -#if CONFIG_COMPOUND_SINGLEREF - assert(has_second_ref(&xd->mi[0]->mbmi) || - !is_inter_singleref_comp_mode(xd->mi[0]->mbmi.mode)); -#endif // CONFIG_COMPOUND_SINGLEREF - for (y = 0; y < num_4x4_h; ++y) - for (x = 0; x < num_4x4_w; ++x) - build_inter_predictors_single_buf( - xd, plane, y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y, ref, - ext_dst[plane], ext_dst_stride[plane]); - } else { - build_inter_predictors_single_buf(xd, plane, 0, bw, bh, 0, 0, bw, bh, - mi_x, mi_y, ref, ext_dst[plane], - ext_dst_stride[plane]); - } + build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x, + mi_y, ref, ext_dst[plane], + ext_dst_stride[plane], can_use_previous); } } static void build_wedge_inter_predictor_from_buf( - MACROBLOCKD *xd, int plane, int x, int y, int w, int h, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - uint8_t *ext_dst0, int ext_dst_stride0, uint8_t *ext_dst1, - int ext_dst_stride1) { - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - const INTERINTER_COMPOUND_DATA comp_data = { -#if CONFIG_WEDGE - mbmi->wedge_index, - mbmi->wedge_sign, -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - mbmi->mask_type, - xd->seg_mask, -#endif // CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type - }; - -#if CONFIG_COMPOUND_SINGLEREF - if ((is_compound || is_inter_singleref_comp_mode(mbmi->mode)) && - is_masked_compound_type(mbmi->interinter_compound_type)) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_compound && 
is_masked_compound_type(mbmi->interinter_compound_type)) -#endif // CONFIG_COMPOUND_SINGLEREF - { -#if CONFIG_COMPOUND_SEGMENT - if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) { -#if CONFIG_HIGHBITDEPTH + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_compound_seg_mask_highbd( - comp_data.seg_mask, comp_data.mask_type, + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, mbmi->sb_type, h, w, - xd->bd); + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH - build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, - ext_dst0, ext_dst_stride0, ext_dst1, - ext_dst_stride1, mbmi->sb_type, h, w); + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); } -#endif // CONFIG_COMPOUND_SEGMENT -#if CONFIG_SUPERTX -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_wedge_extend_highbd( - dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data, - mbmi->sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd); - else -#endif // CONFIG_HIGHBITDEPTH - build_masked_compound_wedge_extend( - dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1, - ext_dst_stride1, &comp_data, mbmi->sb_type, wedge_offset_x, - wedge_offset_y, h, w); -#else // !CONFIG_SUPERTX -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) build_masked_compound_highbd( dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - 
CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->sb_type, h, w, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, - ext_dst1, ext_dst_stride1, &comp_data, - mbmi->sb_type, h, w); -#endif // CONFIG_SUPERTX + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); } else { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0, NULL, 0, w, h); } } -void av1_build_wedge_inter_predictor_from_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3], - int ext_dst_stride1[3]) { +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]) { int plane; for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, &xd->plane[plane]); - - if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) { - int x, y; - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - assert(bsize == BLOCK_8X8); - for (y = 0; y < num_4x4_h; ++y) - for (x = 0; x < num_4x4_w; ++x) - build_wedge_inter_predictor_from_buf( - xd, plane, 4 * x, 4 * y, 4, 4, -#if CONFIG_SUPERTX - wedge_offset_x, wedge_offset_y, -#endif // CONFIG_SUPERTX - ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane], - ext_dst_stride1[plane]); - } else { - const int bw = 
block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - build_wedge_inter_predictor_from_buf( - xd, plane, 0, 0, bw, bh, -#if CONFIG_SUPERTX - wedge_offset_x, wedge_offset_y, -#endif // CONFIG_SUPERTX - ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane], - ext_dst_stride1[plane]); - } - } -} -#if CONFIG_NCOBMC_ADAPT_WEIGHT - -void alloc_ncobmc_pred_buffer(MACROBLOCKD *const xd) { - int i; - // allocate interpolated prediction buffer - for (i = 0; i < MAX_MB_PLANE; ++i) { - xd->ncobmc_pred_buf[i] = (uint8_t *)malloc(sizeof(uint8_t) * MAX_SB_SQUARE); - av1_zero_array(xd->ncobmc_pred_buf[i], MAX_SB_SQUARE); - xd->ncobmc_pred_buf_stride[i] = MAX_SB_SIZE; - } -} - -void free_ncobmc_pred_buffer(MACROBLOCKD *const xd) { - for (int i = 0; i < MAX_MB_PLANE; ++i) free(xd->ncobmc_pred_buf[i]); -} - -void get_pred_from_intrpl_buf(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize, int plane) { - uint8_t *dst = xd->plane[plane].dst.buf; - int ds = xd->plane[plane].dst.stride; - int ss_x = xd->plane[plane].subsampling_x; - int ss_y = xd->plane[plane].subsampling_y; - - const int ip_wide = mi_size_wide[bsize] * MI_SIZE >> ss_x; - const int ip_high = mi_size_high[bsize] * MI_SIZE >> ss_y; - // relative coordinates of this MI in the superblock - int row_rlt = (mi_row - xd->sb_mi_bd.mi_row_begin) * MI_SIZE >> ss_y; - int col_rlt = (mi_col - xd->sb_mi_bd.mi_col_begin) * MI_SIZE >> ss_x; - int s = xd->ncobmc_pred_buf_stride[plane]; - int r, c; - - for (r = 0; r < ip_high; ++r) { - for (c = 0; c < ip_wide; ++c) { - dst[r * ds + c] = - xd->ncobmc_pred_buf[plane][(r + row_rlt) * s + c + col_rlt]; - } - } -} -// scaling factors for ncobmc kernels -#define KERNEL_SCALE_LOG 14 - -void build_ncobmc_intrpl_pred(const AV1_COMMON *const cm, MACROBLOCKD *xd, - int plane, int pxl_row, int pxl_col, - BLOCK_SIZE bsize, uint8_t *preds[][MAX_MB_PLANE], - int stride[MAX_MB_PLANE], // pred buffer strides - int mode) { - const ADAPT_OVERLAP_BLOCK ao_block = 
adapt_overlap_block_lookup[bsize]; - const NCOBMC_KERNELS *const knls = &cm->ncobmc_kernels[ao_block][mode]; - const int wide = mi_size_wide[bsize] * MI_SIZE; - const int high = mi_size_high[bsize] * MI_SIZE; - const int s = stride[plane]; - const int ss_x = xd->plane[plane].subsampling_x; - const int ss_y = xd->plane[plane].subsampling_y; - int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y; - int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x; - int dst_stride = xd->ncobmc_pred_buf_stride[plane]; - int dst_offset = row_offset * dst_stride + col_offset; - -#if CONFIG_HIGHBITDEPTH - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH - - int r, c, k_r, k_c; - int64_t tmp; - - for (r = 0; r < (high >> ss_x); ++r) { - for (c = 0; c < (wide >> ss_y); ++c) { - int pos = r * s + c; - int q_tmp; - uint8_t val; - - // TODO(weitinglin): find out the optimal sub-sampling patterns for - // chroma - k_r = (r << ss_y) + ss_y; - k_c = (c << ss_x) + ss_x; - if (ss_y && k_r >= high) k_r -= 1; - if (ss_x && k_c >= wide) k_c -= 1; - - if (!is_hbd) { - uint8_t *tmp_p[4]; - int i; - for (i = 0; i < 4; ++i) tmp_p[i] = preds[i][plane]; - - tmp = 0; - for (i = 0; i < 4; ++i) - tmp += knls->KERNEL[i][k_r][k_c] * tmp_p[i][pos]; - - } else { - uint16_t *tmp_p[4]; - int i; - for (i = 0; i < 4; ++i) tmp_p[i] = CONVERT_TO_SHORTPTR(preds[i][plane]); - - tmp = 0; - for (i = 0; i < 4; ++i) - tmp += knls->KERNEL[i][k_r][k_c] * tmp_p[i][pos]; - } - - q_tmp = (tmp <= 0) ? 
0 : ROUND_POWER_OF_TWO(tmp, KERNEL_SCALE_LOG); - val = clip_pixel(q_tmp); - - xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] = val; - - assert(r * dst_stride + c + dst_offset < MAX_SB_SQUARE); - } - } -} - -void get_pred_by_horz_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; - const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge; - const int mb_to_top_edge_base = xd->mb_to_top_edge; - const int mb_to_left_edge_base = xd->mb_to_left_edge; - const int mb_to_right_edge_base = xd->mb_to_right_edge; - int overlappable_offset = -1; - const int mi_nums = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row); - - int i, j, mi_step, ref; - - xd->mb_to_right_edge += mi_size_wide[bsize] * MI_SIZE * 4; - - // build from left neighbors - for (i = 0; i < mi_nums; i += mi_step) { - int mi_row_offset = i; - int mi_col_offset = -1; - int mi_x, mi_y, bw, bh; - MODE_INFO *left_mi; - MB_MODE_INFO *left_mbmi, backup_mbmi; - BLOCK_SIZE l_bsize; - - // create the original prediction if offset exceeds the boundary - if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) mi_col_offset = 0; - - left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - left_mbmi = &left_mi->mbmi; - l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8); - - mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]); - - // reset the mi if it is not overlappble - if (!is_neighbor_overlappable(left_mbmi)) { - // use left_mbmi->sb_type instead of l_bsize to handle - // sub8x8 cases - int search_mi_step = mi_size_high[left_mbmi->sb_type]; - while (!is_neighbor_overlappable(left_mbmi)) { - mi_row_offset += search_mi_step; - if (mi_row_offset < mi_nums) { - left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - left_mbmi = &left_mi->mbmi; - search_mi_step = mi_size_high[left_mbmi->sb_type]; - } else { - if (overlappable_offset >= 0) { - mi_row_offset = 
overlappable_offset; - } else { - mi_row_offset = 0; - mi_col_offset = 0; - } - left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - left_mbmi = &left_mi->mbmi; - break; - } - } - } else { - // update the available overlappable mi - overlappable_offset = mi_row_offset; - } - - backup_mbmi = *left_mbmi; - modify_neighbor_predictor_for_obmc(left_mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, l_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE, - dst_stride[j], i, 0, NULL, pd->subsampling_x, - pd->subsampling_y); - } -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(left_mbmi->mode)); - ++ref) { - const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi) - ? left_mbmi->ref_frame[ref] - : left_mbmi->ref_frame[0]; -#else // !(CONFIG_COMPOUND_SINGLEREF) - for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col, - &ref_buf->sf); - } - xd->mb_to_top_edge = -((mi_row + i) * MI_SIZE * 8); - xd->mb_to_bottom_edge = - mb_to_bottom_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8; - mi_x = mi_col << MI_SIZE_LOG2; - mi_y = (mi_row + i) << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - bw = mi_size_wide[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x; - bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y; - - build_inter_predictors(cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - *left_mbmi = 
backup_mbmi; - } - - // build from right neighbors - xd->mb_to_right_edge = mb_to_right_edge_base; - xd->mb_to_left_edge -= mi_size_wide[bsize] * MI_SIZE * 4; - - overlappable_offset = -1; - - for (i = 0; i < mi_nums; i += mi_step) { - int mi_row_offset = i; - int mi_col_offset = mi_size_wide[bsize]; - int mi_x, mi_y, bw, bh; - int mi_col_shift = mi_size_wide[bsize] >> 1; - MODE_INFO *right_mi; - MB_MODE_INFO *right_mbmi, backup_mbmi; - BLOCK_SIZE r_bsize; - - // create the original prediction if offset exceeds the boundary - if (mi_col + mi_col_offset > xd->sb_mi_bd.mi_col_end) mi_col_offset = 0; - - right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - right_mbmi = &right_mi->mbmi; - r_bsize = AOMMAX(right_mbmi->sb_type, BLOCK_8X8); - - mi_step = AOMMIN(mi_nums, mi_size_high[r_bsize]); - - if (!is_neighbor_overlappable(right_mbmi)) { - int search_mi_step = mi_size_high[right_mbmi->sb_type]; - while (!is_neighbor_overlappable(right_mbmi)) { - mi_row_offset += search_mi_step; - if (mi_row_offset < mi_nums) { - right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - right_mbmi = &right_mi->mbmi; - search_mi_step = mi_size_high[right_mbmi->sb_type]; - } else { - if (overlappable_offset >= 0) { - mi_row_offset = overlappable_offset; - } else { - mi_row_offset = 0; - mi_col_offset = 0; - } - right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - right_mbmi = &right_mi->mbmi; - break; - } - } - } else { - overlappable_offset = mi_row_offset; - } - - backup_mbmi = *right_mbmi; - modify_neighbor_predictor_for_obmc(right_mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, r_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE, - dst_stride[j], i, mi_col_shift, NULL, pd->subsampling_x, - pd->subsampling_y); - } -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(right_mbmi->mode)); - ++ref) { - const MV_REFERENCE_FRAME frame = 
has_second_ref(right_mbmi) - ? right_mbmi->ref_frame[ref] - : right_mbmi->ref_frame[0]; -#else // !(CONFIG_COMPOUND_SINGLEREF) - for (ref = 0; ref < 1 + has_second_ref(right_mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = right_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, - mi_col + mi_col_shift, &ref_buf->sf); - } - xd->mb_to_top_edge = -((mi_row + i) * MI_SIZE * 8); - xd->mb_to_bottom_edge = - mb_to_bottom_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8; - mi_x = (mi_col + mi_col_shift) << MI_SIZE_LOG2; - mi_y = (mi_row + i) << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - bw = mi_size_wide[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x; - bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y; - - build_inter_predictors(cm, xd, j, right_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - - *right_mbmi = backup_mbmi; - } - - // restore the boundaries - xd->mb_to_top_edge = mb_to_top_edge_base; - xd->mb_to_bottom_edge = mb_to_bottom_edge_base; - xd->mb_to_left_edge = mb_to_left_edge_base; - xd->mb_to_right_edge = mb_to_right_edge_base; -} - -void get_pred_by_vert_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; - const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge; - const int mb_to_top_edge_base = xd->mb_to_top_edge; - const int mb_to_left_edge_base = xd->mb_to_left_edge; - const int mb_to_right_edge_base = xd->mb_to_right_edge; - int overlappable_offset = -1; - const 
int mi_nums = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col); - - int i, j, mi_step, ref; - - xd->mb_to_bottom_edge += mi_nums * MI_SIZE * 4; - - // build from above neighbors - for (i = 0; i < mi_nums; i += mi_step) { - int mi_row_offset = -1; - int mi_col_offset = i; - int mi_x, mi_y, bw, bh; - MODE_INFO *above_mi; - MB_MODE_INFO *above_mbmi, backup_mbmi; - BLOCK_SIZE a_bsize; - - // create the original prediction if offset exceeds the boundary - if (mi_row <= tile->mi_row_start) mi_row_offset = 0; - - above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - above_mbmi = &above_mi->mbmi; - a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8); - - mi_step = AOMMIN(mi_nums, mi_size_high[a_bsize]); - - // reset the mi if it is not overlappble - if (!is_neighbor_overlappable(above_mbmi)) { - int search_mi_step = mi_size_high[above_mbmi->sb_type]; - // backward search - while (!is_neighbor_overlappable(above_mbmi)) { - mi_col_offset += search_mi_step; - if (mi_col_offset < mi_nums) { - above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - above_mbmi = &above_mi->mbmi; - search_mi_step = mi_size_high[above_mbmi->sb_type]; - } else { - if (overlappable_offset >= 0) { - mi_col_offset = overlappable_offset; - } else { - mi_row_offset = 0; - mi_col_offset = 0; - } - above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - above_mbmi = &above_mi->mbmi; - break; - } - } - } else { - // update the available overlappable mi - overlappable_offset = mi_col_offset; - } - - backup_mbmi = *above_mbmi; - modify_neighbor_predictor_for_obmc(above_mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, a_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE, - dst_stride[j], 0, i, NULL, pd->subsampling_x, - pd->subsampling_y); - } -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(above_mbmi->mode)); - ++ref) { - const MV_REFERENCE_FRAME frame = 
has_second_ref(above_mbmi) - ? above_mbmi->ref_frame[ref] - : above_mbmi->ref_frame[0]; -#else // !(CONFIG_COMPOUND_SINGLEREF) - for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i, - &ref_buf->sf); - } - - xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8); - xd->mb_to_right_edge = - mb_to_right_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8; - mi_x = (mi_col + i) << MI_SIZE_LOG2; - mi_y = mi_row << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - - bh = mi_size_high[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x; - bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y; - - build_inter_predictors(cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - - *above_mbmi = backup_mbmi; - } - - // build from bottom neighbors - xd->mb_to_bottom_edge = mb_to_bottom_edge_base; - xd->mb_to_top_edge -= mi_size_high[bsize] * MI_SIZE * 4; - - overlappable_offset = -1; - - for (i = 0; i < mi_nums; i += mi_step) { - int mi_row_offset = mi_size_high[bsize]; - int mi_col_offset = i; - int mi_x, mi_y, bw, bh; - int mi_row_shift = mi_size_high[bsize] >> 1; - MODE_INFO *bottom_mi; - MB_MODE_INFO *bottom_mbmi, backup_mbmi; - BLOCK_SIZE b_bsize; - - // create the original prediction if offset exceeds the boundary - if (mi_row + mi_row_offset > xd->sb_mi_bd.mi_row_end) mi_row_offset = 0; - - bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - bottom_mbmi = &bottom_mi->mbmi; - b_bsize = 
AOMMAX(bottom_mbmi->sb_type, BLOCK_8X8); - - mi_step = AOMMIN(mi_nums, mi_size_high[b_bsize]); - - // reset the mi if it is not overlappble - if (!is_neighbor_overlappable(bottom_mbmi)) { - int search_mi_step = mi_size_high[bottom_mbmi->sb_type]; - while (!is_neighbor_overlappable(bottom_mbmi)) { - mi_col_offset += search_mi_step; - if (mi_col_offset < mi_nums) { - bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - bottom_mbmi = &bottom_mi->mbmi; - search_mi_step = mi_size_high[bottom_mbmi->sb_type]; - } else { - if (overlappable_offset >= 0) { - mi_col_offset = overlappable_offset; - } else { - mi_col_offset = 0; - mi_row_offset = 0; - } - bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - bottom_mbmi = &bottom_mi->mbmi; - break; - } - } - } else { - // update the available overlappable mi - overlappable_offset = mi_col_offset; - } - - backup_mbmi = *bottom_mbmi; - modify_neighbor_predictor_for_obmc(bottom_mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, b_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE, - dst_stride[j], mi_row_shift, i, NULL, pd->subsampling_x, - pd->subsampling_y); - } -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(bottom_mbmi->mode)); - ++ref) { - const MV_REFERENCE_FRAME frame = has_second_ref(bottom_mbmi) - ? 
bottom_mbmi->ref_frame[ref] - : bottom_mbmi->ref_frame[0]; -#else // !(CONFIG_COMPOUND_SINGLEREF) - for (ref = 0; ref < 1 + has_second_ref(bottom_mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = bottom_mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + mi_row_shift, - mi_col + i, &ref_buf->sf); - } - - xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8); - xd->mb_to_right_edge = - mb_to_right_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8; - mi_x = (mi_col + i) << MI_SIZE_LOG2; - mi_y = (mi_row + mi_row_shift) << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - - bh = mi_size_high[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x; - bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y; - - build_inter_predictors(cm, xd, j, bottom_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - - *bottom_mbmi = backup_mbmi; - } - // restore the boundaries - xd->mb_to_top_edge = mb_to_top_edge_base; - xd->mb_to_bottom_edge = mb_to_bottom_edge_base; - xd->mb_to_left_edge = mb_to_left_edge_base; - xd->mb_to_right_edge = mb_to_right_edge_base; -} - -void get_pred_by_corner_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd, - int bsize, int mi_row, int mi_col, - uint8_t *dst_buf[MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]) { - const TileInfo *const tile = &xd->tile; - const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge; - const int mb_to_top_edge_base = xd->mb_to_top_edge; - const int mb_to_left_edge_base = xd->mb_to_left_edge; - const int mb_to_right_edge_base = xd->mb_to_right_edge; - const int mi_wide = mi_size_wide[bsize]; - const int 
mi_high = mi_size_high[bsize]; - - // location of four mi sources - const int mi_row_offsets[4] = { -1, -1, mi_high, mi_high }; - const int mi_col_offsets[4] = { -1, mi_wide, -1, mi_wide }; - - MB_MODE_INFO backup_mbmi; - int mi_x, mi_y, bh, bw; - int i, j, ref; - - assert(bsize >= BLOCK_8X8); - - for (i = 0; i < 4; ++i) { - int mi_row_offset = mi_row_offsets[i]; - int mi_col_offset = mi_col_offsets[i]; - MODE_INFO *corner_mi; - MB_MODE_INFO *corner_mbmi; - - if (mi_col + mi_col_offset < tile->mi_col_start || - mi_col + mi_col_offset > xd->sb_mi_bd.mi_col_end) - mi_col_offset = 0; - - if (mi_row + mi_row_offset < tile->mi_row_start || - mi_row + mi_row_offset > xd->sb_mi_bd.mi_row_end) - mi_row_offset = 0; - - corner_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - corner_mbmi = &corner_mi->mbmi; - - // reset the mi if it is not overlappble - if (!is_neighbor_overlappable(corner_mbmi)) { - mi_row_offset = 0; - mi_col_offset = 0; - corner_mi = xd->mi[0]; - corner_mbmi = &corner_mi->mbmi; - } - - backup_mbmi = *corner_mbmi; - modify_neighbor_predictor_for_obmc(corner_mbmi); - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *const pd = &xd->plane[j]; - setup_pred_plane(&pd->dst, BLOCK_8X8, dst_buf[j], MAX_SB_SIZE, - MAX_SB_SIZE, dst_stride[j], (i / 2) * (mi_high >> 1), - (i % 2) * (mi_wide >> 1), NULL, pd->subsampling_x, - pd->subsampling_y); - } - -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(corner_mbmi->mode)); - ++ref) { - const MV_REFERENCE_FRAME frame = has_second_ref(corner_mbmi) - ? 
corner_mbmi->ref_frame[ref] - : corner_mbmi->ref_frame[0]; -#else - for (ref = 0; ref < 1 + has_second_ref(corner_mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = corner_mbmi->ref_frame[ref]; -#endif - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - xd->block_refs[ref] = ref_buf; - - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, - mi_row + (i / 2) * (mi_high >> 1), - mi_col + (i % 2) * (mi_wide >> 1), &ref_buf->sf); - } - // adjust mi boundaries of this block - xd->mb_to_bottom_edge = - mb_to_bottom_edge_base + (1 - (i / 2)) * mi_high * MI_SIZE * 4; - xd->mb_to_top_edge = mb_to_top_edge_base - (i / 2) * mi_high * MI_SIZE * 4; - xd->mb_to_right_edge = - mb_to_right_edge_base + (1 - (i % 2)) * mi_wide * MI_SIZE * 4; - xd->mb_to_left_edge = - mb_to_left_edge_base - (i % 2) * mi_wide * MI_SIZE * 4; - - mi_x = (mi_col + (i % 2) * mi_wide / 2) << MI_SIZE_LOG2; - mi_y = (mi_row + (i / 2) * mi_high / 2) << MI_SIZE_LOG2; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - bh = mi_high << MI_SIZE_LOG2 >> (pd->subsampling_x + 1); - bw = mi_wide << MI_SIZE_LOG2 >> (pd->subsampling_y + 1); - build_inter_predictors(cm, xd, j, corner_mi, 1, 0, bw, bh, 0, 0, bw, bh, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); - } - *corner_mbmi = backup_mbmi; - } - // restore the boundaries - xd->mb_to_bottom_edge = mb_to_bottom_edge_base; - xd->mb_to_top_edge = mb_to_top_edge_base; - xd->mb_to_right_edge = mb_to_right_edge_base; - xd->mb_to_left_edge = mb_to_left_edge_base; -} - -// get the stitched extra prediction for this block -void av1_get_ext_blk_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[][MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]) { - get_pred_by_corner_neighbor(cm, xd, bsize, mi_row, 
mi_col, dst_buf[0], - dst_stride); - get_pred_by_vert_neighbor(cm, xd, bsize, mi_row, mi_col, dst_buf[1], - dst_stride); - get_pred_by_horz_neighbor(cm, xd, bsize, mi_row, mi_col, dst_buf[2], - dst_stride); -} - -void av1_get_ori_blk_pred(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]) { - MODE_INFO *const mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - int mi_x = mi_col << MI_SIZE_LOG2; - int mi_y = mi_row << MI_SIZE_LOG2; - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - int i, ref; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblockd_plane *const pd = &xd->plane[i]; - setup_pred_plane(&pd->dst, BLOCK_8X8, dst_buf[i], MAX_SB_SIZE, MAX_SB_SIZE, - dst_stride[i], 0, 0, NULL, pd->subsampling_x, - pd->subsampling_y); - } - - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - xd->block_refs[ref] = ref_buf; - - if (!av1_is_valid_scale(&ref_buf->sf)) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); - } - - for (i = 0; i < MAX_MB_PLANE; ++i) { - const struct macroblockd_plane *pd = &xd->plane[i]; - build_inter_predictors(cm, xd, i, mi, 1, 0, bw >> pd->subsampling_x, - bh >> pd->subsampling_y, 0, 0, - bw >> pd->subsampling_x, bh >> pd->subsampling_y, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - mi_x, mi_y); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], 
ext_dst_stride1[plane]); } } - -#endif diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h index 0c3333339..aa3aefc88 100644 --- a/third_party/aom/av1/common/reconinter.h +++ b/third_party/aom/av1/common/reconinter.h @@ -15,164 +15,26 @@ #include "av1/common/filter.h" #include "av1/common/onyxc_int.h" #include "av1/common/convolve.h" -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION #include "aom/aom_integer.h" -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#define WARP_WM_NEIGHBORS_WITH_OBMC 0 -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - -#if CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION -#define WARP_GM_NEIGHBORS_WITH_OBMC 0 -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +// Work out how many pixels off the edge of a reference frame we're allowed +// to go when forming an inter prediction. +// The outermost row/col of each referernce frame is extended by +// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep +// at least AOM_INTERP_EXTEND pixels within that to account for filtering. +// +// We have to break this up into two macros to keep both clang-format and +// tools/lint-hunks.py happy. 
+#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \ + ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND) +#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \ + (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS) #ifdef __cplusplus extern "C" { #endif -static INLINE int has_scale(int xs, int ys) { - return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; -} - -static INLINE void inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int subpel_x, - int subpel_y, const struct scale_factors *sf, - int w, int h, ConvolveParams *conv_params, - InterpFilters interp_filters, int xs, - int ys) { - assert(conv_params->do_average == 0 || conv_params->do_average == 1); - assert(sf); - if (has_scale(xs, ys)) { - // TODO(afergs, debargha): Use a different scale convolve function - // that uses higher precision for subpel_x, subpel_y, xs, ys - if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND - av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, 1, - conv_params); - conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND - } else { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - av1_convolve_scale(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x, xs, subpel_y, ys, conv_params); - } - } else { - subpel_x >>= SCALE_EXTRA_BITS; - subpel_y >>= SCALE_EXTRA_BITS; - xs >>= SCALE_EXTRA_BITS; - ys >>= SCALE_EXTRA_BITS; - assert(subpel_x < SUBPEL_SHIFTS); - assert(subpel_y < SUBPEL_SHIFTS); - assert(xs <= SUBPEL_SHIFTS); - assert(ys <= SUBPEL_SHIFTS); - if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND - av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, 0, - conv_params); - conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND - } else { - assert(conv_params->round == 
CONVOLVE_OPT_ROUND); - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (w <= 2 || h <= 2) { - av1_convolve_c(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x, xs, subpel_y, ys, conv_params); - } else if (filter_params_x.taps == SUBPEL_TAPS && - filter_params_y.taps == SUBPEL_TAPS) { - const int16_t *kernel_x = - av1_get_interp_filter_subpel_kernel(filter_params_x, subpel_x); - const int16_t *kernel_y = - av1_get_interp_filter_subpel_kernel(filter_params_y, subpel_y); - sf->predict[subpel_x != 0][subpel_y != 0][conv_params->do_average]( - src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h); - } else { - av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x, xs, subpel_y, ys, conv_params); - } - } - } -} - -#if CONFIG_HIGHBITDEPTH -static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - int subpel_x, int subpel_y, - const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, - InterpFilters interp_filters, int xs, - int ys, int bd) { - const int avg = conv_params->do_average; - assert(avg == 0 || avg == 1); - - if (has_scale(xs, ys)) { - if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, - 1, conv_params, bd); - conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND - } else { - av1_highbd_convolve_scale(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, avg, - bd); - } - } else { - subpel_x >>= SCALE_EXTRA_BITS; - subpel_y >>= SCALE_EXTRA_BITS; - xs >>= SCALE_EXTRA_BITS; - ys >>= SCALE_EXTRA_BITS; - assert(subpel_x < SUBPEL_SHIFTS); - assert(subpel_y < SUBPEL_SHIFTS); - assert(xs <= SUBPEL_SHIFTS); - assert(ys <= 
SUBPEL_SHIFTS); - if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, - 0, conv_params, bd); - conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND - } else { - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (filter_params_x.taps == SUBPEL_TAPS && - filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) { - const int16_t *kernel_x = - av1_get_interp_filter_subpel_kernel(filter_params_x, subpel_x); - const int16_t *kernel_y = - av1_get_interp_filter_subpel_kernel(filter_params_y, subpel_y); - sf->highbd_predict[subpel_x != 0][subpel_y != 0][avg]( - src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h, - bd); - } else { - av1_highbd_convolve(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_x, xs, subpel_y, ys, avg, - bd); - } - } - } -} -#endif // CONFIG_HIGHBITDEPTH - // Set to (1 << 5) if the 32-ary codebooks are used for any bock size #define MAX_WEDGE_TYPES (1 << 4) @@ -208,38 +70,108 @@ typedef struct { int bits; const wedge_code_type *codebook; uint8_t *signflip; - int smoother; wedge_masks_type *masks; } wedge_params_type; extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL]; +typedef struct SubpelParams { + int xs; + int ys; + int subpel_x; + int subpel_y; +} SubpelParams; + +struct build_prediction_ctxt { + const AV1_COMMON *cm; + int mi_row; + int mi_col; + uint8_t **tmp_buf; + int *tmp_width; + int *tmp_height; + int *tmp_stride; + int mb_to_far_edge; +}; + +static INLINE int has_scale(int xs, int ys) { + return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; +} + +static INLINE void revert_scale_extra_bits(SubpelParams *sp) { + sp->subpel_x >>= SCALE_EXTRA_BITS; + sp->subpel_y >>= SCALE_EXTRA_BITS; + sp->xs >>= 
SCALE_EXTRA_BITS; + sp->ys >>= SCALE_EXTRA_BITS; + assert(sp->subpel_x < SUBPEL_SHIFTS); + assert(sp->subpel_y < SUBPEL_SHIFTS); + assert(sp->xs <= SUBPEL_SHIFTS); + assert(sp->ys <= SUBPEL_SHIFTS); +} + +static INLINE void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + if (has_scale(subpel_params->xs, subpel_params->ys)) { + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, + sp.ys, 0, conv_params, sf); + } +} + +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + if (has_scale(subpel_params->xs, subpel_params->ys)) { + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, bd); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, sf, bd); + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); +int 
av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir); + static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, BLOCK_SIZE sb_type) { - (void)sb_type; + const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - case COMPOUND_AVERAGE: return sb_type >= BLOCK_4X4; -#else // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - case COMPOUND_AVERAGE: return 1; -#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 -#if CONFIG_WEDGE - case COMPOUND_WEDGE: return wedge_params_lookup[sb_type].bits > 0; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - return AOMMIN(block_size_wide[sb_type], block_size_high[sb_type]) >= 8; -#endif // CONFIG_COMPOUND_SEGMENT + case COMPOUND_AVERAGE: + case COMPOUND_DIFFWTD: return comp_allowed; + case COMPOUND_WEDGE: + return comp_allowed && wedge_params_lookup[sb_type].bits > 0; default: assert(0); return 0; } } static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { COMPOUND_TYPE comp_type; -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - if (sb_type < BLOCK_4X4) return 0; -#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) { + int i; + if (!is_comp_ref_allowed(sb_type)) return 0; + for (i = 0; i < COMPOUND_TYPES; i++) { + comp_type = (COMPOUND_TYPE)i; if (is_masked_compound_type(comp_type) && is_interinter_compound_used(comp_type, sb_type)) return 1; @@ -257,7 +189,6 @@ static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) { } static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) { - (void)sb_type; return wedge_params_lookup[sb_type].bits > 0; } @@ -265,60 +196,22 @@ static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) { return wedge_params_lookup[sb_type].bits; } -#if CONFIG_COMPOUND_SEGMENT -void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const 
uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w); -#if CONFIG_HIGHBITDEPTH -void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - BLOCK_SIZE sb_type, int h, int w, int bd); -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_COMPOUND_SEGMENT +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const SubpelParams *subpel_params, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + const MB_MODE_INFO *mi, int build_for_obmc, + const MACROBLOCKD *xd, int can_use_previous); void av1_make_masked_inter_predictor( const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const int subpel_x, const int subpel_y, const struct scale_factors *sf, - int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters, - int xs, int ys, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - int plane, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MACROBLOCKD *xd); - -static INLINE int round_mv_comp_q4(int value) { - return (value < 0 ? 
value - 2 : value + 2) / 4; -} - -static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { - MV res = { - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row + - mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row), - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col + - mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col) - }; - return res; -} - -static INLINE int round_mv_comp_q2(int value) { - return (value < 0 ? value - 1 : value + 1) / 2; -} - -static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) { - MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row + - mi->bmi[block1].as_mv[idx].as_mv.row), - round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col + - mi->bmi[block1].as_mv[idx].as_mv.col) }; - return res; -} + MACROBLOCKD *xd, int can_use_previous); // TODO(jkoleszar): yet another mv clamping function :-( static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, @@ -331,8 +224,8 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const int spel_right = spel_left - SUBPEL_SHIFTS; const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; const int spel_bottom = spel_top - SUBPEL_SHIFTS; - MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)), - src_mv->col * (1 << (1 - ss_x)) }; + MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), + (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; assert(ss_x <= 1); assert(ss_y <= 1); @@ -344,20 +237,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, return clamped_mv; } -static INLINE MV average_split_mvs(const struct macroblockd_plane *pd, - const MODE_INFO *mi, int ref, int block) { - const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0); - MV res = { 0, 0 }; - switch (ss_idx) { - case 0: res = mi->bmi[block].as_mv[ref].as_mv; break; - case 1: res = mi_mv_pred_q2(mi, ref, block, block + 2); break; - case 2: res = 
mi_mv_pred_q2(mi, ref, block, block + 1); break; - case 3: res = mi_mv_pred_q4(mi, ref); break; - default: assert(ss_idx <= 3 && ss_idx >= 0); - } - return res; -} - void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BUFFER_SET *ctx, BLOCK_SIZE bsize); @@ -370,48 +249,22 @@ void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BUFFER_SET *ctx, BLOCK_SIZE bsize); -#if CONFIG_SUPERTX -void av1_build_inter_predictor_sb_sub8x8_extend(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_row_ori, - int mi_col_ori, int mi_row, - int mi_col, int plane, - BLOCK_SIZE bsize, int block); - -void av1_build_inter_predictor_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int plane, - BLOCK_SIZE bsize); -struct macroblockd_plane; -void av1_build_masked_inter_predictor_complex( - MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre, - int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori, - BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition, - int plane); -#endif // CONFIG_SUPERTX - void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane, int ref, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd); + const MACROBLOCKD *xd, int can_use_previous); -#if CONFIG_HIGHBITDEPTH void av1_highbd_build_inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, - InterpFilters interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - const 
WarpTypesAllowed *warp_types, int p_col, int p_row, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd); -#endif + InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous); static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { @@ -427,15 +280,11 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, int stride, int mi_row, int mi_col, const struct scale_factors *scale, int subsampling_x, int subsampling_y) { -#if CONFIG_CHROMA_SUB8X8 // Offset the buffer pointer if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) mi_row -= 1; if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) mi_col -= 1; -#else - (void)bsize; -#endif const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; @@ -447,62 +296,33 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, } void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, - const YV12_BUFFER_CONFIG *src, int mi_row, - int mi_col); + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end); void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, - const struct scale_factors *sf); + const struct scale_factors *sf, const int num_planes); // Detect if the block have sub-pixel level motion vectors // per component. 
#define CHECK_SUBPEL 0 -static INLINE int has_subpel_mv_component(const MODE_INFO *const mi, +static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi, const MACROBLOCKD *const xd, int dir) { #if CHECK_SUBPEL - const MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; int plane; int ref = (dir >> 1); -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - if (bsize >= BLOCK_8X8 || unify_bsize) { - if (dir & 0x01) { - if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; - } else { - if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; - } + if (dir & 0x01) { + if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; } else { - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const PARTITION_TYPE bp = BLOCK_8X8 - bsize; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int have_vsplit = bp != PARTITION_HORZ; - const int have_hsplit = bp != PARTITION_VERT; - const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x); - const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y); - - int x, y; - for (y = 0; y < num_4x4_h; ++y) { - for (x = 0; x < num_4x4_w; ++x) { - const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x); - if (dir & 0x01) { - if (mv.col & SUBPEL_MASK) return 1; - } else { - if (mv.row & SUBPEL_MASK) return 1; - } - } - } - } + if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; } return 0; #else - (void)mi; + (void)mbmi; (void)xd; (void)dir; return 1; @@ -516,20 +336,16 @@ static INLINE void set_default_interp_filters( } static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { - (void)xd; -#if CONFIG_WARPED_MOTION - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (mbmi->skip_mode) return 0; if (mbmi->motion_mode == WARPED_CAUSAL) return 0; -#endif // CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION - if (is_nontrans_global_motion(xd)) return 0; -#endif // CONFIG_GLOBAL_MOTION + if 
(is_nontrans_global_motion(xd, xd->mi[0])) return 0; return 1; } static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { - MODE_INFO *const mi = xd->mi[0]; - const int is_compound = has_second_ref(&mi->mbmi); + MB_MODE_INFO *const mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); int ref; for (ref = 0; ref < 1 + is_compound; ++ref) { int row_col; @@ -542,17 +358,15 @@ static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { } return 0; } - -#if CONFIG_MOTION_VAR -const uint8_t *av1_get_obmc_mask(int length); -void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); -void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *above[MAX_MB_PLANE], - int above_stride[MAX_MB_PLANE], - uint8_t *left[MAX_MB_PLANE], - int left_stride[MAX_MB_PLANE]); +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, uint8_t *tmp_buf[MAX_MB_PLANE], @@ -565,13 +379,18 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]); +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]); + +const uint8_t *av1_get_obmc_mask(int length); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, 
int mi_col); void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col); -#if CONFIG_NCOBMC -void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); -#endif -#endif // CONFIG_MOTION_VAR #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) @@ -584,32 +403,24 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; } -const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign, - BLOCK_SIZE sb_type, int wedge_offset_x, - int wedge_offset_y); - -const uint8_t *av1_get_compound_type_mask_inverse( - const INTERINTER_COMPOUND_DATA *const comp_data, -#if CONFIG_COMPOUND_SEGMENT - uint8_t *mask_buffer, int h, int w, int stride, -#endif - BLOCK_SIZE sb_type); - const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); -#if CONFIG_INTERINTRA + void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *ypred, uint8_t *upred, uint8_t *vpred, int ystride, int ustride, int vstride, BUFFER_SET *ctx, BLOCK_SIZE bsize); + void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *ypred, int ystride, BUFFER_SET *ctx, BLOCK_SIZE bsize); + void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, int ustride, BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize); + void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, uint8_t *vpred, int ustride, int vstride, @@ -621,57 +432,27 @@ void av1_build_intra_predictors_for_interintra( void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -#endif // CONFIG_INTERINTRA + // Encoder only void 
av1_build_inter_predictors_for_planes_single_buf( MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]); -void av1_build_wedge_inter_predictor_from_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, -#if CONFIG_SUPERTX - int wedge_offset_x, int wedge_offset_y, -#endif // CONFIG_SUPERTX - uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3], - int ext_dst_stride1[3]); - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#define ASSIGN_ALIGNED_PTRS(p, a, s) \ - p[0] = a; \ - p[1] = a + s; \ - p[2] = a + 2 * s; - -#define ASSIGN_ALIGNED_PTRS_HBD(p, a, s, l) \ - p[0] = CONVERT_TO_BYTEPTR(a); \ - p[1] = CONVERT_TO_BYTEPTR(a + s * l); \ - p[2] = CONVERT_TO_BYTEPTR(a + 2 * s * l); - -void alloc_ncobmc_pred_buffer(MACROBLOCKD *const xd); -void free_ncobmc_pred_buffer(MACROBLOCKD *const xd); -void set_sb_mi_boundaries(const AV1_COMMON *const cm, MACROBLOCKD *const xd, - const int mi_row, const int mi_col); - -void reset_xd_boundary(MACROBLOCKD *xd, int mi_row, int bh, int mi_col, int bw, - int mi_rows, int mi_cols); - -void get_pred_from_intrpl_buf(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize, int plane); - -void build_ncobmc_intrpl_pred(const AV1_COMMON *const cm, MACROBLOCKD *xd, - int plane, int pxl_row, int pxl_col, - BLOCK_SIZE bsize, uint8_t *preds[][MAX_MB_PLANE], - int ps[MAX_MB_PLANE], // pred buffer strides - int mode); - -void av1_get_ext_blk_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[][MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]); - -void av1_get_ori_blk_pred(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize, - int mi_row, int mi_col, - uint8_t *dst_buf[MAX_MB_PLANE], - int dst_stride[MAX_MB_PLANE]); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous); +void 
av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]); + +void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, + int order_idx, int *fwd_offset, int *bck_offset, + int *use_jnt_comp_avg, int is_compound); +int av1_allow_warp(const MB_MODE_INFO *const mbmi, + const WarpTypesAllowed *const warp_types, + const WarpedMotionParams *const gm_params, + int build_for_obmc, int x_scale, int y_scale, + WarpedMotionParams *final_warp_params); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c index c6d57b742..21d1f60b2 100644 --- a/third_party/aom/av1/common/reconintra.c +++ b/third_party/aom/av1/common/reconintra.c @@ -11,22 +11,18 @@ #include -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_ports/system_state.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#if CONFIG_HIGHBITDEPTH #include "aom_dsp/aom_dsp_common.h" -#endif // CONFIG_HIGHBITDEPTH #include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" #include "aom_ports/aom_once.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" #include "av1/common/reconintra.h" #include "av1/common/onyxc_int.h" -#if CONFIG_CFL #include "av1/common/cfl.h" -#endif enum { NEED_LEFT = 1 << 1, @@ -36,17 +32,9 @@ enum { NEED_BOTTOMLEFT = 1 << 5, }; -#if CONFIG_INTRA_EDGE #define INTRA_EDGE_FILT 3 #define INTRA_EDGE_TAPS 5 -#if CONFIG_INTRA_EDGE_UPSAMPLE -#define MAX_UPSAMPLE_SZ 12 -#endif // CONFIG_INTRA_EDGE_UPSAMPLE -#endif // CONFIG_INTRA_EDGE - -#define INTRA_USES_EXT_TRANSFORMS 1 -#define INTRA_USES_RECT_TRANSFORMS \ - (CONFIG_RECT_TX && (CONFIG_VAR_TX || CONFIG_EXT_TX)) +#define MAX_UPSAMPLE_SZ 16 static const uint8_t extend_modes[INTRA_MODES] = { NEED_ABOVE | 
NEED_LEFT, // DC @@ -54,515 +42,187 @@ static const uint8_t extend_modes[INTRA_MODES] = { NEED_LEFT, // H NEED_ABOVE | NEED_ABOVERIGHT, // D45 NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135 - NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D117 - NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D153 - NEED_LEFT | NEED_BOTTOMLEFT, // D207 - NEED_ABOVE | NEED_ABOVERIGHT, // D63 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157 + NEED_LEFT | NEED_BOTTOMLEFT, // D203 + NEED_ABOVE | NEED_ABOVERIGHT, // D67 NEED_LEFT | NEED_ABOVE, // SMOOTH -#if CONFIG_SMOOTH_HV NEED_LEFT | NEED_ABOVE, // SMOOTH_V NEED_LEFT | NEED_ABOVE, // SMOOTH_H -#endif // CONFIG_SMOOTH_HV - NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // TM + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH }; -static const uint16_t orders_128x128[1] = { 0 }; -static const uint16_t orders_128x64[2] = { 0, 1 }; -static const uint16_t orders_64x128[2] = { 0, 1 }; -static const uint16_t orders_64x64[4] = { - 0, 1, 2, 3, -}; -static const uint16_t orders_64x32[8] = { - 0, 2, 1, 3, 4, 6, 5, 7, +// Tables to store if the top-right reference pixels are available. The flags +// are represented with bits, packed into 8-bit integers. E.g., for the 32x32 +// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster +// order), so its flag is stored at the 3rd bit of the 2nd entry in the table, +// i.e. (table[10 / 8] >> (10 % 8)) & 1. +// . . . . +// . . . . +// . . o . +// . . . . 
+static uint8_t has_tr_4x4[128] = { + 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, }; -static const uint16_t orders_32x64[8] = { - 0, 1, 2, 3, 4, 5, 6, 7, +static uint8_t has_tr_4x8[64] = { + 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, + 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, + 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, + 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, + 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, }; -static const uint16_t orders_32x32[16] = { - 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, +static uint8_t has_tr_8x4[64] = { + 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, }; -static const uint16_t orders_32x16[32] = { - 0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15, - 16, 18, 24, 26, 17, 19, 25, 27, 20, 22, 28, 30, 21, 23, 29, 31, +static uint8_t has_tr_8x8[32] = { + 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, + 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, }; -static const uint16_t orders_16x32[32] = { - 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, - 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31, +static uint8_t 
has_tr_8x16[16] = { + 255, 255, 119, 119, 127, 127, 119, 119, + 255, 127, 119, 119, 127, 127, 119, 119, }; -static const uint16_t orders_16x16[64] = { - 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23, - 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31, - 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55, - 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63, +static uint8_t has_tr_16x8[16] = { + 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0, }; - -static const uint16_t orders_64x16[16] = { - 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, +static uint8_t has_tr_16x16[8] = { + 255, 85, 119, 85, 127, 85, 119, 85, }; -static const uint16_t orders_16x64[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 }; +static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 }; +static uint8_t has_tr_32x32[2] = { 95, 87 }; +static uint8_t has_tr_32x64[1] = { 127 }; +static uint8_t has_tr_64x32[1] = { 19 }; +static uint8_t has_tr_64x64[1] = { 7 }; +static uint8_t has_tr_64x128[1] = { 3 }; +static uint8_t has_tr_128x64[1] = { 1 }; +static uint8_t has_tr_128x128[1] = { 1 }; +static uint8_t has_tr_4x16[32] = { + 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, + 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, + 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, }; -static const uint16_t orders_32x8[64] = { - 0, 4, 16, 20, 1, 5, 17, 21, 2, 6, 18, 22, 3, 7, 19, 23, - 8, 12, 24, 28, 9, 13, 25, 29, 10, 14, 26, 30, 11, 15, 27, 31, - 32, 36, 48, 52, 33, 37, 49, 53, 34, 38, 50, 54, 35, 39, 51, 55, - 40, 44, 56, 60, 41, 45, 57, 61, 42, 46, 58, 62, 43, 47, 59, 63, +static uint8_t has_tr_16x4[32] = { + 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, + 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, }; -static const uint16_t orders_8x32[64] = { - 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, - 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 
29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55, - 40, 41, 42, 43, 44, 45, 46, 47, 56, 57, 58, 59, 60, 61, 62, 63, +static uint8_t has_tr_8x32[8] = { + 255, 255, 127, 127, 255, 127, 127, 127, }; - -#if CONFIG_EXT_PARTITION -static const uint16_t orders_16x4[256] = { - 0, 4, 16, 20, 64, 68, 80, 84, 1, 5, 17, 21, 65, 69, 81, - 85, 2, 6, 18, 22, 66, 70, 82, 86, 3, 7, 19, 23, 67, 71, - 83, 87, 8, 12, 24, 28, 72, 76, 88, 92, 9, 13, 25, 29, 73, - 77, 89, 93, 10, 14, 26, 30, 74, 78, 90, 94, 11, 15, 27, 31, - 75, 79, 91, 95, 32, 36, 48, 52, 96, 100, 112, 116, 33, 37, 49, - 53, 97, 101, 113, 117, 34, 38, 50, 54, 98, 102, 114, 118, 35, 39, - 51, 55, 99, 103, 115, 119, 40, 44, 56, 60, 104, 108, 120, 124, 41, - 45, 57, 61, 105, 109, 121, 125, 42, 46, 58, 62, 106, 110, 122, 126, - 43, 47, 59, 63, 107, 111, 123, 127, 128, 132, 144, 148, 192, 196, 208, - 212, 129, 133, 145, 149, 193, 197, 209, 213, 130, 134, 146, 150, 194, 198, - 210, 214, 131, 135, 147, 151, 195, 199, 211, 215, 136, 140, 152, 156, 200, - 204, 216, 220, 137, 141, 153, 157, 201, 205, 217, 221, 138, 142, 154, 158, - 202, 206, 218, 222, 139, 143, 155, 159, 203, 207, 219, 223, 160, 164, 176, - 180, 224, 228, 240, 244, 161, 165, 177, 181, 225, 229, 241, 245, 162, 166, - 178, 182, 226, 230, 242, 246, 163, 167, 179, 183, 227, 231, 243, 247, 168, - 172, 184, 188, 232, 236, 248, 252, 169, 173, 185, 189, 233, 237, 249, 253, - 170, 174, 186, 190, 234, 238, 250, 254, 171, 175, 187, 191, 235, 239, 251, - 255, +static uint8_t has_tr_32x8[8] = { + 15, 0, 5, 0, 7, 0, 5, 0, }; -static const uint16_t orders_4x16[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, - 23, 64, 65, 66, 67, 68, 69, 70, 71, 80, 81, 82, 83, 84, 85, - 86, 87, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, - 29, 30, 31, 72, 73, 74, 75, 76, 77, 78, 79, 88, 89, 90, 91, - 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, - 51, 52, 53, 54, 55, 96, 97, 98, 99, 100, 101, 102, 103, 112, 113, - 114, 115, 116, 
117, 118, 119, 40, 41, 42, 43, 44, 45, 46, 47, 56, - 57, 58, 59, 60, 61, 62, 63, 104, 105, 106, 107, 108, 109, 110, 111, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 144, 145, 146, 147, 148, 149, 150, 151, 192, 193, 194, 195, 196, 197, - 198, 199, 208, 209, 210, 211, 212, 213, 214, 215, 136, 137, 138, 139, 140, - 141, 142, 143, 152, 153, 154, 155, 156, 157, 158, 159, 200, 201, 202, 203, - 204, 205, 206, 207, 216, 217, 218, 219, 220, 221, 222, 223, 160, 161, 162, - 163, 164, 165, 166, 167, 176, 177, 178, 179, 180, 181, 182, 183, 224, 225, - 226, 227, 228, 229, 230, 231, 240, 241, 242, 243, 244, 245, 246, 247, 168, - 169, 170, 171, 172, 173, 174, 175, 184, 185, 186, 187, 188, 189, 190, 191, - 232, 233, 234, 235, 236, 237, 238, 239, 248, 249, 250, 251, 252, 253, 254, - 255, +static uint8_t has_tr_16x64[2] = { 255, 127 }; +static uint8_t has_tr_64x16[2] = { 3, 1 }; + +static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_tr_4x4, + // 4X8, 8X4, 8X8 + has_tr_4x8, has_tr_8x4, has_tr_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, has_tr_16x8, has_tr_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, has_tr_32x16, has_tr_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, has_tr_64x32, has_tr_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, has_tr_128x64, has_tr_128x128, + // 4x16, 16x4, 8x32 + has_tr_4x16, has_tr_16x4, has_tr_8x32, + // 32x8, 16x64, 64x16 + has_tr_32x8, has_tr_16x64, has_tr_64x16 }; -#endif -static const uint16_t orders_32x128[4] = { - 0, 1, 2, 3, -}; -static const uint16_t orders_128x32[4] = { - 0, 1, 2, 3, -}; - -#if CONFIG_CB4X4 || CONFIG_EXT_PARTITION -static const uint16_t orders_16x8[128] = { - 0, 2, 8, 10, 32, 34, 40, 42, 1, 3, 9, 11, 33, 35, 41, 43, - 4, 6, 12, 14, 36, 38, 44, 46, 5, 7, 13, 15, 37, 39, 45, 47, - 16, 18, 24, 26, 48, 50, 56, 58, 17, 19, 25, 27, 49, 51, 57, 59, - 20, 22, 28, 30, 52, 54, 60, 62, 21, 23, 29, 31, 53, 55, 61, 63, - 64, 66, 72, 74, 96, 98, 104, 106, 65, 67, 73, 75, 
97, 99, 105, 107, - 68, 70, 76, 78, 100, 102, 108, 110, 69, 71, 77, 79, 101, 103, 109, 111, - 80, 82, 88, 90, 112, 114, 120, 122, 81, 83, 89, 91, 113, 115, 121, 123, - 84, 86, 92, 94, 116, 118, 124, 126, 85, 87, 93, 95, 117, 119, 125, 127, -}; -static const uint16_t orders_8x16[128] = { - 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, 43, - 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, 45, 46, 47, - 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, 51, 56, 57, 58, 59, - 20, 21, 22, 23, 28, 29, 30, 31, 52, 53, 54, 55, 60, 61, 62, 63, - 64, 65, 66, 67, 72, 73, 74, 75, 96, 97, 98, 99, 104, 105, 106, 107, - 68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111, - 80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123, - 84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127, +static uint8_t has_tr_vert_8x8[32] = { + 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, + 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, }; -static const uint16_t orders_8x8[256] = { - 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, - 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, - 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, - 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, - 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, - 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99, - 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104, - 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63, - 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148, - 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147, - 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152, - 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143, - 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164, - 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 
244, 245, 162, 163, - 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168, - 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, - 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, - 255, +static uint8_t has_tr_vert_16x16[8] = { + 255, 0, 119, 0, 127, 0, 119, 0, }; - -#if CONFIG_CB4X4 && CONFIG_EXT_PARTITION -static const uint16_t orders_4x8[512] = { - 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, - 43, 128, 129, 130, 131, 136, 137, 138, 139, 160, 161, 162, 163, 168, 169, - 170, 171, 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, - 45, 46, 47, 132, 133, 134, 135, 140, 141, 142, 143, 164, 165, 166, 167, - 172, 173, 174, 175, 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, - 51, 56, 57, 58, 59, 144, 145, 146, 147, 152, 153, 154, 155, 176, 177, - 178, 179, 184, 185, 186, 187, 20, 21, 22, 23, 28, 29, 30, 31, 52, - 53, 54, 55, 60, 61, 62, 63, 148, 149, 150, 151, 156, 157, 158, 159, - 180, 181, 182, 183, 188, 189, 190, 191, 64, 65, 66, 67, 72, 73, 74, - 75, 96, 97, 98, 99, 104, 105, 106, 107, 192, 193, 194, 195, 200, 201, - 202, 203, 224, 225, 226, 227, 232, 233, 234, 235, 68, 69, 70, 71, 76, - 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111, 196, 197, 198, 199, - 204, 205, 206, 207, 228, 229, 230, 231, 236, 237, 238, 239, 80, 81, 82, - 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123, 208, 209, - 210, 211, 216, 217, 218, 219, 240, 241, 242, 243, 248, 249, 250, 251, 84, - 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127, - 212, 213, 214, 215, 220, 221, 222, 223, 244, 245, 246, 247, 252, 253, 254, - 255, 256, 257, 258, 259, 264, 265, 266, 267, 288, 289, 290, 291, 296, 297, - 298, 299, 384, 385, 386, 387, 392, 393, 394, 395, 416, 417, 418, 419, 424, - 425, 426, 427, 260, 261, 262, 263, 268, 269, 270, 271, 292, 293, 294, 295, - 300, 301, 302, 303, 388, 389, 390, 391, 396, 397, 398, 399, 420, 421, 422, - 423, 428, 429, 430, 431, 272, 273, 274, 275, 280, 281, 282, 283, 
304, 305, - 306, 307, 312, 313, 314, 315, 400, 401, 402, 403, 408, 409, 410, 411, 432, - 433, 434, 435, 440, 441, 442, 443, 276, 277, 278, 279, 284, 285, 286, 287, - 308, 309, 310, 311, 316, 317, 318, 319, 404, 405, 406, 407, 412, 413, 414, - 415, 436, 437, 438, 439, 444, 445, 446, 447, 320, 321, 322, 323, 328, 329, - 330, 331, 352, 353, 354, 355, 360, 361, 362, 363, 448, 449, 450, 451, 456, - 457, 458, 459, 480, 481, 482, 483, 488, 489, 490, 491, 324, 325, 326, 327, - 332, 333, 334, 335, 356, 357, 358, 359, 364, 365, 366, 367, 452, 453, 454, - 455, 460, 461, 462, 463, 484, 485, 486, 487, 492, 493, 494, 495, 336, 337, - 338, 339, 344, 345, 346, 347, 368, 369, 370, 371, 376, 377, 378, 379, 464, - 465, 466, 467, 472, 473, 474, 475, 496, 497, 498, 499, 504, 505, 506, 507, - 340, 341, 342, 343, 348, 349, 350, 351, 372, 373, 374, 375, 380, 381, 382, - 383, 468, 469, 470, 471, 476, 477, 478, 479, 500, 501, 502, 503, 508, 509, - 510, 511, -}; - -static const uint16_t orders_8x4[512] = { - 0, 2, 8, 10, 32, 34, 40, 42, 128, 130, 136, 138, 160, 162, 168, - 170, 1, 3, 9, 11, 33, 35, 41, 43, 129, 131, 137, 139, 161, 163, - 169, 171, 4, 6, 12, 14, 36, 38, 44, 46, 132, 134, 140, 142, 164, - 166, 172, 174, 5, 7, 13, 15, 37, 39, 45, 47, 133, 135, 141, 143, - 165, 167, 173, 175, 16, 18, 24, 26, 48, 50, 56, 58, 144, 146, 152, - 154, 176, 178, 184, 186, 17, 19, 25, 27, 49, 51, 57, 59, 145, 147, - 153, 155, 177, 179, 185, 187, 20, 22, 28, 30, 52, 54, 60, 62, 148, - 150, 156, 158, 180, 182, 188, 190, 21, 23, 29, 31, 53, 55, 61, 63, - 149, 151, 157, 159, 181, 183, 189, 191, 64, 66, 72, 74, 96, 98, 104, - 106, 192, 194, 200, 202, 224, 226, 232, 234, 65, 67, 73, 75, 97, 99, - 105, 107, 193, 195, 201, 203, 225, 227, 233, 235, 68, 70, 76, 78, 100, - 102, 108, 110, 196, 198, 204, 206, 228, 230, 236, 238, 69, 71, 77, 79, - 101, 103, 109, 111, 197, 199, 205, 207, 229, 231, 237, 239, 80, 82, 88, - 90, 112, 114, 120, 122, 208, 210, 216, 218, 240, 242, 248, 250, 81, 83, - 89, 91, 113, 115, 121, 
123, 209, 211, 217, 219, 241, 243, 249, 251, 84, - 86, 92, 94, 116, 118, 124, 126, 212, 214, 220, 222, 244, 246, 252, 254, - 85, 87, 93, 95, 117, 119, 125, 127, 213, 215, 221, 223, 245, 247, 253, - 255, 256, 258, 264, 266, 288, 290, 296, 298, 384, 386, 392, 394, 416, 418, - 424, 426, 257, 259, 265, 267, 289, 291, 297, 299, 385, 387, 393, 395, 417, - 419, 425, 427, 260, 262, 268, 270, 292, 294, 300, 302, 388, 390, 396, 398, - 420, 422, 428, 430, 261, 263, 269, 271, 293, 295, 301, 303, 389, 391, 397, - 399, 421, 423, 429, 431, 272, 274, 280, 282, 304, 306, 312, 314, 400, 402, - 408, 410, 432, 434, 440, 442, 273, 275, 281, 283, 305, 307, 313, 315, 401, - 403, 409, 411, 433, 435, 441, 443, 276, 278, 284, 286, 308, 310, 316, 318, - 404, 406, 412, 414, 436, 438, 444, 446, 277, 279, 285, 287, 309, 311, 317, - 319, 405, 407, 413, 415, 437, 439, 445, 447, 320, 322, 328, 330, 352, 354, - 360, 362, 448, 450, 456, 458, 480, 482, 488, 490, 321, 323, 329, 331, 353, - 355, 361, 363, 449, 451, 457, 459, 481, 483, 489, 491, 324, 326, 332, 334, - 356, 358, 364, 366, 452, 454, 460, 462, 484, 486, 492, 494, 325, 327, 333, - 335, 357, 359, 365, 367, 453, 455, 461, 463, 485, 487, 493, 495, 336, 338, - 344, 346, 368, 370, 376, 378, 464, 466, 472, 474, 496, 498, 504, 506, 337, - 339, 345, 347, 369, 371, 377, 379, 465, 467, 473, 475, 497, 499, 505, 507, - 340, 342, 348, 350, 372, 374, 380, 382, 468, 470, 476, 478, 500, 502, 508, - 510, 341, 343, 349, 351, 373, 375, 381, 383, 469, 471, 477, 479, 501, 503, - 509, 511, -}; - -static const uint16_t orders_4x4[1024] = { - 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, - 81, 84, 85, 256, 257, 260, 261, 272, 273, 276, 277, 320, 321, - 324, 325, 336, 337, 340, 341, 2, 3, 6, 7, 18, 19, 22, - 23, 66, 67, 70, 71, 82, 83, 86, 87, 258, 259, 262, 263, - 274, 275, 278, 279, 322, 323, 326, 327, 338, 339, 342, 343, 8, - 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, 89, - 92, 93, 264, 265, 268, 269, 280, 281, 284, 285, 328, 329, 332, - 333, 344, 345, 348, 
349, 10, 11, 14, 15, 26, 27, 30, 31, - 74, 75, 78, 79, 90, 91, 94, 95, 266, 267, 270, 271, 282, - 283, 286, 287, 330, 331, 334, 335, 346, 347, 350, 351, 32, 33, - 36, 37, 48, 49, 52, 53, 96, 97, 100, 101, 112, 113, 116, - 117, 288, 289, 292, 293, 304, 305, 308, 309, 352, 353, 356, 357, - 368, 369, 372, 373, 34, 35, 38, 39, 50, 51, 54, 55, 98, - 99, 102, 103, 114, 115, 118, 119, 290, 291, 294, 295, 306, 307, - 310, 311, 354, 355, 358, 359, 370, 371, 374, 375, 40, 41, 44, - 45, 56, 57, 60, 61, 104, 105, 108, 109, 120, 121, 124, 125, - 296, 297, 300, 301, 312, 313, 316, 317, 360, 361, 364, 365, 376, - 377, 380, 381, 42, 43, 46, 47, 58, 59, 62, 63, 106, 107, - 110, 111, 122, 123, 126, 127, 298, 299, 302, 303, 314, 315, 318, - 319, 362, 363, 366, 367, 378, 379, 382, 383, 128, 129, 132, 133, - 144, 145, 148, 149, 192, 193, 196, 197, 208, 209, 212, 213, 384, - 385, 388, 389, 400, 401, 404, 405, 448, 449, 452, 453, 464, 465, - 468, 469, 130, 131, 134, 135, 146, 147, 150, 151, 194, 195, 198, - 199, 210, 211, 214, 215, 386, 387, 390, 391, 402, 403, 406, 407, - 450, 451, 454, 455, 466, 467, 470, 471, 136, 137, 140, 141, 152, - 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 392, 393, - 396, 397, 408, 409, 412, 413, 456, 457, 460, 461, 472, 473, 476, - 477, 138, 139, 142, 143, 154, 155, 158, 159, 202, 203, 206, 207, - 218, 219, 222, 223, 394, 395, 398, 399, 410, 411, 414, 415, 458, - 459, 462, 463, 474, 475, 478, 479, 160, 161, 164, 165, 176, 177, - 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 416, 417, 420, - 421, 432, 433, 436, 437, 480, 481, 484, 485, 496, 497, 500, 501, - 162, 163, 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, - 243, 246, 247, 418, 419, 422, 423, 434, 435, 438, 439, 482, 483, - 486, 487, 498, 499, 502, 503, 168, 169, 172, 173, 184, 185, 188, - 189, 232, 233, 236, 237, 248, 249, 252, 253, 424, 425, 428, 429, - 440, 441, 444, 445, 488, 489, 492, 493, 504, 505, 508, 509, 170, - 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 
251, - 254, 255, 426, 427, 430, 431, 442, 443, 446, 447, 490, 491, 494, - 495, 506, 507, 510, 511, 512, 513, 516, 517, 528, 529, 532, 533, - 576, 577, 580, 581, 592, 593, 596, 597, 768, 769, 772, 773, 784, - 785, 788, 789, 832, 833, 836, 837, 848, 849, 852, 853, 514, 515, - 518, 519, 530, 531, 534, 535, 578, 579, 582, 583, 594, 595, 598, - 599, 770, 771, 774, 775, 786, 787, 790, 791, 834, 835, 838, 839, - 850, 851, 854, 855, 520, 521, 524, 525, 536, 537, 540, 541, 584, - 585, 588, 589, 600, 601, 604, 605, 776, 777, 780, 781, 792, 793, - 796, 797, 840, 841, 844, 845, 856, 857, 860, 861, 522, 523, 526, - 527, 538, 539, 542, 543, 586, 587, 590, 591, 602, 603, 606, 607, - 778, 779, 782, 783, 794, 795, 798, 799, 842, 843, 846, 847, 858, - 859, 862, 863, 544, 545, 548, 549, 560, 561, 564, 565, 608, 609, - 612, 613, 624, 625, 628, 629, 800, 801, 804, 805, 816, 817, 820, - 821, 864, 865, 868, 869, 880, 881, 884, 885, 546, 547, 550, 551, - 562, 563, 566, 567, 610, 611, 614, 615, 626, 627, 630, 631, 802, - 803, 806, 807, 818, 819, 822, 823, 866, 867, 870, 871, 882, 883, - 886, 887, 552, 553, 556, 557, 568, 569, 572, 573, 616, 617, 620, - 621, 632, 633, 636, 637, 808, 809, 812, 813, 824, 825, 828, 829, - 872, 873, 876, 877, 888, 889, 892, 893, 554, 555, 558, 559, 570, - 571, 574, 575, 618, 619, 622, 623, 634, 635, 638, 639, 810, 811, - 814, 815, 826, 827, 830, 831, 874, 875, 878, 879, 890, 891, 894, - 895, 640, 641, 644, 645, 656, 657, 660, 661, 704, 705, 708, 709, - 720, 721, 724, 725, 896, 897, 900, 901, 912, 913, 916, 917, 960, - 961, 964, 965, 976, 977, 980, 981, 642, 643, 646, 647, 658, 659, - 662, 663, 706, 707, 710, 711, 722, 723, 726, 727, 898, 899, 902, - 903, 914, 915, 918, 919, 962, 963, 966, 967, 978, 979, 982, 983, - 648, 649, 652, 653, 664, 665, 668, 669, 712, 713, 716, 717, 728, - 729, 732, 733, 904, 905, 908, 909, 920, 921, 924, 925, 968, 969, - 972, 973, 984, 985, 988, 989, 650, 651, 654, 655, 666, 667, 670, - 671, 714, 715, 718, 719, 730, 731, 734, 735, 906, 
907, 910, 911, - 922, 923, 926, 927, 970, 971, 974, 975, 986, 987, 990, 991, 672, - 673, 676, 677, 688, 689, 692, 693, 736, 737, 740, 741, 752, 753, - 756, 757, 928, 929, 932, 933, 944, 945, 948, 949, 992, 993, 996, - 997, 1008, 1009, 1012, 1013, 674, 675, 678, 679, 690, 691, 694, 695, - 738, 739, 742, 743, 754, 755, 758, 759, 930, 931, 934, 935, 946, - 947, 950, 951, 994, 995, 998, 999, 1010, 1011, 1014, 1015, 680, 681, - 684, 685, 696, 697, 700, 701, 744, 745, 748, 749, 760, 761, 764, - 765, 936, 937, 940, 941, 952, 953, 956, 957, 1000, 1001, 1004, 1005, - 1016, 1017, 1020, 1021, 682, 683, 686, 687, 698, 699, 702, 703, 746, - 747, 750, 751, 762, 763, 766, 767, 938, 939, 942, 943, 954, 955, - 958, 959, 1002, 1003, 1006, 1007, 1018, 1019, 1022, 1023, -}; -#endif -#endif // CONFIG_CB4X4 || CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION -/* clang-format off */ -static const uint16_t *const orders[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2 - orders_4x4, orders_4x4, orders_4x4, -#endif - // 4X4 - orders_4x4, - // 4X8, 8X4, 8X8 - orders_4x8, orders_8x4, orders_8x8, -#else // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 4X4 - orders_8x8, - // 4X8, 8X4, 8X8 - orders_8x8, orders_8x8, orders_8x8, -#endif - // 8X16, 16X8, 16X16 - orders_8x16, orders_16x8, orders_16x16, - // 16X32, 32X16, 32X32 - orders_16x32, orders_32x16, orders_32x32, - // 32X64, 64X32, 64X64 - orders_32x64, orders_64x32, orders_64x64, - // 64x128, 128x64, 128x128 - orders_64x128, orders_128x64, orders_128x128, - // 4x16, 16x4, 8x32 - orders_4x16, orders_16x4, orders_8x32, - // 32x8, 16x64, 64x16 - orders_32x8, orders_16x64, orders_64x16, - // 32x128, 128x32 - orders_32x128, orders_128x32 -}; -/* clang-format on */ -#else -/* clang-format off */ -static const uint16_t *const orders[BLOCK_SIZES_ALL] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2 - orders_8x8, orders_8x8, orders_8x8, -#endif - // 4X4 - 
orders_8x8, - // 4X8, 8X4, 8X8 - orders_8x16, orders_16x8, orders_16x16, -#else // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 4X4 - orders_16x16, - // 4X8, 8X4, 8X8 - orders_16x16, orders_16x16, orders_16x16, -#endif - // 8X16, 16X8, 16X16 - orders_16x32, orders_32x16, orders_32x32, - // 16X32, 32X16, 32X32 - orders_32x64, orders_64x32, orders_64x64, - // 32X64, 64X32, 64X64 - orders_64x128, orders_128x64, orders_128x128, - // 4x16, 16x4, 8x32 - orders_8x32, orders_32x8, orders_16x64, - // 32x8, 16x64, 64x16 - orders_64x16, orders_32x128, orders_128x32 +static uint8_t has_tr_vert_32x32[2] = { 15, 7 }; +static uint8_t has_tr_vert_64x64[1] = { 3 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. 
Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_tr_4x8, NULL, has_tr_vert_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, NULL, has_tr_vert_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, NULL, has_tr_vert_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, NULL, has_tr_vert_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, NULL, has_tr_128x128 }; -/* clang-format on */ -#endif // CONFIG_EXT_PARTITION -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB -static const uint16_t orders_verta_64x64[4] = { - 0, 2, 1, 2, -}; -static const uint16_t orders_verta_32x32[16] = { - 0, 2, 4, 6, 1, 2, 5, 6, 8, 10, 12, 14, 9, 10, 13, 14, -}; -static const uint16_t orders_verta_16x16[64] = { - 0, 2, 4, 6, 16, 18, 20, 22, 1, 2, 5, 6, 17, 18, 21, 22, - 8, 10, 12, 14, 24, 26, 28, 30, 9, 10, 13, 14, 25, 26, 29, 30, - 32, 34, 36, 38, 48, 50, 52, 54, 33, 34, 37, 38, 49, 50, 53, 54, - 40, 42, 44, 46, 56, 58, 60, 62, 41, 42, 45, 46, 57, 58, 61, 62, -}; -#if CONFIG_EXT_PARTITION || CONFIG_CB4X4 -static const uint16_t orders_verta_8x8[256] = { - 0, 2, 4, 6, 16, 18, 20, 22, 64, 66, 68, 70, 80, 82, 84, - 86, 1, 2, 5, 6, 17, 18, 21, 22, 65, 66, 69, 70, 81, 82, - 85, 86, 8, 10, 12, 14, 24, 26, 28, 30, 72, 74, 76, 78, 88, - 90, 92, 94, 9, 10, 13, 14, 25, 26, 29, 30, 73, 74, 77, 78, - 89, 90, 93, 94, 32, 34, 36, 38, 48, 50, 52, 54, 96, 98, 100, - 102, 112, 114, 116, 118, 33, 34, 37, 38, 49, 50, 53, 54, 97, 98, - 101, 102, 113, 114, 117, 118, 40, 42, 44, 46, 56, 58, 60, 62, 104, - 106, 108, 110, 120, 122, 124, 126, 41, 42, 45, 46, 57, 58, 61, 62, - 105, 106, 109, 110, 121, 122, 125, 126, 128, 130, 132, 134, 144, 146, 148, - 150, 192, 194, 196, 198, 208, 210, 212, 214, 129, 130, 133, 134, 145, 146, - 149, 150, 193, 194, 197, 198, 209, 210, 213, 214, 136, 138, 140, 142, 152, - 154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222, 137, 138, 141, 142, - 
153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222, 160, 162, 164, - 166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246, 161, 162, - 165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246, 168, - 170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254, - 169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253, - 254, -}; -#endif // CONFIG_EXT_PARTITION || CONFIG_CB4X4 - -#if CONFIG_EXT_PARTITION -/* clang-format off */ -static const uint16_t *const orders_verta[BLOCK_SIZES] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2 - orders_4x4, orders_4x4, orders_4x4, -#endif - // 4X4 - orders_verta_8x8, - // 4X8, 8X4, 8X8 - orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, - // 8X16, 16X8, 16X16 - orders_8x16, orders_16x8, orders_verta_16x16, - // 16X32, 32X16, 32X32 - orders_16x32, orders_32x16, orders_verta_32x32, - // 32X64, 64X32, 64X64 - orders_32x64, orders_64x32, orders_verta_64x64, - // 64x128, 128x64, 128x128 - orders_64x128, orders_128x64, orders_128x128, - // Note: We can't get 4:1 shaped blocks from a VERT_A type partition -}; -/* clang-format on */ -#else -/* clang-format off */ -static const uint16_t *const orders_verta[BLOCK_SIZES] = { -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 2X2, 2X4, 4X2 - orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, -#endif - // 4X4 - orders_verta_8x8, - // 4X8, 8X4, 8X8 - orders_verta_8x8, orders_verta_8x8, orders_verta_16x16, -#else // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - // 4X4 - orders_verta_16x16, - // 4X8, 8X4, 8X8 - orders_verta_16x16, orders_verta_16x16, orders_verta_16x16, -#endif - // 8X16, 16X8, 16X16 - orders_16x32, orders_32x16, orders_verta_32x32, - // 16X32, 32X16, 32X32 - orders_32x64, orders_64x32, orders_verta_64x64, - // 32X64, 64X32, 64X64 - orders_64x128, orders_128x64, orders_128x128, - // Note: We can't get 4:1 shaped blocks from a VERT_A type partition -}; -/* clang-format on 
*/ -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES +static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_tr_vert_tables[bsize]; + } else { + ret = has_tr_tables[bsize]; + } + assert(ret); + return ret; +} static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int top_available, int right_available, -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - PARTITION_TYPE partition, -#endif // CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - TX_SIZE txsz, int row_off, int col_off, int ss_x) { + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { if (!top_available || !right_available) return 0; -#if !CONFIG_CB4X4 - // TODO(bshacklett, huisu): Currently the RD loop traverses 4X8 blocks in - // inverted N order while in the bitstream the subblocks are stored in Z - // order. This discrepancy makes this function incorrect when considering 4X8 - // blocks in the RD loop, so we disable the extended right edge for these - // blocks. The correct solution is to change the bitstream to store these - // blocks in inverted N order, and then update this function appropriately. - if (bsize == BLOCK_4X8 && row_off == 1) return 0; -#endif - const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0]; const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); const int top_right_count_unit = tx_size_wide_unit[txsz]; -#if !CONFIG_CB4X4 - // Special handling for block sizes 4x8 and 4x4. - if (ss_x == 0 && bw_unit < 2 && col_off == 0) return 1; -#endif - if (row_off > 0) { // Just need to check if enough pixels on the right. 
-#if CONFIG_EXT_PARTITION - if (col_off + top_right_count_unit >= - (block_size_wide[BLOCK_64X64] >> (tx_size_wide_log2[0] + ss_x))) - return 0; -#endif + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { + // Special case: For 128x128 blocks, the transform unit whose + // top-right corner is at the center of the block does in fact have + // pixels available at its top-right corner. + if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && + col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { + return 1; + } + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + return col_off_64 + top_right_count_unit < plane_bw_unit_64; + } return col_off + top_right_count_unit < plane_bw_unit; } else { // All top-right pixels are in the block above, which is already available. if (col_off + top_right_count_unit < plane_bw_unit) return 1; - const int bw_in_mi_log2 = mi_width_log2_lookup[bsize]; - const int bh_in_mi_log2 = mi_height_log2_lookup[bsize]; - const int sb_mi_size = mi_size_high[cm->sb_size]; + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -572,32 +232,175 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // Rightmost column of superblock (and not the top row): so top-right pixels // fall in the right superblock, which is not available yet. - if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) return 0; + if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { + return 0; + } // General case (neither top row nor rightmost column): check if the // top-right block is coded before the current block. 
- const uint16_t *const order = -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - (partition == PARTITION_VERT_A) ? orders_verta[bsize] : -#endif // CONFIG_EXT_PARTITION_TYPES - orders[bsize]; const int this_blk_index = ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb + 0; - const uint16_t this_blk_order = order[this_blk_index]; - const int tr_blk_index = - ((blk_row_in_sb - 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + - blk_col_in_sb + 1; - const uint16_t tr_blk_order = order[tr_blk_index]; - return tr_blk_order < this_blk_order; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); + return (has_tr_table[idx1] >> idx2) & 1; + } +} + +// Similar to the has_tr_* tables, but store if the bottom-left reference +// pixels are available. +static uint8_t has_bl_4x4[128] = { + 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, + 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, + 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, + 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, + 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, + 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, + 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, +}; +static uint8_t has_bl_4x8[64] = { + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, +}; +static uint8_t has_bl_8x4[64] = { + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 
255, 0, 0, +}; +static uint8_t has_bl_8x8[32] = { + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, +}; +static uint8_t has_bl_8x16[16] = { + 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, +}; +static uint8_t has_bl_16x8[16] = { + 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, +}; +static uint8_t has_bl_16x16[8] = { + 84, 16, 84, 0, 84, 16, 84, 0, +}; +static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; +static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; +static uint8_t has_bl_32x32[2] = { 4, 4 }; +static uint8_t has_bl_32x64[1] = { 0 }; +static uint8_t has_bl_64x32[1] = { 34 }; +static uint8_t has_bl_64x64[1] = { 0 }; +static uint8_t has_bl_64x128[1] = { 0 }; +static uint8_t has_bl_128x64[1] = { 0 }; +static uint8_t has_bl_128x128[1] = { 0 }; +static uint8_t has_bl_4x16[32] = { + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, +}; +static uint8_t has_bl_16x4[32] = { + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, +}; +static uint8_t has_bl_8x32[8] = { + 0, 1, 0, 0, 0, 1, 0, 0, +}; +static uint8_t has_bl_32x8[8] = { + 238, 78, 238, 14, 238, 78, 238, 14, +}; +static uint8_t has_bl_16x64[2] = { 0, 0 }; +static uint8_t has_bl_64x16[2] = { 42, 42 }; + +static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_bl_4x4, + // 4X8, 8X4, 8X8 + has_bl_4x8, has_bl_8x4, has_bl_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, has_bl_16x8, has_bl_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, has_bl_32x16, has_bl_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, has_bl_64x32, has_bl_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, has_bl_128x64, has_bl_128x128, + // 4x16, 16x4, 8x32 + has_bl_4x16, has_bl_16x4, has_bl_8x32, + // 32x8, 16x64, 64x16 + has_bl_32x8, has_bl_16x64, has_bl_64x16 +}; + +static 
uint8_t has_bl_vert_8x8[32] = { + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, +}; +static uint8_t has_bl_vert_16x16[8] = { + 254, 16, 254, 0, 254, 16, 254, 0, +}; +static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; +static uint8_t has_bl_vert_64x64[1] = { 2 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_bl_4x8, NULL, has_bl_vert_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, NULL, has_bl_vert_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, NULL, has_bl_vert_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, NULL, has_bl_vert_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, NULL, has_bl_128x128 +}; + +static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. 
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_bl_vert_tables[bsize]; + } else { + ret = has_bl_tables[bsize]; } + assert(ret); + return ret; } static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int bottom_available, int left_available, - TX_SIZE txsz, int row_off, int col_off, int ss_y) { + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { if (!bottom_available || !left_available) return 0; + // Special case for 128x* blocks, when col_off is half the block width. + // This is needed because 128x* superblocks are divided into 64x* blocks in + // raster order + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + if (col_off_64 == 0) { + // We are at the left edge of top-right or bottom-right 64x* block. + const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; + const int row_off_64 = row_off % plane_bh_unit_64; + const int plane_bh_unit = + AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); + // Check if all bottom-left pixels are in the left 64x* block (which is + // already coded). + return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; + } + } + if (col_off > 0) { // Bottom-left pixels are in the bottom-left block, which is not available. return 0; @@ -606,17 +409,12 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); const int bottom_left_count_unit = tx_size_high_unit[txsz]; -#if !CONFIG_CB4X4 - // Special handling for block sizes 8x4 and 4x4. - if (ss_y == 0 && bh_unit < 2 && row_off == 0) return 1; -#endif - // All bottom-left pixels are in the left block, which is already available. 
if (row_off + bottom_left_count_unit < plane_bh_unit) return 1; - const int bw_in_mi_log2 = mi_width_log2_lookup[bsize]; - const int bh_in_mi_log2 = mi_height_log2_lookup[bsize]; - const int sb_mi_size = mi_size_high[cm->sb_size]; + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -629,8 +427,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, tx_size_wide_log2[0]) >> ss_y; const int row_off_in_sb = blk_start_row_off + row_off; - const int sb_height_unit = - sb_mi_size << (MI_SIZE_LOG2 - tx_size_wide_log2[0]) >> ss_y; + const int sb_height_unit = sb_mi_size >> ss_y; return row_off_in_sb + bottom_left_count_unit < sb_height_unit; } @@ -640,16 +437,13 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // General case (neither leftmost column nor bottom row): check if the // bottom-left block is coded before the current block. 
- const uint16_t *const order = orders[bsize]; const int this_blk_index = ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb + 0; - const uint16_t this_blk_order = order[this_blk_index]; - const int bl_blk_index = - ((blk_row_in_sb + 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + - blk_col_in_sb - 1; - const uint16_t bl_blk_order = order[bl_blk_index]; - return bl_blk_order < this_blk_order; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_bl_table = get_has_bl_table(partition, bsize); + return (has_bl_table[idx1] >> idx2) & 1; } } @@ -659,20 +453,15 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; -#if CONFIG_HIGHBITDEPTH typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; -#endif // CONFIG_HIGHBITDEPTH -static void av1_init_intra_predictors_internal(void) { -#if CONFIG_EXT_INTRA +static void init_intra_predictors_internal(void) { assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); -#endif // CONFIG_EXT_INTRA -#if CONFIG_TX64X64 #define INIT_RECTANGULAR(p, type) \ p[TX_4X8] = aom_##type##_predictor_4x8; \ p[TX_8X4] = aom_##type##_predictor_8x4; \ @@ -681,132 +470,53 @@ static void av1_init_intra_predictors_internal(void) { p[TX_16X32] = aom_##type##_predictor_16x32; \ p[TX_32X16] = aom_##type##_predictor_32x16; \ p[TX_32X64] = aom_##type##_predictor_32x64; \ - p[TX_64X32] = aom_##type##_predictor_64x32; -#else -#define INIT_RECTANGULAR(p, type) \ - p[TX_4X8] = aom_##type##_predictor_4x8; \ - p[TX_8X4] = aom_##type##_predictor_8x4; \ - p[TX_8X16] = aom_##type##_predictor_8x16; \ - p[TX_16X8] = aom_##type##_predictor_16x8; \ - p[TX_16X32] = aom_##type##_predictor_16x32; \ - 
p[TX_32X16] = aom_##type##_predictor_32x16; -#endif // CONFIG_TX64X64 + p[TX_64X32] = aom_##type##_predictor_64x32; \ + p[TX_4X16] = aom_##type##_predictor_4x16; \ + p[TX_16X4] = aom_##type##_predictor_16x4; \ + p[TX_8X32] = aom_##type##_predictor_8x32; \ + p[TX_32X8] = aom_##type##_predictor_32x8; \ + p[TX_16X64] = aom_##type##_predictor_16x64; \ + p[TX_64X16] = aom_##type##_predictor_64x16; -#if CONFIG_TX64X64 #define INIT_NO_4X4(p, type) \ p[TX_8X8] = aom_##type##_predictor_8x8; \ p[TX_16X16] = aom_##type##_predictor_16x16; \ p[TX_32X32] = aom_##type##_predictor_32x32; \ p[TX_64X64] = aom_##type##_predictor_64x64; \ INIT_RECTANGULAR(p, type) -#else -#define INIT_NO_4X4(p, type) \ - p[TX_8X8] = aom_##type##_predictor_8x8; \ - p[TX_16X16] = aom_##type##_predictor_16x16; \ - p[TX_32X32] = aom_##type##_predictor_32x32; \ - INIT_RECTANGULAR(p, type) -#endif // CONFIG_TX64X64 -#if CONFIG_CHROMA_2X2 #define INIT_ALL_SIZES(p, type) \ - p[TX_2X2] = aom_##type##_predictor_2x2; \ p[TX_4X4] = aom_##type##_predictor_4x4; \ INIT_NO_4X4(p, type) -#else -#define INIT_ALL_SIZES(p, type) \ - p[TX_4X4] = aom_##type##_predictor_4x4; \ - INIT_NO_4X4(p, type) -#endif INIT_ALL_SIZES(pred[V_PRED], v); INIT_ALL_SIZES(pred[H_PRED], h); - INIT_ALL_SIZES(pred[D207_PRED], d207e); - INIT_ALL_SIZES(pred[D45_PRED], d45e); - INIT_ALL_SIZES(pred[D63_PRED], d63e); - INIT_ALL_SIZES(pred[D117_PRED], d117); - INIT_ALL_SIZES(pred[D135_PRED], d135); - INIT_ALL_SIZES(pred[D153_PRED], d153); - - INIT_ALL_SIZES(pred[TM_PRED], paeth); + INIT_ALL_SIZES(pred[PAETH_PRED], paeth); INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth); -#if CONFIG_SMOOTH_HV INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v); INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h); -#endif // CONFIG_SMOOTH_HV - INIT_ALL_SIZES(dc_pred[0][0], dc_128); INIT_ALL_SIZES(dc_pred[0][1], dc_top); INIT_ALL_SIZES(dc_pred[1][0], dc_left); INIT_ALL_SIZES(dc_pred[1][1], dc); -#if CONFIG_HIGHBITDEPTH INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); 
INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); - INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e); - INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e); - INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63e); - INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117); - INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135); - INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153); - - INIT_ALL_SIZES(pred_high[TM_PRED], highbd_paeth); + INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth); INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth); -#if CONFIG_SMOOTH_HV INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v); INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h); -#endif // CONFIG_SMOOTH_HV - INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128); INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); -#endif // CONFIG_HIGHBITDEPTH - #undef intra_pred_allsizes } -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -static int intra_subpel_interp(int base, int shift, const uint8_t *ref, - int ref_start_idx, int ref_end_idx, - INTRA_FILTER filter_type) { - int val, k, idx, filter_idx = 0; - const int16_t *filter = NULL; - - if (filter_type == INTRA_FILTER_LINEAR) { - val = ref[base] * (256 - shift) + ref[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); - } else { - filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS); - filter = av1_intra_filter_kernels[filter_type][filter_idx]; - - if (filter_idx < (1 << SUBPEL_BITS)) { - val = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) { - idx = base + 1 - (SUBPEL_TAPS / 2) + k; - idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx); - val += ref[idx] * filter[k]; - } - val = ROUND_POWER_OF_TWO(val, FILTER_BITS); - } else { - val = ref[base + 1]; - } - } - - return val; -} -#endif // CONFIG_INTRA_INTERP - // Directional prediction, zone 1: 0 < angle < 90 -static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t 
*above, const uint8_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_above, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy) { +void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { int r, c, x, base, shift, val; (void)left; @@ -814,16 +524,13 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, assert(dy == 1); assert(dx > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int max_base_x = ((bw + bh) - 1) << upsample_above; - const int frac_bits = 8 - upsample_above; + const int frac_bits = 6 - upsample_above; const int base_inc = 1 << upsample_above; x = dx; for (r = 0; r < bh; ++r, dst += stride, x += dx) { base = x >> frac_bits; - shift = (x << upsample_above) & 0xFF; + shift = ((x << upsample_above) & 0x3F) >> 1; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { @@ -835,14 +542,8 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, for (c = 0; c < bw; ++c, base += base_inc) { if (base < max_base_x) { -#if CONFIG_INTRA_INTERP - val = intra_subpel_interp(base, shift, above, 0, bw + bh - 1, - filter_type); -#else // CONFIG_INTRA_INTERP - val = above[base] * (256 - shift) + above[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP - dst[c] = clip_pixel(val); + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); } else { dst[c] = above[max_base_x]; } @@ -851,68 +552,44 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } // Directional prediction, zone 2: 90 < angle < 180 -static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER 
filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_above, int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy) { +void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { int r, c, x, y, shift1, shift2, val, base1, base2; assert(dx > 0); assert(dy > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = 0; - const int upsample_left = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int min_base_x = -(1 << upsample_above); - const int frac_bits_x = 8 - upsample_above; - const int frac_bits_y = 8 - upsample_left; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; const int base_inc_x = 1 << upsample_above; x = -dx; for (r = 0; r < bh; ++r, x -= dx, dst += stride) { base1 = x >> frac_bits_x; - y = (r << 8) - dy; + y = (r << 6) - dy; for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) { if (base1 >= min_base_x) { - shift1 = (x * (1 << upsample_above)) & 0xFF; -#if CONFIG_INTRA_INTERP - val = - intra_subpel_interp(base1, shift1, above, -1, bw - 1, filter_type); -#else - val = above[base1] * (256 - shift1) + above[base1 + 1] * shift1; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP + shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1; + val = ROUND_POWER_OF_TWO(val, 5); } else { base2 = y >> frac_bits_y; assert(base2 >= -(1 << upsample_left)); - shift2 = (y * (1 << upsample_left)) & 0xFF; -#if CONFIG_INTRA_INTERP - val = intra_subpel_interp(base2, shift2, left, -1, bh - 1, filter_type); -#else - val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP + shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2; + val = 
ROUND_POWER_OF_TWO(val, 5); } - dst[c] = clip_pixel(val); + dst[c] = val; } } } // Directional prediction, zone 3: 180 < angle < 270 -static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy) { +void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { int r, c, y, base, shift, val; (void)above; @@ -921,27 +598,18 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh, assert(dx == 1); assert(dy > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_left = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int max_base_y = (bw + bh - 1) << upsample_left; - const int frac_bits = 8 - upsample_left; + const int frac_bits = 6 - upsample_left; const int base_inc = 1 << upsample_left; y = dy; for (c = 0; c < bw; ++c, y += dy) { base = y >> frac_bits; - shift = (y << upsample_left) & 0xFF; + shift = ((y << upsample_left) & 0x3F) >> 1; for (r = 0; r < bh; ++r, base += base_inc) { if (base < max_base_y) { -#if CONFIG_INTRA_INTERP - val = - intra_subpel_interp(base, shift, left, 0, bw + bh - 1, filter_type); -#else // CONFIG_INTRA_INTERP - val = left[base] * (256 - shift) + left[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP - dst[r * stride + c] = clip_pixel(val); + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = val = ROUND_POWER_OF_TWO(val, 5); } else { for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; break; @@ -950,78 +618,24 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. 
-// If angle > 0 && angle < 90, dx = -((int)(256 / t)); -// If angle > 90 && angle < 180, dx = (int)(256 / t); -// If angle > 180 && angle < 270, dx = 1; -static INLINE int get_dx(int angle) { - if (angle > 0 && angle < 90) { - return dr_intra_derivative[angle]; - } else if (angle > 90 && angle < 180) { - return dr_intra_derivative[180 - angle]; - } else { - // In this case, we are not really going to use dx. We may return any value. - return 1; - } -} - -// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. -// If angle > 0 && angle < 90, dy = 1; -// If angle > 90 && angle < 180, dy = (int)(256 * t); -// If angle > 180 && angle < 270, dy = -((int)(256 * t)); -static INLINE int get_dy(int angle) { - if (angle > 90 && angle < 180) { - return dr_intra_derivative[angle - 90]; - } else if (angle > 180 && angle < 270) { - return dr_intra_derivative[270 - angle]; - } else { - // In this case, we are not really going to use dy. We may return any value. - return 1; - } -} - static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_above, int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int angle) { - const int dx = get_dx(angle); - const int dy = get_dy(angle); + int upsample_above, int upsample_left, int angle) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; assert(angle > 0 && angle < 270); if (angle > 0 && angle < 90) { - dr_prediction_z1(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy); + av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, + dy); } else if (angle > 90 && angle < 180) { 
- dr_prediction_z2(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy); + av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, + upsample_left, dx, dy); } else if (angle > 180 && angle < 270) { - dr_prediction_z3(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy); + av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, + dy); } else if (angle == 90) { pred[V_PRED][tx_size](dst, stride, above, left); } else if (angle == 180) { @@ -1029,66 +643,26 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, } } -#if CONFIG_HIGHBITDEPTH -#if CONFIG_INTRA_INTERP -static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref, - int ref_start_idx, int ref_end_idx, - INTRA_FILTER filter_type) { - int val, k, idx, filter_idx = 0; - const int16_t *filter = NULL; - - if (filter_type == INTRA_FILTER_LINEAR) { - val = ref[base] * (256 - shift) + ref[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); - } else { - filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS); - filter = av1_intra_filter_kernels[filter_type][filter_idx]; - - if (filter_idx < (1 << SUBPEL_BITS)) { - val = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) { - idx = base + 1 - (SUBPEL_TAPS / 2) + k; - idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx); - val += ref[idx] * filter[k]; - } - val = ROUND_POWER_OF_TWO(val, FILTER_BITS); - } else { - val = ref[base + 1]; - } - } - - return val; -} -#endif // CONFIG_INTRA_INTERP - // Directional prediction, zone 1: 0 < angle < 90 -static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, -#if 
CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_above, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy, int bd) { +void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { int r, c, x, base, shift, val; (void)left; (void)dy; + (void)bd; assert(dy == 1); assert(dx > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int max_base_x = ((bw + bh) - 1) << upsample_above; - const int frac_bits = 8 - upsample_above; + const int frac_bits = 6 - upsample_above; const int base_inc = 1 << upsample_above; x = dx; for (r = 0; r < bh; ++r, dst += stride, x += dx) { base = x >> frac_bits; - shift = (x << upsample_above) & 0xFF; + shift = ((x << upsample_above) & 0x3F) >> 1; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { @@ -1100,14 +674,8 @@ static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw, for (c = 0; c < bw; ++c, base += base_inc) { if (base < max_base_x) { -#if CONFIG_INTRA_INTERP - val = highbd_intra_subpel_interp(base, shift, above, 0, bw + bh - 1, - filter_type); -#else - val = above[base] * (256 - shift) + above[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP - dst[c] = clip_pixel_highbd(val, bd); + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); } else { dst[c] = above[max_base_x]; } @@ -1116,100 +684,67 @@ static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw, } // Directional prediction, zone 2: 90 < angle < 180 -static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE 
- int upsample_above, int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy, int bd) { +void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { int r, c, x, y, shift, val, base; + (void)bd; assert(dx > 0); assert(dy > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = 0; - const int upsample_left = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int min_base_x = -(1 << upsample_above); - const int frac_bits_x = 8 - upsample_above; - const int frac_bits_y = 8 - upsample_left; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; for (r = 0; r < bh; ++r) { for (c = 0; c < bw; ++c) { y = r + 1; - x = (c << 8) - y * dx; + x = (c << 6) - y * dx; base = x >> frac_bits_x; if (base >= min_base_x) { - shift = (x * (1 << upsample_above)) & 0xFF; -#if CONFIG_INTRA_INTERP - val = highbd_intra_subpel_interp(base, shift, above, -1, bw - 1, - filter_type); -#else - val = above[base] * (256 - shift) + above[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP + shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base] * (32 - shift) + above[base + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); } else { x = c + 1; - y = (r << 8) - x * dy; + y = (r << 6) - x * dy; base = y >> frac_bits_y; - shift = (y * (1 << upsample_left)) & 0xFF; -#if CONFIG_INTRA_INTERP - val = highbd_intra_subpel_interp(base, shift, left, -1, bh - 1, - filter_type); -#else - val = left[base] * (256 - shift) + left[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP + shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base] * (32 - shift) + left[base + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); } - dst[c] = clip_pixel_highbd(val, bd); + dst[c] = val; } dst += stride; } } // Directional prediction, zone 
3: 180 < angle < 270 -static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter_type, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int dx, int dy, int bd) { +void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { int r, c, y, base, shift, val; (void)above; (void)dx; + (void)bd; assert(dx == 1); assert(dy > 0); -#if !CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_left = 0; -#endif // !CONFIG_INTRA_EDGE_UPSAMPLE const int max_base_y = (bw + bh - 1) << upsample_left; - const int frac_bits = 8 - upsample_left; + const int frac_bits = 6 - upsample_left; const int base_inc = 1 << upsample_left; y = dy; for (c = 0; c < bw; ++c, y += dy) { base = y >> frac_bits; - shift = (y << upsample_left) & 0xFF; + shift = ((y << upsample_left) & 0x3F) >> 1; for (r = 0; r < bh; ++r, base += base_inc) { if (base < max_base_y) { -#if CONFIG_INTRA_INTERP - val = highbd_intra_subpel_interp(base, shift, left, 0, bw + bh - 1, - filter_type); -#else - val = left[base] * (256 - shift) + left[base + 1] * shift; - val = ROUND_POWER_OF_TWO(val, 8); -#endif // CONFIG_INTRA_INTERP - dst[r * stride + c] = clip_pixel_highbd(val, bd); + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); } else { for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; break; @@ -1220,1002 +755,253 @@ static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bw, static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - int upsample_above, 
int upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - int angle, int bd) { - const int dx = get_dx(angle); - const int dy = get_dy(angle); + const uint16_t *left, int upsample_above, + int upsample_left, int angle, int bd) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; assert(angle > 0 && angle < 270); if (angle > 0 && angle < 90) { - highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy, bd); + av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, + upsample_above, dx, dy, bd); } else if (angle > 90 && angle < 180) { - highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy, bd); + av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, + upsample_above, upsample_left, dx, dy, bd); } else if (angle > 180 && angle < 270) { - highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, -#if CONFIG_INTRA_INTERP - filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - dx, dy, bd); + av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, + dx, dy, bd); } else if (angle == 90) { pred_high[V_PRED][tx_size](dst, stride, above, left, bd); } else if (angle == 180) { pred_high[H_PRED][tx_size](dst, stride, above, left, bd); } } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA -#if USE_3TAP_INTRA_FILTER -static int filter_intra_taps_3[TX_SIZES_ALL][FILTER_INTRA_MODES][3] = { -#if CONFIG_CHROMA_2X2 - { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - 
{ 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, - }, -#endif +DECLARE_ALIGNED(16, const int8_t, + av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - { 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, + { -6, 10, 0, 0, 0, 12, 0, 0 }, + { -5, 2, 10, 0, 0, 9, 0, 0 }, + { -3, 1, 1, 10, 0, 7, 0, 0 }, + { -3, 1, 1, 2, 10, 5, 0, 0 }, + { -4, 6, 0, 0, 0, 2, 12, 0 }, + { -3, 2, 6, 0, 0, 2, 9, 0 }, + { -3, 2, 2, 6, 0, 2, 7, 0 }, + { -3, 1, 2, 2, 6, 3, 5, 0 }, }, { - { 659, 816, -451 }, - { 980, 625, -581 }, - { 558, 962, -496 }, - { 681, 888, -545 }, - { 591, 613, 180 }, - { 778, 399, -153 }, - { 495, 641, -112 }, - { 671, 937, -584 }, - { 745, 940, -661 }, - { 839, 911, -726 }, + { -10, 16, 0, 0, 0, 10, 0, 0 }, + { -6, 0, 16, 0, 0, 6, 0, 0 }, + { -4, 0, 0, 16, 0, 4, 0, 0 }, + { -2, 0, 0, 0, 16, 2, 0, 0 }, + { -10, 16, 0, 0, 0, 0, 10, 0 }, + { -6, 0, 16, 0, 0, 0, 6, 0 }, + { -4, 0, 0, 16, 0, 0, 4, 0 }, + { -2, 0, 0, 0, 16, 0, 2, 0 }, }, { - { 539, 927, -442 }, - { 1003, 714, -693 }, - { 349, 1271, -596 }, - { 820, 764, -560 }, - { 524, 816, -316 }, - { 780, 681, -437 }, - { 586, 795, -357 }, - { 551, 1135, -663 }, - { 593, 1061, -630 }, - { 974, 970, -920 }, + { -8, 8, 0, 0, 0, 16, 0, 0 }, + { -8, 0, 8, 0, 0, 16, 0, 0 }, + { -8, 0, 0, 8, 0, 16, 0, 0 }, + { -8, 0, 0, 0, 8, 16, 0, 0 }, + { -4, 4, 0, 0, 0, 0, 16, 0 }, + { -4, 0, 4, 0, 0, 0, 16, 0 }, + { -4, 0, 0, 4, 0, 0, 16, 0 }, + { -4, 0, 0, 0, 4, 0, 16, 0 }, }, { - { 595, 919, -490 }, - { 945, 668, -579 }, - { 495, 962, -433 }, - { 385, 1551, -912 }, - { 455, 554, 15 }, - { 852, 478, -306 }, - { 177, 760, -87 }, - { -65, 1611, -522 }, - { 815, 894, -685 }, - { 846, 1010, -832 }, + { -2, 8, 0, 0, 0, 10, 0, 0 }, + { -1, 3, 8, 0, 0, 6, 0, 0 }, + { -1, 2, 3, 8, 0, 4, 0, 0 }, + 
{ 0, 1, 2, 3, 8, 2, 0, 0 }, + { -1, 4, 0, 0, 0, 3, 10, 0 }, + { -1, 3, 4, 0, 0, 4, 6, 0 }, + { -1, 2, 3, 4, 0, 4, 4, 0 }, + { -1, 2, 2, 3, 4, 3, 3, 0 }, }, -#if CONFIG_TX64X64 { - { 595, 919, -490 }, - { 945, 668, -579 }, - { 495, 962, -433 }, - { 385, 1551, -912 }, - { 455, 554, 15 }, - { 852, 478, -306 }, - { 177, 760, -87 }, - { -65, 1611, -522 }, - { 815, 894, -685 }, - { 846, 1010, -832 }, + { -12, 14, 0, 0, 0, 14, 0, 0 }, + { -10, 0, 14, 0, 0, 12, 0, 0 }, + { -9, 0, 0, 14, 0, 11, 0, 0 }, + { -8, 0, 0, 0, 14, 10, 0, 0 }, + { -10, 12, 0, 0, 0, 0, 14, 0 }, + { -9, 1, 12, 0, 0, 0, 12, 0 }, + { -8, 0, 0, 12, 0, 1, 11, 0 }, + { -7, 0, 0, 1, 12, 1, 9, 0 }, }, -#endif // CONFIG_TX64X64 - { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - { 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, - }, - { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - { 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, - }, - { - { 659, 816, -451 }, - { 980, 625, -581 }, - { 558, 962, -496 }, - { 681, 888, -545 }, - { 591, 613, 180 }, - { 778, 399, -153 }, - { 495, 641, -112 }, - { 671, 937, -584 }, - { 745, 940, -661 }, - { 839, 911, -726 }, - }, - { - { 659, 816, -451 }, - { 980, 625, -581 }, - { 558, 962, -496 }, - { 681, 888, -545 }, - { 591, 613, 180 }, - { 778, 399, -153 }, - { 495, 641, -112 }, - { 671, 937, -584 }, - { 745, 940, -661 }, - { 839, 911, -726 }, - }, - { - { 539, 927, -442 }, - { 1003, 714, -693 }, - { 349, 1271, -596 }, - { 820, 764, -560 }, - { 524, 816, -316 }, - { 780, 681, -437 }, - { 586, 795, -357 }, - { 551, 1135, -663 }, - { 593, 1061, -630 }, - { 974, 970, -920 }, - }, - { - { 539, 927, -442 }, - { 1003, 714, -693 }, - { 349, 1271, -596 }, - { 820, 764, -560 }, - { 524, 816, -316 }, - { 780, 681, -437 }, - { 586, 795, -357 
}, - { 551, 1135, -663 }, - { 593, 1061, -630 }, - { 974, 970, -920 }, - }, - { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - { 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, - }, - { - { 697, 836, -509 }, - { 993, 513, -482 }, - { 381, 984, -341 }, - { 642, 1169, -787 }, - { 590, 553, -119 }, - { 762, 385, -123 }, - { 358, 687, -21 }, - { 411, 1083, -470 }, - { 912, 814, -702 }, - { 883, 902, -761 }, - }, - { - { 659, 816, -451 }, - { 980, 625, -581 }, - { 558, 962, -496 }, - { 681, 888, -545 }, - { 591, 613, 180 }, - { 778, 399, -153 }, - { 495, 641, -112 }, - { 671, 937, -584 }, - { 745, 940, -661 }, - { 839, 911, -726 }, - }, - { - { 659, 816, -451 }, - { 980, 625, -581 }, - { 558, 962, -496 }, - { 681, 888, -545 }, - { 591, 613, 180 }, - { 778, 399, -153 }, - { 495, 641, -112 }, - { 671, 937, -584 }, - { 745, 940, -661 }, - { 839, 911, -726 }, - } }; -#else -static int filter_intra_taps_4[TX_SIZES_ALL][FILTER_INTRA_MODES][4] = { -#if CONFIG_CHROMA_2X2 - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, -#endif - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, - { - { 648, 803, -444, 16 }, - { 972, 620, -576, 7 }, - { 561, 967, -499, -5 }, - { 585, 762, -468, 144 }, - { 596, 619, -182, -9 }, - { 895, 459, -176, -153 }, - { 557, 722, -126, -129 }, - { 601, 839, -523, 105 }, - { 562, 709, -499, 251 }, - { 803, 872, -695, 43 }, - }, - { - { 423, 728, -347, 111 }, - { 963, 685, -665, 23 }, - 
{ 281, 1024, -480, 216 }, - { 640, 596, -437, 78 }, - { 429, 669, -259, 99 }, - { 740, 646, -415, 23 }, - { 568, 771, -346, 40 }, - { 404, 833, -486, 209 }, - { 398, 712, -423, 307 }, - { 939, 935, -887, 17 }, - }, - { - { 477, 737, -393, 150 }, - { 881, 630, -546, 67 }, - { 506, 984, -443, -20 }, - { 114, 459, -270, 528 }, - { 433, 528, 14, 3 }, - { 837, 470, -301, -30 }, - { 181, 777, 89, -107 }, - { -29, 716, -232, 259 }, - { 589, 646, -495, 255 }, - { 740, 884, -728, 77 }, - }, -#if CONFIG_TX64X64 - { - { 477, 737, -393, 150 }, - { 881, 630, -546, 67 }, - { 506, 984, -443, -20 }, - { 114, 459, -270, 528 }, - { 433, 528, 14, 3 }, - { 837, 470, -301, -30 }, - { 181, 777, 89, -107 }, - { -29, 716, -232, 259 }, - { 589, 646, -495, 255 }, - { 740, 884, -728, 77 }, - }, -#endif // CONFIG_TX64X64 - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, - { - { 648, 803, -444, 16 }, - { 972, 620, -576, 7 }, - { 561, 967, -499, -5 }, - { 585, 762, -468, 144 }, - { 596, 619, -182, -9 }, - { 895, 459, -176, -153 }, - { 557, 722, -126, -129 }, - { 601, 839, -523, 105 }, - { 562, 709, -499, 251 }, - { 803, 872, -695, 43 }, - }, - { - { 648, 803, -444, 16 }, - { 972, 620, -576, 7 }, - { 561, 967, -499, -5 }, - { 585, 762, -468, 144 }, - { 596, 619, -182, -9 }, - { 895, 459, -176, -153 }, - { 557, 722, -126, -129 }, - { 601, 839, -523, 105 }, - { 562, 709, -499, 251 }, - { 803, 872, -695, 43 }, - }, - { - { 423, 728, -347, 111 }, - { 963, 685, -665, 23 }, - { 281, 1024, -480, 216 }, 
- { 640, 596, -437, 78 }, - { 429, 669, -259, 99 }, - { 740, 646, -415, 23 }, - { 568, 771, -346, 40 }, - { 404, 833, -486, 209 }, - { 398, 712, -423, 307 }, - { 939, 935, -887, 17 }, - }, - { - { 423, 728, -347, 111 }, - { 963, 685, -665, 23 }, - { 281, 1024, -480, 216 }, - { 640, 596, -437, 78 }, - { 429, 669, -259, 99 }, - { 740, 646, -415, 23 }, - { 568, 771, -346, 40 }, - { 404, 833, -486, 209 }, - { 398, 712, -423, 307 }, - { 939, 935, -887, 17 }, - }, - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, - { - { 735, 881, -537, -54 }, - { 1005, 519, -488, -11 }, - { 383, 990, -343, -6 }, - { 442, 805, -542, 319 }, - { 658, 616, -133, -116 }, - { 875, 442, -141, -151 }, - { 386, 741, -23, -80 }, - { 390, 1027, -446, 51 }, - { 679, 606, -523, 262 }, - { 903, 922, -778, -23 }, - }, - { - { 648, 803, -444, 16 }, - { 972, 620, -576, 7 }, - { 561, 967, -499, -5 }, - { 585, 762, -468, 144 }, - { 596, 619, -182, -9 }, - { 895, 459, -176, -153 }, - { 557, 722, -126, -129 }, - { 601, 839, -523, 105 }, - { 562, 709, -499, 251 }, - { 803, 872, -695, 43 }, - }, - { - { 648, 803, -444, 16 }, - { 972, 620, -576, 7 }, - { 561, 967, -499, -5 }, - { 585, 762, -468, 144 }, - { 596, 619, -182, -9 }, - { 895, 459, -176, -153 }, - { 557, 722, -126, -129 }, - { 601, 839, -523, 105 }, - { 562, 709, -499, 251 }, - { 803, 872, -695, 43 }, - } -}; -#endif - -#if USE_3TAP_INTRA_FILTER -static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left, int mode) { - int r, c; - int mean, ipred; -#if CONFIG_TX64X64 - int buffer[65][65]; -#else - int buffer[33][33]; -#endif // CONFIG_TX64X64 - const int c0 = filter_intra_taps_3[tx_size][mode][0]; - const int c1 = 
filter_intra_taps_3[tx_size][mode][1]; - const int c2 = filter_intra_taps_3[tx_size][mode][2]; - const int bw = tx_size_wide[tx_size]; - const int bh = tx_size_high[tx_size]; - - mean = 0; - for (r = 0; r < bh; ++r) { - mean += (int)left[r]; - } - for (c = 0; c < bw; ++c) { - mean += (int)above[c]; - } - mean = (mean + ((bw + bh) >> 1)) / (bw + bh); - - for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r] - mean; - - for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean; - for (r = 1; r < bh + 1; ++r) - for (c = 1; c < bw + 1; ++c) { - ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] + - c2 * buffer[r - 1][c - 1]; - buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS); - buffer[r][c] = clip_pixel(buffer[r][c] + mean) - mean; - } - - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = clip_pixel(buffer[r + 1][c + 1] + mean); - } - dst += stride; - } -} -#else -static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left, int mode) { +void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { int r, c; - int mean, ipred; -#if CONFIG_TX64X64 - int buffer[65][129]; -#else - int buffer[33][65]; -#endif // CONFIG_TX64X64 - const int c0 = filter_intra_taps_4[tx_size][mode][0]; - const int c1 = filter_intra_taps_4[tx_size][mode][1]; - const int c2 = filter_intra_taps_4[tx_size][mode][2]; - const int c3 = filter_intra_taps_4[tx_size][mode][3]; + uint8_t buffer[33][33]; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; - mean = 0; - for (r = 0; r < bh; ++r) { - mean += (int)left[r]; - } - for (c = 0; c < bw; ++c) { - mean += (int)above[c]; - } - mean = (mean + ((bw + bh) >> 1)) / (bw + bh); - - for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r] - mean; - - for (c = 0; c < 2 * bw + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean; - 
- for (r = 1; r < bh + 1; ++r) - for (c = 1; c < 2 * bw + 1 - r; ++c) { - ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] + - c2 * buffer[r - 1][c - 1] + c3 * buffer[r - 1][c + 1]; - buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS); - buffer[r][c] = clip_pixel(buffer[r][c] + mean) - mean; + assert(bw <= 32 && bh <= 32); + + // The initialization is just for silencing Jenkins static analysis warnings + for (r = 0; r < bh + 1; ++r) + memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint8_t p0 = buffer[r - 1][c - 1]; + const uint8_t p1 = buffer[r - 1][c]; + const uint8_t p2 = buffer[r - 1][c + 1]; + const uint8_t p3 = buffer[r - 1][c + 2]; + const uint8_t p4 = buffer[r - 1][c + 3]; + const uint8_t p5 = buffer[r][c - 1]; + const uint8_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS)); + } } for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = clip_pixel(buffer[r + 1][c + 1] + mean); - } + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); dst += stride; } } -#endif - -void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_DC_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, 
above, left, - FILTER_DC_PRED); -#endif -} -void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_V_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_V_PRED); -#endif -} - -void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_H_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_H_PRED); -#endif -} - -void av1_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D45_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D45_PRED); -#endif -} - -void av1_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D135_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D135_PRED); -#endif -} - -void av1_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D117_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D117_PRED); -#endif -} - -void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, 
left, - FILTER_D153_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D153_PRED); -#endif -} - -void av1_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D207_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D207_PRED); -#endif -} - -void av1_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D63_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D63_PRED); -#endif -} - -void av1_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { -#if USE_3TAP_INTRA_FILTER - filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_TM_PRED); -#else - filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_TM_PRED); -#endif -} - -static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst, - ptrdiff_t stride, TX_SIZE tx_size, - const uint8_t *above, const uint8_t *left) { - switch (mode) { - case FILTER_DC_PRED: - av1_dc_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_V_PRED: - av1_v_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_H_PRED: - av1_h_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_D45_PRED: - av1_d45_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_D135_PRED: - av1_d135_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_D117_PRED: - av1_d117_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_D153_PRED: - av1_d153_filter_predictor(dst, stride, 
tx_size, above, left); - break; - case FILTER_D207_PRED: - av1_d207_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_D63_PRED: - av1_d63_filter_predictor(dst, stride, tx_size, above, left); - break; - case FILTER_TM_PRED: - av1_tm_filter_predictor(dst, stride, tx_size, above, left); - break; - default: assert(0); - } -} -#if CONFIG_HIGHBITDEPTH -#if USE_3TAP_INTRA_FILTER -static void highbd_filter_intra_predictors_3tap(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, - const uint16_t *above, - const uint16_t *left, int mode, - int bd) { +static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, + const uint16_t *above, + const uint16_t *left, int mode, + int bd) { int r, c; - int mean, ipred; -#if CONFIG_TX64X64 - int preds[65][65]; -#else - int preds[33][33]; -#endif // CONFIG_TX64X64 - const int c0 = filter_intra_taps_3[tx_size][mode][0]; - const int c1 = filter_intra_taps_3[tx_size][mode][1]; - const int c2 = filter_intra_taps_3[tx_size][mode][2]; + uint16_t buffer[33][33]; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; - mean = 0; - for (r = 0; r < bh; ++r) { - mean += (int)left[r]; - } - for (c = 0; c < bw; ++c) { - mean += (int)above[c]; - } - mean = (mean + ((bw + bh) >> 1)) / (bw + bh); - - for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r] - mean; - - for (c = 0; c < bw + 1; ++c) preds[0][c] = (int)above[c - 1] - mean; - - for (r = 1; r < bh + 1; ++r) - for (c = 1; c < bw + 1; ++c) { - ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] + - c2 * preds[r - 1][c - 1]; - preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS); - preds[r][c] = clip_pixel_highbd(preds[r][c] + mean, bd) - mean; + assert(bw <= 32 && bh <= 32); + + // The initialization is just for silencing Jenkins static analysis warnings + for (r = 0; r < bh + 1; ++r) + memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + 
memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint16_t p0 = buffer[r - 1][c - 1]; + const uint16_t p1 = buffer[r - 1][c]; + const uint16_t p2 = buffer[r - 1][c + 1]; + const uint16_t p3 = buffer[r - 1][c + 2]; + const uint16_t p4 = buffer[r - 1][c + 3]; + const uint16_t p5 = buffer[r][c - 1]; + const uint16_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS), + bd); + } } for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = clip_pixel_highbd(preds[r + 1][c + 1] + mean, bd); - } + memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); dst += stride; } } -#else -static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, - const uint16_t *above, - const uint16_t *left, int mode, - int bd) { - int r, c; - int mean, ipred; -#if CONFIG_TX64X64 - int preds[65][129]; -#else - int preds[33][65]; -#endif // CONFIG_TX64X64 - const int c0 = filter_intra_taps_4[tx_size][mode][0]; - const int c1 = filter_intra_taps_4[tx_size][mode][1]; - const int c2 = filter_intra_taps_4[tx_size][mode][2]; - const int c3 = filter_intra_taps_4[tx_size][mode][3]; - const int bw = tx_size_wide[tx_size]; - const int bh = tx_size_high[tx_size]; - mean = 0; - for (r = 0; r < bh; ++r) { - mean += (int)left[r]; - } - for (c = 0; c < bw; ++c) { - mean += (int)above[c]; - } - mean = (mean + ((bw + bh) >> 1)) / (bw + bh); - - for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r] - mean; - - 
for (c = 0; c < 2 * bw + 1; ++c) preds[0][c] = (int)above[c - 1] - mean; - - for (r = 1; r < bh + 1; ++r) - for (c = 1; c < 2 * bw + 1 - r; ++c) { - ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] + - c2 * preds[r - 1][c - 1] + c3 * preds[r - 1][c + 1]; - preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS); - preds[r][c] = clip_pixel_highbd(preds[r][c] + mean, bd) - mean; - } +static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { + if (plane == 0) { + const PREDICTION_MODE mode = mbmi->mode; + return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED); + } else { + // uv_mode is not set for inter blocks, so need to explicitly + // detect that case. + if (is_inter_block(mbmi)) return 0; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = clip_pixel_highbd(preds[r + 1][c + 1] + mean, bd); - } - dst += stride; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || + uv_mode == UV_SMOOTH_H_PRED); } } -#endif -void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_DC_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_DC_PRED, bd); -#endif -} - -void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_V_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_V_PRED, bd); -#endif -} - -void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - 
highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_H_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_H_PRED, bd); -#endif -} - -void av1_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D45_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D45_PRED, bd); -#endif -} - -void av1_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D135_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D135_PRED, bd); -#endif -} - -void av1_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D117_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D117_PRED, bd); -#endif -} - -void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D153_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D153_PRED, bd); -#endif -} - -void av1_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, 
stride, tx_size, above, left, - FILTER_D207_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D207_PRED, bd); -#endif -} - -void av1_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_D63_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_D63_PRED, bd); -#endif -} +static int get_filt_type(const MACROBLOCKD *xd, int plane) { + int ab_sm, le_sm; -void av1_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint16_t *above, - const uint16_t *left, int bd) { -#if USE_3TAP_INTRA_FILTER - highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left, - FILTER_TM_PRED, bd); -#else - highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left, - FILTER_TM_PRED, bd); -#endif -} - -static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode, - uint16_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, - const uint16_t *above, - const uint16_t *left, int bd) { - switch (mode) { - case FILTER_DC_PRED: - av1_highbd_dc_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_V_PRED: - av1_highbd_v_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_H_PRED: - av1_highbd_h_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D45_PRED: - av1_highbd_d45_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D135_PRED: - av1_highbd_d135_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D117_PRED: - av1_highbd_d117_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D153_PRED: - av1_highbd_d153_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D207_PRED: 
- av1_highbd_d207_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_D63_PRED: - av1_highbd_d63_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - case FILTER_TM_PRED: - av1_highbd_tm_filter_predictor(dst, stride, tx_size, above, left, bd); - break; - default: assert(0); + if (plane == 0) { + const MB_MODE_INFO *ab = xd->above_mbmi; + const MB_MODE_INFO *le = xd->left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; + } else { + const MB_MODE_INFO *ab = xd->chroma_above_mbmi; + const MB_MODE_INFO *le = xd->chroma_left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; } + + return (ab_sm || le_sm) ? 1 : 0; } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_FILTER_INTRA -#if CONFIG_INTRA_EDGE -static int intra_edge_filter_strength(int bsz, int delta) { +static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { const int d = abs(delta); int strength = 0; - switch (bsz) { - case 4: - if (d < 56) { - strength = 0; - } else if (d < 90) { - strength = 1; - } - break; - case 8: - if (d < 8) { - strength = 0; - } else if (d < 32) { - strength = 1; - } else if (d < 90) { - strength = 3; - } - break; - case 16: - if (d < 4) { - strength = 0; - } else if (d < 16) { - strength = 1; - } else if (d < 90) { - strength = 3; - } - break; - case 32: - if (d < 16) { - strength = 2; - } else if (d < 90) { - strength = 3; - } - break; - default: strength = 0; break; + const int blk_wh = bs0 + bs1; + if (type == 0) { + if (blk_wh <= 8) { + if (d >= 56) strength = 1; + } else if (blk_wh <= 12) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 16) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 24) { + if (d >= 8) strength = 1; + if (d >= 16) strength = 2; + if (d >= 32) strength = 3; + } else if (blk_wh <= 32) { + if (d >= 1) strength = 1; + if (d >= 4) strength = 2; + if (d >= 32) strength = 3; + } else { + if (d >= 1) strength = 3; + } 
+ } else { + if (blk_wh <= 8) { + if (d >= 40) strength = 1; + if (d >= 64) strength = 2; + } else if (blk_wh <= 16) { + if (d >= 20) strength = 1; + if (d >= 48) strength = 2; + } else if (blk_wh <= 24) { + if (d >= 4) strength = 3; + } else { + if (d >= 1) strength = 3; + } } - return strength; } @@ -2229,7 +1015,7 @@ void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { uint8_t edge[129]; memcpy(edge, p, sz * sizeof(*p)); - for (int i = 1; i < sz - 1; i++) { + for (int i = 1; i < sz; i++) { int s = 0; for (int j = 0; j < INTRA_EDGE_TAPS; j++) { int k = i - 2 + j; @@ -2242,7 +1028,16 @@ void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { } } -#if CONFIG_HIGHBITDEPTH +static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { if (!strength) return; @@ -2253,7 +1048,7 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { uint16_t edge[129]; memcpy(edge, p, sz * sizeof(*p)); - for (int i = 1; i < sz - 1; i++) { + for (int i = 1; i < sz; i++) { int s = 0; for (int j = 0; j < INTRA_EDGE_TAPS; j++) { int k = i - 2 + j; @@ -2265,12 +1060,22 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { p[i] = s; } } -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_INTRA_EDGE_UPSAMPLE -static int use_intra_edge_upsample(int bsz, int delta) { +static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { const int d = abs(delta); - return (bsz == 4 && d > 0 && d < 
56); + const int blk_wh = bs0 + bs1; + if (d <= 0 || d >= 40) return 0; + return type ? (blk_wh <= 8) : (blk_wh <= 16); } void av1_upsample_intra_edge_c(uint8_t *p, int sz) { @@ -2296,7 +1101,6 @@ void av1_upsample_intra_edge_c(uint8_t *p, int sz) { } } -#if CONFIG_HIGHBITDEPTH void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { // interpolate half-sample positions assert(sz <= MAX_UPSAMPLE_SZ); @@ -2320,16 +1124,13 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { p[2 * i] = in[i + 2]; } } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - -#endif // CONFIG_INTRA_EDGE -#if CONFIG_HIGHBITDEPTH static void build_intra_predictors_high( const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, - int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, - int n_topright_px, int n_left_px, int n_bottomleft_px, int plane) { + int dst_stride, PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, + int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, + int n_bottomleft_px, int plane) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); @@ -2339,36 +1140,25 @@ static void build_intra_predictors_high( uint16_t *const left_col = left_data + 16; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; -#if !INTRA_USES_RECT_TRANSFORMS - assert(txwpx == txhpx); -#endif // !INTRA_USES_RECT_TRANSFORMS int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const uint16_t *above_ref = ref - ref_stride; -#if CONFIG_EXT_INTRA + const uint16_t *left_ref = ref - 1; int p_angle = 0; - const int is_dr_mode = av1_is_directional_mode(mode, xd->mi[0]->mbmi.sb_type); -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - const FILTER_INTRA_MODE_INFO *filter_intra_mode_info = - 
&xd->mi[0]->mbmi.filter_intra_mode_info; - const FILTER_INTRA_MODE filter_intra_mode = - filter_intra_mode_info->filter_intra_mode[plane != 0]; -#endif // CONFIG_FILTER_INTRA + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; int base = 128 << (xd->bd - 8); + // The default values if ref pixels are not available: // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 // base+1 A B .. Y Z // base+1 C D .. W X // base+1 E F .. U V // base+1 G H .. S T T T T T - aom_memset16(left_data, base + 1, sizeof(left_data) / sizeof(*left_data)); -#if CONFIG_EXT_INTRA if (is_dr_mode) { - p_angle = mode_to_angle_map[mode] + - xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP; + p_angle = mode_to_angle_map[mode] + angle_delta; if (p_angle <= 90) need_above = 1, need_left = 0, need_above_left = 1; else if (p_angle < 180) @@ -2376,29 +1166,20 @@ static void build_intra_predictors_high( else need_above = 0, need_left = 1, need_above_left = 1; } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_left = need_above = need_above_left = 1; -#endif // CONFIG_FILTER_INTRA + if (use_filter_intra) need_left = need_above = need_above_left = 1; - (void)plane; assert(n_top_px >= 0); assert(n_topright_px >= 0); assert(n_left_px >= 0); assert(n_bottomleft_px >= 0); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { -#if CONFIG_INTRA_EDGE int val; if (need_left) { val = (n_top_px > 0) ? above_ref[0] : base + 1; } else { - val = (n_left_px > 0) ? ref[-1] : base - 1; + val = (n_left_px > 0) ? left_ref[0] : base - 1; } -#else - const int val = need_left ? 
base + 1 : base - 1; -#endif // CONFIG_INTRA_EDGE for (i = 0; i < txhpx; ++i) { aom_memset16(dst, val, txwpx); dst += dst_stride; @@ -2408,56 +1189,34 @@ static void build_intra_predictors_high( // NEED_LEFT if (need_left) { -#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_bottom = 0; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA + if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; -#endif // CONFIG_EXT_INTRA -#else - const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); -#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); i = 0; if (n_left_px > 0) { - for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1]; + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (need_bottom && n_bottomleft_px > 0) { assert(i == txhpx); for (; i < txhpx + n_bottomleft_px; i++) - left_col[i] = ref[i * ref_stride - 1]; + left_col[i] = left_ref[i * ref_stride]; } if (i < num_left_pixels_needed) aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); } else { -#if CONFIG_INTRA_EDGE if (n_top_px > 0) { aom_memset16(left_col, above_ref[0], num_left_pixels_needed); } else { -#endif // CONFIG_INTRA_EDGE aom_memset16(left_col, base + 1, num_left_pixels_needed); -#if CONFIG_INTRA_EDGE } -#endif // CONFIG_INTRA_EDGE } } // NEED_ABOVE if (need_above) { -#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_right = 1; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA + if (use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; -#endif // CONFIG_EXT_INTRA -#else - const int need_right = !!(extend_modes[mode] & 
NEED_ABOVERIGHT); -#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); @@ -2472,92 +1231,75 @@ static void build_intra_predictors_high( aom_memset16(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); } else { -#if CONFIG_INTRA_EDGE if (n_left_px > 0) { - aom_memset16(above_row, ref[-1], num_top_pixels_needed); + aom_memset16(above_row, left_ref[0], num_top_pixels_needed); } else { -#endif // CONFIG_INTRA_EDGE aom_memset16(above_row, base - 1, num_top_pixels_needed); -#if CONFIG_INTRA_EDGE } -#endif // CONFIG_INTRA_EDGE } } if (need_above_left) { -#if CONFIG_INTRA_EDGE if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { - above_row[-1] = ref[-1]; + above_row[-1] = left_ref[0]; } else { above_row[-1] = base; } -#else - above_row[-1] = - n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1; -#endif // CONFIG_INTRA_EDGE left_col[-1] = above_row[-1]; } -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) { - highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size, - above_row, left_col, xd->bd); + if (use_filter_intra) { + highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode, xd->bd); return; } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA if (is_dr_mode) { -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter = INTRA_FILTER_LINEAR; - if (plane == 0 && av1_is_intra_filter_switchable(p_angle)) - filter = xd->mi[0]->mbmi.intra_filter; -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 
1 : 0; - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength(txwpx, p_angle - 90); - const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0); - av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + const int filt_type = get_filt_type(xd, plane); + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner_high(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = + intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, filt_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); + } } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength(txhpx, p_angle - 180); - const int n_px = - n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0); - av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); + upsample_above = + use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge_high(above_row, n_px, xd->bd); + } + upsample_left = + use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? 
txwpx : 0); + av1_upsample_intra_edge_high(left_col, n_px, xd->bd); } } -#if CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_upsample_intra_edge_high(above_row, n_px, xd->bd); - } - const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_upsample_intra_edge_high(left_col, n_px, xd->bd); - } -#endif // CONFIG_INTRA_EDGE_UPSAMPLE -#endif // CONFIG_INTRA_EDGE highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, -#if CONFIG_INTRA_INTERP - filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - p_angle, xd->bd); + upsample_above, upsample_left, p_angle, xd->bd); return; } -#endif // CONFIG_EXT_INTRA // predict if (mode == DC_PRED) { @@ -2567,52 +1309,41 @@ static void build_intra_predictors_high( pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); } } -#endif // CONFIG_HIGHBITDEPTH static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, - PREDICTION_MODE mode, TX_SIZE tx_size, + PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, int n_bottomleft_px, int plane) { int i; const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]); DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]); uint8_t *const above_row = above_data + 16; uint8_t *const left_col = left_data + 16; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; -#if !INTRA_USES_RECT_TRANSFORMS - assert(txwpx == txhpx); -#endif // 
!INTRA_USES_RECT_TRANSFORMS int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; -#if CONFIG_EXT_INTRA int p_angle = 0; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int is_dr_mode = av1_is_directional_mode(mode, mbmi->sb_type); -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - const FILTER_INTRA_MODE_INFO *filter_intra_mode_info = - &xd->mi[0]->mbmi.filter_intra_mode_info; - const FILTER_INTRA_MODE filter_intra_mode = - filter_intra_mode_info->filter_intra_mode[plane != 0]; -#endif // CONFIG_FILTER_INTRA + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + // The default values if ref pixels are not available: // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z // 129 C D .. W X // 129 E F .. U V // 129 G H .. S T T T T T // .. - memset(left_data, 129, sizeof(left_data)); -#if CONFIG_EXT_INTRA if (is_dr_mode) { - p_angle = mode_to_angle_map[mode] + - xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP; + p_angle = mode_to_angle_map[mode] + angle_delta; if (p_angle <= 90) need_above = 1, need_left = 0, need_above_left = 1; else if (p_angle < 180) @@ -2620,30 +1351,20 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, else need_above = 0, need_left = 1, need_above_left = 1; } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_left = need_above = need_above_left = 1; -#endif // CONFIG_FILTER_INTRA - - (void)xd; - (void)plane; + if (use_filter_intra) need_left = need_above = need_above_left = 1; + assert(n_top_px >= 0); assert(n_topright_px >= 0); assert(n_left_px >= 0); assert(n_bottomleft_px >= 0); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { -#if CONFIG_INTRA_EDGE int val; if (need_left) { val = (n_top_px > 0) ? 
above_ref[0] : 129; } else { - val = (n_left_px > 0) ? ref[-1] : 127; + val = (n_left_px > 0) ? left_ref[0] : 127; } -#else - const int val = need_left ? 129 : 127; -#endif // CONFIG_INTRA_EDGE for (i = 0; i < txhpx; ++i) { memset(dst, val, txwpx); dst += dst_stride; @@ -2653,56 +1374,34 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // NEED_LEFT if (need_left) { -#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_bottom = 0; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA + if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; -#endif // CONFIG_EXT_INTRA -#else - const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); -#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); i = 0; if (n_left_px > 0) { - for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1]; + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (need_bottom && n_bottomleft_px > 0) { assert(i == txhpx); for (; i < txhpx + n_bottomleft_px; i++) - left_col[i] = ref[i * ref_stride - 1]; + left_col[i] = left_ref[i * ref_stride]; } if (i < num_left_pixels_needed) memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); } else { -#if CONFIG_INTRA_EDGE if (n_top_px > 0) { memset(left_col, above_ref[0], num_left_pixels_needed); } else { -#endif // CONFIG_INTRA_EDGE memset(left_col, 129, num_left_pixels_needed); -#if CONFIG_INTRA_EDGE } -#endif // CONFIG_INTRA_EDGE } } // NEED_ABOVE if (need_above) { -#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) - need_right = 1; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA + if 
(use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; -#endif // CONFIG_EXT_INTRA -#else - const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); -#endif // CONFIG_EXT_INTRA || CONFIG_FITLER_INTRA const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px); @@ -2715,91 +1414,75 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, if (i < num_top_pixels_needed) memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); } else { -#if CONFIG_INTRA_EDGE if (n_left_px > 0) { - memset(above_row, ref[-1], num_top_pixels_needed); + memset(above_row, left_ref[0], num_top_pixels_needed); } else { -#endif // CONFIG_INTRA_EDGE memset(above_row, 127, num_top_pixels_needed); -#if CONFIG_INTRA_EDGE } -#endif // CONFIG_INTRA_EDGE } } if (need_above_left) { -#if CONFIG_INTRA_EDGE if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { - above_row[-1] = ref[-1]; + above_row[-1] = left_ref[0]; } else { above_row[-1] = 128; } -#else - above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? 
above_ref[-1] : 129) : 127; -#endif // CONFIG_INTRA_EDGE left_col[-1] = above_row[-1]; } -#if CONFIG_FILTER_INTRA - if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) { - filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size, - above_row, left_col); + if (use_filter_intra) { + av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode); return; } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA if (is_dr_mode) { -#if CONFIG_INTRA_INTERP - INTRA_FILTER filter = INTRA_FILTER_LINEAR; - if (plane == 0 && av1_is_intra_filter_switchable(p_angle)) - filter = xd->mi[0]->mbmi.intra_filter; -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 1 : 0; - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength(txwpx, p_angle - 90); - const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0); - av1_filter_intra_edge(above_row - ab_le, n_px, strength); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + const int filt_type = get_filt_type(xd, plane); + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = + intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, filt_type); + const int n_px = n_left_px + ab_le + (need_bottom ? 
txwpx : 0); + av1_filter_intra_edge(left_col - ab_le, n_px, strength); + } } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength(txhpx, p_angle - 180); - const int n_px = - n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0); - av1_filter_intra_edge(left_col - ab_le, n_px, strength); + upsample_above = + use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = + use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); } } -#if CONFIG_INTRA_EDGE_UPSAMPLE - const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_upsample_intra_edge(above_row, n_px); - } - const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? 
txwpx : 0); - av1_upsample_intra_edge(left_col, n_px); - } -#endif // CONFIG_INTRA_EDGE_UPSAMPLE -#endif // CONFIG_INTRA_EDGE - dr_predictor(dst, dst_stride, tx_size, above_row, left_col, -#if CONFIG_INTRA_INTERP - filter, -#endif // CONFIG_INTRA_INTERP -#if CONFIG_INTRA_EDGE_UPSAMPLE - upsample_above, upsample_left, -#endif // CONFIG_INTRA_EDGE_UPSAMPLE - p_angle); + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); return; } -#endif // CONFIG_EXT_INTRA // predict if (mode == DC_PRED) { @@ -2810,41 +1493,54 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, } } -static void predict_intra_block_helper(const AV1_COMMON *cm, - const MACROBLOCKD *xd, int wpx, int hpx, - TX_SIZE tx_size, PREDICTION_MODE mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, - int col_off, int row_off, int plane) { - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; +void av1_predict_intra_block( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int x = col_off << tx_size_wide_log2[0]; + const int y = row_off << tx_size_high_log2[0]; + + if (use_palette) { + int r, c; + const uint8_t *const map = xd->plane[plane != 0].color_index_map + + xd->color_index_map_offset[plane != 0]; + const uint16_t *const palette = + mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; + } + } + } else { 
+ for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst[r * dst_stride + c] = + (uint8_t)palette[map[(r + y) * wpx + c + x]]; + } + } + } + return; + } + + BLOCK_SIZE bsize = mbmi->sb_type; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int txw = tx_size_wide_unit[tx_size]; -#if CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8 + const int txh = tx_size_high_unit[tx_size]; const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available : xd->up_available); const int have_left = col_off || (pd->subsampling_x ? xd->chroma_left_available : xd->left_available); -#else - const int have_top = row_off || xd->up_available; - const int have_left = col_off || xd->left_available; -#endif - const int x = col_off << tx_size_wide_log2[0]; - const int y = row_off << tx_size_high_log2[0]; const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); - const int txwpx = tx_size_wide[tx_size]; - const int txhpx = tx_size_high[tx_size]; -#if !INTRA_USES_RECT_TRANSFORMS - assert(txwpx == txhpx); -#endif // !INTRA_USES_RECT_TRANSFORMS -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 && !CONFIG_CHROMA_SUB8X8 - const int xr_chr_offset = (pd->subsampling_x && bsize < BLOCK_8X8) ? 2 : 0; - const int yd_chr_offset = (pd->subsampling_y && bsize < BLOCK_8X8) ? 
2 : 0; -#else const int xr_chr_offset = 0; const int yd_chr_offset = 0; -#endif // Distance between the right edge of this prediction block to // the frame right edge @@ -2854,69 +1550,39 @@ static void predict_intra_block_helper(const AV1_COMMON *cm, // the frame bottom edge const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txhpx) - yd_chr_offset; - const int right_available = mi_col + ((col_off + txw) << pd->subsampling_x >> - (MI_SIZE_LOG2 - tx_size_wide_log2[0])) < - xd->tile.mi_col_end; - const int bottom_available = (yd > 0); -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition; -#endif + const int right_available = + mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end; + const int bottom_available = + (yd > 0) && + (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end); + + const PARTITION_TYPE partition = mbmi->partition; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 // force 4x4 chroma component block size. 
bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); -#endif - const int have_top_right = - has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - partition, -#endif // CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - tx_size, row_off, col_off, pd->subsampling_x); - const int have_bottom_left = - has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, - tx_size, row_off, col_off, pd->subsampling_y); - if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) { - const int stride = wpx; - int r, c; - const uint8_t *const map = xd->plane[plane != 0].color_index_map; - uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors + - plane * PALETTE_MAX_SIZE; + const int have_top_right = has_top_right( + cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, + row_off, col_off, pd->subsampling_x, pd->subsampling_y); + const int have_bottom_left = has_bottom_left( + cm, bsize, mi_row, mi_col, bottom_available, have_left, partition, + tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (r = 0; r < txhpx; ++r) { - for (c = 0; c < txwpx; ++c) { - dst16[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]]; - } - } - } else { -#endif // CONFIG_HIGHBITDEPTH - for (r = 0; r < txhpx; ++r) { - for (c = 0; c < txwpx; ++c) { - dst[r * dst_stride + c] = - (uint8_t)palette[map[(r + y) * stride + c + x]]; - } - } -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - return; - } - -#if CONFIG_HIGHBITDEPTH + const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_intra_predictors_high( - xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + xd, ref, ref_stride, dst, dst_stride, mode, 
angle_delta, + filter_intra_mode, tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, have_top_right ? AOMMIN(txwpx, xr) : 0, have_left ? AOMMIN(txhpx, yd + txhpx) : 0, have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); return; } -#endif - build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, + angle_delta, filter_intra_mode, tx_size, + disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, have_top_right ? AOMMIN(txwpx, xr) : 0, have_left ? AOMMIN(txhpx, yd + txhpx) : 0, @@ -2924,278 +1590,56 @@ static void predict_intra_block_helper(const AV1_COMMON *cm, } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, - int plane, int block_idx, int blk_col, - int blk_row, TX_SIZE tx_size) { - const MODE_INFO *mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + int plane, int blk_col, int blk_row, + TX_SIZE tx_size) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - const int block_raster_idx = - av1_block_index_to_raster_order(tx_size, block_idx); - const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) - ? get_y_mode(mi, block_raster_idx) - : get_uv_mode(mbmi->uv_mode); -#if CONFIG_CFL + const PREDICTION_MODE mode = + (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; + const FILTER_INTRA_MODE filter_intra_mode = + (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) + ? 
mbmi->filter_intra_mode_info.filter_intra_mode + : FILTER_INTRA_MODES; + const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { - if (plane == AOM_PLANE_U && blk_col == 0 && blk_row == 0) { - // Avoid computing the CfL parameters twice, if they have already been - // computed in cfl_rd_pick_alpha. - if (!xd->cfl->are_parameters_computed) - cfl_compute_parameters(xd, tx_size); +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + mbmi->sb_type, pd->subsampling_x, pd->subsampling_y); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(blk_col == 0); + assert(blk_row == 0); + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); } - cfl_predict_block(xd, dst, dst_stride, blk_row, blk_col, tx_size, plane); - return; - } #endif - - av1_predict_intra_block(cm, xd, pd->width, pd->height, - txsize_to_bsize[tx_size], mode, dst, dst_stride, dst, - dst_stride, blk_col, blk_row, plane); -} - -#if INTRA_USES_EXT_TRANSFORMS -// Copy the given row of dst into the equivalent row of ref, saving -// the overwritten data to tmp. Returns zero if no copy happened (so -// no restore is needed) -// -// Note that ref_row and dst_row follow the usual hibd convention -// where you convert to a uint16_t* with CONVERT_TO_SHORTPTR(). tmp -// does not follow that convention: it's a genuine pointer which is -// correctly aligned and sized for either 8 or 16 bit data. -// -// matching_strides is a boolean flag which should be nonzero if ref -// and dst have the same stride. 
-static int overwrite_ref_row(int matching_strides, int buf_flags, - int block_width, const uint8_t *dst_row, - uint8_t *ref_row, uint8_t *tmp_row) { - if (ref_row == dst_row && matching_strides) return 0; - - int row_bytes = block_width; - -#if CONFIG_HIGHBITDEPTH - if (buf_flags & YV12_FLAG_HIGHBITDEPTH) { - row_bytes *= 2; - ref_row = (uint8_t *)CONVERT_TO_SHORTPTR(ref_row); - dst_row = (const uint8_t *)CONVERT_TO_SHORTPTR(dst_row); - } -#else - (void)buf_flags; -#endif // CONFIG_HIGHBITDEPTH - - memcpy(tmp_row, ref_row, row_bytes); - memcpy(ref_row, dst_row, row_bytes); - return 1; -} - -static void restore_ref_row(int buf_flags, int block_width, - const uint8_t *tmp_row, uint8_t *ref_row) { - int row_bytes = block_width; -#if CONFIG_HIGHBITDEPTH - if (buf_flags & YV12_FLAG_HIGHBITDEPTH) { - row_bytes *= 2; - ref_row = (uint8_t *)CONVERT_TO_SHORTPTR(ref_row); - } -#else - (void)buf_flags; -#endif // CONFIG_HIGHBITDEPTH - - memcpy(ref_row, tmp_row, row_bytes); -} - -// The column equivalent of overwrite_ref_row. ref_row and dst_row -// point at the relevant column of the first row of the block. 
-static int overwrite_ref_col(int buf_flags, int block_height, - const uint8_t *dst_row, int dst_stride, - uint8_t *ref_row, int ref_stride, - uint8_t *tmp_row) { - if (ref_row == dst_row && ref_stride == dst_stride) return 0; - -#if CONFIG_HIGHBITDEPTH - if (buf_flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *tmp_16 = (uint16_t *)tmp_row; - uint16_t *ref_16 = CONVERT_TO_SHORTPTR(ref_row); - const uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst_row); - - for (int i = 0; i < block_height; ++i) { - tmp_16[i] = ref_16[i * ref_stride]; - ref_16[i * ref_stride] = dst_16[i * dst_stride]; - } - } else { -#endif // CONFIG_HIGHBITDEPTH - for (int i = 0; i < block_height; ++i) { - tmp_row[i] = ref_row[i * ref_stride]; - ref_row[i * ref_stride] = dst_row[i * dst_stride]; - } -#if CONFIG_HIGHBITDEPTH - } -#else - (void)buf_flags; -#endif // CONFIG_HIGHBITDEPTH - return 1; -} - -static void restore_ref_col(int buf_flags, int block_height, - const uint8_t *tmp_row, uint8_t *ref_row, - int ref_stride) { -#if CONFIG_HIGHBITDEPTH - if (buf_flags & YV12_FLAG_HIGHBITDEPTH) { - const uint16_t *tmp_16 = (const uint16_t *)tmp_row; - uint16_t *ref_16 = CONVERT_TO_SHORTPTR(ref_row); - - for (int i = 0; i < block_height; ++i) { - ref_16[i * ref_stride] = tmp_16[i]; - } - } else { -#endif // CONFIG_HIGHBITDEPTH - for (int i = 0; i < block_height; ++i) { - ref_row[i * ref_stride] = tmp_row[i]; + CFL_CTX *const cfl = &xd->cfl; + CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); + if (cfl->dc_pred_is_cached[pred_plane] == 0) { + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, + plane); + if (cfl->use_dc_pred_cache) { + cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); + cfl->dc_pred_is_cached[pred_plane] = 1; + } + } else { + cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); } -#if CONFIG_HIGHBITDEPTH - } -#else - (void)buf_flags; -#endif // 
CONFIG_HIGHBITDEPTH -} -#endif // #if INTRA_USES_EXT_TRANSFORMS - -void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int wpx, int hpx, BLOCK_SIZE bsize, - PREDICTION_MODE mode, const uint8_t *ref, - int ref_stride, uint8_t *dst, int dst_stride, - int col_off, int row_off, int plane) { - const int block_width = block_size_wide[bsize]; - const int block_height = block_size_high[bsize]; -#if INTRA_USES_RECT_TRANSFORMS - const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - assert(tx_size < TX_SIZES_ALL); -#else - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - assert(tx_size < TX_SIZES); -#endif // INTRA_USES_RECT_TRANSFORMS - - // Start by running the helper to predict either the entire block - // (if the block is square or the same size as tx_size) or the top - // or left of the block if it's tall and thin or short and wide. - predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, ref, ref_stride, - dst, dst_stride, col_off, row_off, plane); - -// If we're not using extended transforms, this function should -// always be called with a square block. -#if !INTRA_USES_EXT_TRANSFORMS - assert(block_width == block_height); -#endif // !INTRA_USES_EXT_TRANSFORMS - - // If the block is square, we're done. - if (block_width == block_height) return; - -#if INTRA_USES_EXT_TRANSFORMS -// If we're using rectangular transforms, we might be done even -// though the block isn't square. -#if INTRA_USES_RECT_TRANSFORMS - if (block_width == tx_size_wide[tx_size] && - block_height == tx_size_high[tx_size]) + cfl_predict_block(xd, dst, dst_stride, tx_size, plane); return; - - // A block should only fail to have a matching transform if it's - // large and rectangular (such large transform sizes aren't - // available). 
- assert(block_width >= 32 && block_height >= 32); -#endif // INTRA_USES_RECT_TRANSFORMS - - assert((block_width == wpx && block_height == hpx) || - (block_width == (wpx >> 1) && block_height == hpx) || - (block_width == wpx && block_height == (hpx >> 1))); - -// The tmp buffer needs to be big enough to hold MAX_SB_SIZE samples -// from the image. If CONFIG_HIGHBITDEPTH is enabled, it also needs -// to be big enough and correctly aligned to hold 16-bit entries. -#if CONFIG_HIGHBITDEPTH - uint16_t tmp_buf[MAX_SB_SIZE]; -#else - uint8_t tmp_buf[MAX_SB_SIZE]; -#endif // CONFIG_HIGHBITDEPTH - uint8_t *tmp = (uint8_t *)tmp_buf; - - if (block_width < block_height) { - // The block is tall and thin. We've already done the top part, - // and need to repeat the prediction down the rest of the block. - - const int tx_height = tx_size_high[tx_size]; - const int tx_height_off = tx_height >> tx_size_wide_log2[0]; - assert(tx_height_off << tx_size_wide_log2[0] == tx_height); - - int next_row_off = row_off + tx_height_off; - int next_row_idx = tx_height; - - while (next_row_idx < block_height) { - const int last_row_idx = next_row_idx - 1; - - // Cast away the const to make a mutable pointer to the last - // row of ref. This will be snapshotted and restored later. 
- uint8_t *last_ref_row = (uint8_t *)ref + last_row_idx * ref_stride; - uint8_t *last_dst_row = dst + last_row_idx * dst_stride; - - const int needs_restore = - overwrite_ref_row(ref_stride == dst_stride, xd->cur_buf->flags, - block_width, last_dst_row, last_ref_row, tmp); - - const uint8_t *next_ref_row = ref + next_row_idx * ref_stride; - uint8_t *next_dst_row = dst + next_row_idx * dst_stride; - - predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, next_ref_row, - ref_stride, next_dst_row, dst_stride, col_off, - next_row_off, plane); - - if (needs_restore) - restore_ref_row(xd->cur_buf->flags, block_width, tmp, last_ref_row); - - next_row_idx += tx_height; - next_row_off += tx_height_off; - } - } else { - // The block is short and wide. We've already done the left part, - // and need to repeat the prediction to the right. - - const int tx_width = tx_size_wide[tx_size]; - const int tx_width_off = tx_width >> tx_size_wide_log2[0]; - assert(tx_width_off << tx_size_wide_log2[0] == tx_width); - - int next_col_off = col_off + tx_width_off; - int next_col_idx = tx_width; - - while (next_col_idx < block_width) { - const int last_col_idx = next_col_idx - 1; - - // Cast away the const to make a mutable pointer to ref, - // starting at the last column written. This will be - // snapshotted and restored later. 
- uint8_t *last_ref_col = (uint8_t *)ref + last_col_idx; - uint8_t *last_dst_col = dst + last_col_idx; - - const int needs_restore = - overwrite_ref_col(xd->cur_buf->flags, block_height, last_dst_col, - dst_stride, last_ref_col, ref_stride, tmp); - - const uint8_t *next_ref_col = ref + next_col_idx; - uint8_t *next_dst_col = dst + next_col_idx; - - predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, next_ref_col, - ref_stride, next_dst_col, dst_stride, - next_col_off, row_off, plane); - - if (needs_restore) - restore_ref_col(xd->cur_buf->flags, block_height, tmp, last_ref_col, - ref_stride); - - next_col_idx += tx_width; - next_col_off += tx_width_off; - } } -#endif // INTRA_USES_EXT_TRANSFORMS + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, plane); } -void av1_init_intra_predictors(void) { - once(av1_init_intra_predictors_internal); -} +void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); } diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h index 42797e310..a7d9e8b79 100644 --- a/third_party/aom/av1/common/reconintra.h +++ b/third_party/aom/av1/common/reconintra.h @@ -22,15 +22,16 @@ extern "C" { void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, - int plane, int block_idx, int blk_col, - int blk_row, TX_SIZE tx_size); + int plane, int blk_col, int blk_row, + TX_SIZE tx_size); void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int bw, int bh, BLOCK_SIZE bsize, - PREDICTION_MODE mode, const uint8_t *ref, - int ref_stride, uint8_t *dst, int dst_stride, - int aoff, int loff, int plane); + int bw, int bh, TX_SIZE tx_size, + PREDICTION_MODE mode, int angle_delta, + int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int 
dst_stride, int aoff, int loff, int plane); -#if CONFIG_INTERINTRA // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { DC_PRED, V_PRED, H_PRED, SMOOTH_PRED @@ -41,44 +42,67 @@ static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = { II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED, II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED }; -#endif // CONFIG_INTERINTRA - -#if CONFIG_FILTER_INTRA -#define FILTER_INTRA_PREC_BITS 10 -#endif // CONFIG_FILTER_INTRA - -#define CONFIG_INTRA_EDGE_UPSAMPLE CONFIG_INTRA_EDGE -#define CONFIG_USE_ANGLE_DELTA_SUB8X8 0 - -#if CONFIG_EXT_INTRA -static INLINE int av1_is_directional_mode(PREDICTION_MODE mode, - BLOCK_SIZE bsize) { -#if CONFIG_INTRA_EDGE_UPSAMPLE - (void)bsize; - return mode >= V_PRED && mode <= D63_PRED; -#else - return mode >= V_PRED && mode <= D63_PRED && bsize >= BLOCK_8X8; -#endif + +#define FILTER_INTRA_SCALE_BITS 4 + +static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) { + return mode >= V_PRED && mode <= D67_PRED; } static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { - (void)bsize; -#if CONFIG_USE_ANGLE_DELTA_SUB8X8 - return 1; -#else return bsize >= BLOCK_8X8; -#endif } -#endif // CONFIG_EXT_INTRA -#if CONFIG_INTRABC -static INLINE int av1_allow_intrabc(BLOCK_SIZE bsize, - const AV1_COMMON *const cm) { - return (bsize >= BLOCK_8X8 || bsize == BLOCK_4X4) && - cm->allow_screen_content_tools; +static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { + return frame_is_intra_only(cm) && cm->allow_screen_content_tools && + cm->allow_intrabc; +} + +static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, + BLOCK_SIZE bs) { + if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0; + + return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; } -#endif // CONFIG_INTRABC +static INLINE int 
av1_filter_intra_allowed(const AV1_COMMON *const cm, + const MB_MODE_INFO *mbmi) { + return mbmi->mode == DC_PRED && + mbmi->palette_mode_info.palette_size[0] == 0 && + av1_filter_intra_allowed_bsize(cm, mbmi->sb_type); +} + +extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; + +// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. +// If angle > 0 && angle < 90, dx = -((int)(256 / t)); +// If angle > 90 && angle < 180, dx = (int)(256 / t); +// If angle > 180 && angle < 270, dx = 1; +static INLINE int av1_get_dx(int angle) { + if (angle > 0 && angle < 90) { + return dr_intra_derivative[angle]; + } else if (angle > 90 && angle < 180) { + return dr_intra_derivative[180 - angle]; + } else { + // In this case, we are not really going to use dx. We may return any value. + return 1; + } +} + +// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. +// If angle > 0 && angle < 90, dy = 1; +// If angle > 90 && angle < 180, dy = (int)(256 * t); +// If angle > 180 && angle < 270, dy = -((int)(256 * t)); +static INLINE int av1_get_dy(int angle) { + if (angle > 90 && angle < 180) { + return dr_intra_derivative[angle - 90]; + } else if (angle > 180 && angle < 270) { + return dr_intra_derivative[270 - angle]; + } else { + // In this case, we are not really going to use dy. We may return any value. 
+ return 1; + } +} #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index b0f303e35..17e6823b1 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -16,30 +16,18 @@ #include #include -#include "./aom_config.h" -#if CONFIG_HIGHBITDEPTH +#include "config/aom_config.h" + #include "aom_dsp/aom_dsp_common.h" -#endif // CONFIG_HIGHBITDEPTH #include "aom_ports/mem.h" #include "aom_scale/aom_scale.h" #include "av1/common/common.h" #include "av1/common/resize.h" -#include "./aom_scale_rtcd.h" - -#define FILTER_BITS 7 - -#define INTERP_TAPS 8 -#define SUBPEL_BITS_RS 6 -#define SUBPEL_MASK_RS ((1 << SUBPEL_BITS_RS) - 1) -#define INTERP_PRECISION_BITS 16 -#define SUBPEL_INTERP_EXTRA_BITS (INTERP_PRECISION_BITS - SUBPEL_BITS_RS) -#define SUBPEL_INTERP_EXTRA_OFF (1 << (SUBPEL_INTERP_EXTRA_BITS - 1)) - -typedef int16_t interp_kernel[INTERP_TAPS]; +#include "config/aom_scale_rtcd.h" // Filters for interpolation (0.5-band) - note this also filters integer pels. -static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS_RS)] = { +static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, @@ -75,7 +63,7 @@ static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS_RS)] = { }; // Filters for interpolation (0.625-band) - note this also filters integer pels. 
-static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS_RS)] = { +static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, @@ -111,7 +99,7 @@ static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS_RS)] = { }; // Filters for interpolation (0.75-band) - note this also filters integer pels. -static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS_RS)] = { +static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, @@ -147,7 +135,7 @@ static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS_RS)] = { }; // Filters for interpolation (0.875-band) - note this also filters integer pels. 
-static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS_RS)] = { +static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, @@ -183,7 +171,7 @@ static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS_RS)] = { }; // Filters for interpolation (full-band) - no filtering for integer pixels -static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS_RS)] = { +static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, @@ -218,153 +206,116 @@ static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS_RS)] = { { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, }; -#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION -#define INTERP_SIMPLE_TAPS 4 -static const int16_t filter_simple[(1 - << SUBPEL_BITS_RS)][INTERP_SIMPLE_TAPS] = { -#if INTERP_SIMPLE_TAPS == 2 - { 128, 0 }, { 126, 2 }, { 124, 4 }, { 122, 6 }, { 120, 8 }, { 118, 10 }, - { 116, 12 }, { 114, 14 }, { 112, 16 }, { 110, 18 }, { 108, 20 }, { 106, 22 }, - { 104, 24 }, { 102, 26 }, { 100, 28 }, { 98, 30 }, { 96, 32 }, { 94, 34 }, - { 92, 36 }, { 90, 38 }, { 88, 40 }, { 86, 42 }, { 84, 44 }, { 82, 46 }, - { 80, 48 }, { 78, 50 }, { 76, 52 }, { 74, 54 }, { 72, 56 }, { 70, 58 }, - { 68, 60 }, { 66, 62 }, { 64, 64 }, { 62, 66 }, { 60, 68 }, { 58, 70 }, - { 56, 72 }, { 54, 74 }, { 52, 76 }, { 50, 78 }, { 48, 80 }, { 46, 82 }, - { 44, 84 }, { 42, 86 }, { 40, 88 }, { 38, 90 }, { 36, 92 }, { 34, 94 }, - { 32, 96 }, { 30, 98 }, { 28, 100 }, { 26, 102 }, { 24, 104 }, { 22, 106 }, - { 20, 108 }, { 18, 110 }, { 16, 112 }, { 14, 
114 }, { 12, 116 }, { 10, 118 }, - { 8, 120 }, { 6, 122 }, { 4, 124 }, { 2, 126 }, -#elif INTERP_SIMPLE_TAPS == 4 - { 0, 128, 0, 0 }, { -1, 128, 2, -1 }, { -2, 127, 4, -1 }, - { -3, 126, 7, -2 }, { -4, 125, 9, -2 }, { -5, 125, 11, -3 }, - { -6, 124, 13, -3 }, { -7, 123, 16, -4 }, { -7, 122, 18, -5 }, - { -8, 121, 20, -5 }, { -9, 120, 23, -6 }, { -9, 118, 25, -6 }, - { -10, 117, 28, -7 }, { -11, 116, 30, -7 }, { -11, 114, 33, -8 }, - { -12, 113, 35, -8 }, { -12, 111, 38, -9 }, { -13, 109, 41, -9 }, - { -13, 108, 43, -10 }, { -13, 106, 45, -10 }, { -13, 104, 48, -11 }, - { -14, 102, 51, -11 }, { -14, 100, 53, -11 }, { -14, 98, 56, -12 }, - { -14, 96, 58, -12 }, { -14, 94, 61, -13 }, { -15, 92, 64, -13 }, - { -15, 90, 66, -13 }, { -15, 87, 69, -13 }, { -14, 85, 71, -14 }, - { -14, 83, 73, -14 }, { -14, 80, 76, -14 }, { -14, 78, 78, -14 }, - { -14, 76, 80, -14 }, { -14, 73, 83, -14 }, { -14, 71, 85, -14 }, - { -13, 69, 87, -15 }, { -13, 66, 90, -15 }, { -13, 64, 92, -15 }, - { -13, 61, 94, -14 }, { -12, 58, 96, -14 }, { -12, 56, 98, -14 }, - { -11, 53, 100, -14 }, { -11, 51, 102, -14 }, { -11, 48, 104, -13 }, - { -10, 45, 106, -13 }, { -10, 43, 108, -13 }, { -9, 41, 109, -13 }, - { -9, 38, 111, -12 }, { -8, 35, 113, -12 }, { -8, 33, 114, -11 }, - { -7, 30, 116, -11 }, { -7, 28, 117, -10 }, { -6, 25, 118, -9 }, - { -6, 23, 120, -9 }, { -5, 20, 121, -8 }, { -5, 18, 122, -7 }, - { -4, 16, 123, -7 }, { -3, 13, 124, -6 }, { -3, 11, 125, -5 }, - { -2, 9, 125, -4 }, { -2, 7, 126, -3 }, { -1, 4, 127, -2 }, - { -1, 2, 128, -1 }, -#elif INTERP_SIMPLE_TAPS == 6 - { 0, 0, 128, 0, 0, 0 }, { 0, -1, 128, 2, -1, 0 }, - { 1, -3, 127, 4, -2, 1 }, { 1, -4, 127, 6, -3, 1 }, - { 2, -6, 126, 8, -3, 1 }, { 2, -7, 125, 11, -4, 1 }, - { 2, -9, 125, 13, -5, 2 }, { 3, -10, 124, 15, -6, 2 }, - { 3, -11, 123, 18, -7, 2 }, { 3, -12, 122, 20, -8, 3 }, - { 4, -13, 121, 22, -9, 3 }, { 4, -14, 119, 25, -9, 3 }, - { 4, -15, 118, 27, -10, 4 }, { 4, -16, 117, 30, -11, 4 }, - { 5, -17, 116, 32, -12, 4 }, { 
5, -17, 114, 35, -13, 4 }, - { 5, -18, 112, 37, -13, 5 }, { 5, -19, 111, 40, -14, 5 }, - { 6, -19, 109, 42, -15, 5 }, { 6, -20, 107, 45, -15, 5 }, - { 6, -20, 105, 48, -16, 5 }, { 6, -21, 103, 51, -17, 6 }, - { 6, -21, 101, 53, -17, 6 }, { 6, -21, 99, 56, -18, 6 }, - { 7, -22, 97, 58, -18, 6 }, { 7, -22, 95, 61, -19, 6 }, - { 7, -22, 93, 63, -19, 6 }, { 7, -22, 91, 66, -20, 6 }, - { 7, -22, 88, 69, -20, 6 }, { 7, -22, 86, 71, -21, 7 }, - { 7, -22, 83, 74, -21, 7 }, { 7, -22, 81, 76, -21, 7 }, - { 7, -22, 79, 79, -22, 7 }, { 7, -21, 76, 81, -22, 7 }, - { 7, -21, 74, 83, -22, 7 }, { 7, -21, 71, 86, -22, 7 }, - { 6, -20, 69, 88, -22, 7 }, { 6, -20, 66, 91, -22, 7 }, - { 6, -19, 63, 93, -22, 7 }, { 6, -19, 61, 95, -22, 7 }, - { 6, -18, 58, 97, -22, 7 }, { 6, -18, 56, 99, -21, 6 }, - { 6, -17, 53, 101, -21, 6 }, { 6, -17, 51, 103, -21, 6 }, - { 5, -16, 48, 105, -20, 6 }, { 5, -15, 45, 107, -20, 6 }, - { 5, -15, 42, 109, -19, 6 }, { 5, -14, 40, 111, -19, 5 }, - { 5, -13, 37, 112, -18, 5 }, { 4, -13, 35, 114, -17, 5 }, - { 4, -12, 32, 116, -17, 5 }, { 4, -11, 30, 117, -16, 4 }, - { 4, -10, 27, 118, -15, 4 }, { 3, -9, 25, 119, -14, 4 }, - { 3, -9, 22, 121, -13, 4 }, { 3, -8, 20, 122, -12, 3 }, - { 2, -7, 18, 123, -11, 3 }, { 2, -6, 15, 124, -10, 3 }, - { 2, -5, 13, 125, -9, 2 }, { 1, -4, 11, 125, -7, 2 }, - { 1, -3, 8, 126, -6, 2 }, { 1, -3, 6, 127, -4, 1 }, - { 1, -2, 4, 127, -3, 1 }, { 0, -1, 2, 128, -1, 0 }, +const int16_t av1_resize_filter_normative[( + 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { +#if UPSCALE_NORMATIVE_TAPS == 8 + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, + { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, + { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, + { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, + { -1, 4, -14, 118, 28, 
-9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, + { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, + { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, + { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, + { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, + { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, + { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, + { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, + { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, + { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, + { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, + { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, + { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, + { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, + { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, + { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, + { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, + { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, + { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, + { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, + { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, + { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, + { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, + { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, #else -#error "Invalid value of INTERP_SIMPLE_TAPS" -#endif // INTERP_SIMPLE_TAPS == 2 +#error "Invalid 
value of UPSCALE_NORMATIVE_TAPS" +#endif // UPSCALE_NORMATIVE_TAPS == 8 }; -#endif // CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION // Filters for factor of 2 downsampling. static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; -static const interp_kernel *choose_interp_filter(int inlength, int outlength) { - int outlength16 = outlength * 16; - if (outlength16 >= inlength * 16) +static const InterpKernel *choose_interp_filter(int in_length, int out_length) { + int out_length16 = out_length * 16; + if (out_length16 >= in_length * 16) return filteredinterp_filters1000; - else if (outlength16 >= inlength * 13) + else if (out_length16 >= in_length * 13) return filteredinterp_filters875; - else if (outlength16 >= inlength * 11) + else if (out_length16 >= in_length * 11) return filteredinterp_filters750; - else if (outlength16 >= inlength * 9) + else if (out_length16 >= in_length * 9) return filteredinterp_filters625; else return filteredinterp_filters500; } -static void interpolate_core(const uint8_t *const input, int inlength, - uint8_t *output, int outlength, +static void interpolate_core(const uint8_t *const input, int in_length, + uint8_t *output, int out_length, const int16_t *interp_filters, int interp_taps) { const int32_t delta = - (((uint32_t)inlength << INTERP_PRECISION_BITS) + outlength / 2) / - outlength; + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; const int32_t offset = - inlength > outlength - ? (((int32_t)(inlength - outlength) << (INTERP_PRECISION_BITS - 1)) + - outlength / 2) / - outlength - : -(((int32_t)(outlength - inlength) << (INTERP_PRECISION_BITS - 1)) + - outlength / 2) / - outlength; + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; uint8_t *optr = output; int x, x1, x2, sum, k, int_pel, sub_pel; int32_t y; x = 0; - y = offset + SUBPEL_INTERP_EXTRA_OFF; - while ((y >> INTERP_PRECISION_BITS) < (interp_taps / 2 - 1)) { + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { x++; y += delta; } x1 = x; - x = outlength - 1; - y = delta * x + offset + SUBPEL_INTERP_EXTRA_OFF; - while ((y >> INTERP_PRECISION_BITS) + (int32_t)(interp_taps / 2) >= - inlength) { + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { x--; y -= delta; } x2 = x; if (x1 > x2) { - for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < outlength; + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) { const int pk = int_pel - interp_taps / 2 + 1 + k; - sum += filter[k] * input[AOMMAX(AOMMIN(pk, inlength - 1), 0)]; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; } *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } } else { // Initial part. 
- for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < x1; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) @@ -373,8 +324,8 @@ static void interpolate_core(const uint8_t *const input, int inlength, } // Middle part. for (; x <= x2; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) @@ -382,35 +333,42 @@ static void interpolate_core(const uint8_t *const input, int inlength, *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } // End part. 
- for (; x < outlength; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * - input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, inlength - 1)]; + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } } } -static void interpolate(const uint8_t *const input, int inlength, - uint8_t *output, int outlength) { - const interp_kernel *interp_filters = - choose_interp_filter(inlength, outlength); +static void interpolate(const uint8_t *const input, int in_length, + uint8_t *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], + SUBPEL_TAPS); +} - interpolate_core(input, inlength, output, outlength, &interp_filters[0][0], - INTERP_TAPS); +int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { + return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } -#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION -static void interpolate_simple(const uint8_t *const input, int inlength, - uint8_t *output, int outlength) { - interpolate_core(input, inlength, output, outlength, &filter_simple[0][0], - INTERP_SIMPLE_TAPS); +static int32_t get_upscale_convolve_x0(int in_length, int out_length, + int32_t x_step_qn) { + const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); + const int32_t x0 = + (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + + RS_SCALE_EXTRA_OFF - err / 2; + return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } -#endif // 
CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION #ifndef __clang_analyzer__ static void down2_symeven(const uint8_t *const input, int length, @@ -525,8 +483,7 @@ static void down2_symodd(const uint8_t *const input, int length, } static int get_down2_length(int length, int steps) { - int s; - for (s = 0; s < steps; ++s) length = (length + 1) >> 1; + for (int s = 0; s < steps; ++s) length = (length + 1) >> 1; return length; } @@ -536,6 +493,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. + break; + } } return steps; } @@ -624,97 +587,118 @@ Error: aom_free(arrbuf2); } -#if CONFIG_FRAME_SUPERRES -static void upscale_normative(const uint8_t *const input, int length, - uint8_t *output, int olength) { -#if CONFIG_LOOP_RESTORATION - interpolate_simple(input, length, output, olength); -#else - interpolate(input, length, output, olength); -#endif // CONFIG_LOOP_RESTORATION -} - -static void upscale_normative_plane(const uint8_t *const input, int height, - int width, int in_stride, uint8_t *output, - int height2, int width2, int out_stride) { - int i; - uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); - uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); - uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); - if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; +static void upscale_normative_rect(const uint8_t *const input, int height, + int width, int in_stride, uint8_t *output, + int height2, int width2, int out_stride, + int x_step_qn, int x0_qn, int pad_left, + int pad_right) { assert(width > 0); assert(height > 0); assert(width2 > 0); assert(height2 > 0); - for (i = 0; i < height; ++i) - 
upscale_normative(input + in_stride * i, width, intbuf + width2 * i, - width2); - for (i = 0; i < width2; ++i) { - fill_col_to_arr(intbuf + i, width2, height, arrbuf); - upscale_normative(arrbuf, height, arrbuf2, height2); - fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + uint8_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint8_t *tmp_right = NULL; + uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' + uint8_t *const in_tr = (uint8_t *)(input + width); + if (pad_left) { + tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); + memset(in_tl + i * in_stride, input[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); + memset(in_tr + i * in_stride, input[i * in_stride + width - 1], + border_cols); + } } -Error: - aom_free(intbuf); - aom_free(arrbuf); - aom_free(arrbuf2); + av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, + height2, &av1_resize_filter_normative[0][0], x0_qn, + x_step_qn); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); + } + aom_free(tmp_left); + } + if 
(pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); + } + aom_free(tmp_right); + } } -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_HIGHBITDEPTH -static void highbd_interpolate_core(const uint16_t *const input, int inlength, - uint16_t *output, int outlength, int bd, +static void highbd_interpolate_core(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd, const int16_t *interp_filters, int interp_taps) { const int32_t delta = - (((uint32_t)inlength << INTERP_PRECISION_BITS) + outlength / 2) / - outlength; + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; const int32_t offset = - inlength > outlength - ? (((int32_t)(inlength - outlength) << (INTERP_PRECISION_BITS - 1)) + - outlength / 2) / - outlength - : -(((int32_t)(outlength - inlength) << (INTERP_PRECISION_BITS - 1)) + - outlength / 2) / - outlength; + in_length > out_length + ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; uint16_t *optr = output; int x, x1, x2, sum, k, int_pel, sub_pel; int32_t y; x = 0; - y = offset + SUBPEL_INTERP_EXTRA_OFF; - while ((y >> INTERP_PRECISION_BITS) < (interp_taps / 2 - 1)) { + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { x++; y += delta; } x1 = x; - x = outlength - 1; - y = delta * x + offset + SUBPEL_INTERP_EXTRA_OFF; - while ((y >> INTERP_PRECISION_BITS) + (int32_t)(interp_taps / 2) >= - inlength) { + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { x--; y -= delta; } x2 = x; if (x1 > x2) { - for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < outlength; + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; ++x, y 
+= delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) { const int pk = int_pel - interp_taps / 2 + 1 + k; - sum += filter[k] * input[AOMMAX(AOMMIN(pk, inlength - 1), 0)]; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; } *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } else { // Initial part. - for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < x1; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) @@ -723,8 +707,8 @@ static void highbd_interpolate_core(const uint16_t *const input, int inlength, } // Middle part. for (; x <= x2; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) @@ -732,35 +716,27 @@ static void highbd_interpolate_core(const uint16_t *const input, int inlength, *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // End part. 
- for (; x < outlength; ++x, y += delta) { - int_pel = y >> INTERP_PRECISION_BITS; - sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS; + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * - input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, inlength - 1)]; + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } } -static void highbd_interpolate(const uint16_t *const input, int inlength, - uint16_t *output, int outlength, int bd) { - const interp_kernel *interp_filters = - choose_interp_filter(inlength, outlength); - - highbd_interpolate_core(input, inlength, output, outlength, bd, - &interp_filters[0][0], INTERP_TAPS); -} +static void highbd_interpolate(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); -#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION -static void highbd_interpolate_simple(const uint16_t *const input, int inlength, - uint16_t *output, int outlength, int bd) { - highbd_interpolate_core(input, inlength, output, outlength, bd, - &filter_simple[0][0], INTERP_SIMPLE_TAPS); + highbd_interpolate_core(input, in_length, output, out_length, bd, + &interp_filters[0][0], SUBPEL_TAPS); } -#endif // CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION #ifndef __clang_analyzer__ static void highbd_down2_symeven(const uint16_t *const input, int length, @@ -958,44 +934,68 @@ Error: aom_free(arrbuf2); } -#if CONFIG_FRAME_SUPERRES -static void highbd_upscale_normative(const uint16_t *const input, int length, - uint16_t *output, int olength, int bd) { -#if CONFIG_LOOP_RESTORATION - highbd_interpolate_simple(input, length, output, olength, bd); 
-#else - highbd_interpolate(input, length, output, olength, bd); -#endif // CONFIG_LOOP_RESTORATION -} - -static void highbd_upscale_normative_plane(const uint8_t *const input, - int height, int width, int in_stride, - uint8_t *output, int height2, - int width2, int out_stride, int bd) { - int i; - uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); - uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); - uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); - if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; - for (i = 0; i < height; ++i) { - highbd_upscale_normative(CONVERT_TO_SHORTPTR(input + in_stride * i), width, - intbuf + width2 * i, width2, bd); +static void highbd_upscale_normative_rect(const uint8_t *const input, + int height, int width, int in_stride, + uint8_t *output, int height2, + int width2, int out_stride, + int x_step_qn, int x0_qn, + int pad_left, int pad_right, int bd) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. 
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + const int border_size = border_cols * sizeof(uint16_t); + uint16_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint16_t *tmp_right = NULL; + uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); + uint16_t *const in_tl = input16 - border_cols; + uint16_t *const in_tr = input16 + width; + if (pad_left) { + tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); + aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); + } } - for (i = 0; i < width2; ++i) { - highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); - highbd_upscale_normative(arrbuf, height, arrbuf2, height2, bd); - highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, - arrbuf2); + if (pad_right) { + tmp_right = + (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); + aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], + border_cols); + } } -Error: - aom_free(intbuf); - aom_free(arrbuf); - aom_free(arrbuf2); -} -#endif // CONFIG_FRAME_SUPERRES + av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, + CONVERT_TO_SHORTPTR(output), out_stride, width2, + height2, &av1_resize_filter_normative[0][0], + x0_qn, x_step_qn, bd); -#endif // CONFIG_HIGHBITDEPTH + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); + } + aom_free(tmp_right); + } +} void av1_resize_frame420(const uint8_t *const y, int y_stride, 
const uint8_t *const u, const uint8_t *const v, @@ -1031,7 +1031,6 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride, resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride); } -#if CONFIG_HIGHBITDEPTH void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, @@ -1073,125 +1072,137 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride, bd); } -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_HIGHBITDEPTH -void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { -#else void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { -#endif // CONFIG_HIGHBITDEPTH + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes) { // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t - int i; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - const int src_widths[3] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int src_heights[3] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width, - dst->uv_crop_width }; - const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height, - dst->uv_crop_height }; - - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_HIGHBITDEPTH + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; if (src->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_resize_plane(srcs[i], src_heights[i], src_widths[i], - src_strides[i], dsts[i], dst_heights[i], - dst_widths[i], dst_strides[i], bd); + highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); else -#endif // CONFIG_HIGHBITDEPTH - resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], - dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); + resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); } - aom_extend_frame_borders(dst); + aom_extend_frame_borders(dst, num_planes); } -#if CONFIG_FRAME_SUPERRES -#if CONFIG_HIGHBITDEPTH -void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { -#else -void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { -#endif // CONFIG_HIGHBITDEPTH - // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t - int i; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - const int src_widths[3] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int src_heights[3] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width, - dst->uv_crop_width }; - const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height, - 
dst->uv_crop_height }; - - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_HIGHBITDEPTH - if (src->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_upscale_normative_plane(srcs[i], src_heights[i], src_widths[i], - src_strides[i], dsts[i], dst_heights[i], - dst_widths[i], dst_strides[i], bd); +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows) { + const int is_uv = (plane > 0); + const int ss_x = is_uv && cm->subsampling_x; + const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); + const int upscaled_plane_width = + ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + const int superres_denom = cm->superres_scale_denominator; + + TileInfo tile_col; + const int32_t x_step_qn = av1_get_upscale_convolve_step( + downscaled_plane_width, upscaled_plane_width); + int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, + upscaled_plane_width, x_step_qn); + + for (int j = 0; j < cm->tile_cols; j++) { + av1_tile_set_col(&tile_col, cm, j); + // Determine the limits of this tile column in both the source + // and destination images. + // Note: The actual location which we start sampling from is + // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases + // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. + const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); + const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); + const int src_width = downscaled_x1 - downscaled_x0; + + const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; + int upscaled_x1; + if (j == cm->tile_cols - 1) { + // Note that we can't just use AOMMIN here - due to rounding, + // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than + // upscaled_plane_width. 
+ upscaled_x1 = upscaled_plane_width; + } else { + upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; + } + + const uint8_t *const src_ptr = src + downscaled_x0; + uint8_t *const dst_ptr = dst + upscaled_x0; + const int dst_width = upscaled_x1 - upscaled_x0; + + const int pad_left = (j == 0); + const int pad_right = (j == cm->tile_cols - 1); + + if (cm->use_highbitdepth) + highbd_upscale_normative_rect( + src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, + dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth); else -#endif // CONFIG_HIGHBITDEPTH - upscale_normative_plane(srcs[i], src_heights[i], src_widths[i], - src_strides[i], dsts[i], dst_heights[i], - dst_widths[i], dst_strides[i]); + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, + rows, dst_width, dst_stride, x_step_qn, x0_qn, + pad_left, pad_right); + + // Update the fractional pixel offset to prepare for the next tile column. + x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); } - aom_extend_frame_borders(dst); } -#endif // CONFIG_FRAME_SUPERRES + +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + const int is_uv = (i > 0); + av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], i, + src->crop_heights[is_uv]); + } + + aom_extend_frame_borders(dst, num_planes); +} YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled) { + const int num_planes = av1_num_planes(cm); if (cm->width != unscaled->y_crop_width || cm->height != unscaled->y_crop_height) { -#if CONFIG_HIGHBITDEPTH - av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); -#else - av1_resize_and_extend_frame(unscaled, scaled); -#endif // CONFIG_HIGHBITDEPTH + 
av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + num_planes); return scaled; } else { return unscaled; } } -// Calculates scaled dimensions given original dimensions and the scale -// denominator. If 'scale_height' is 1, both width and height are scaled; -// otherwise, only the width is scaled. -static void calculate_scaled_size_helper(int *width, int *height, int denom, - int scale_height) { +// Calculates the scaled dimension given the original dimension and the scale +// denominator. +static void calculate_scaled_size_helper(int *dim, int denom) { if (denom != SCALE_NUMERATOR) { - *width = *width * SCALE_NUMERATOR / denom; - *width += *width & 1; // Make it even. - if (scale_height) { - *height = *height * SCALE_NUMERATOR / denom; - *height += *height & 1; // Make it even. - } + // Use this version if we need *dim to be even + // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); + // *width <<= 1; + *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); } } void av1_calculate_scaled_size(int *width, int *height, int resize_denom) { - calculate_scaled_size_helper(width, height, resize_denom, 1); + calculate_scaled_size_helper(width, resize_denom); + calculate_scaled_size_helper(height, resize_denom); } -#if CONFIG_FRAME_SUPERRES void av1_calculate_scaled_superres_size(int *width, int *height, int superres_denom) { - calculate_scaled_size_helper(width, height, superres_denom, - !CONFIG_HORZONLY_FRAME_SUPERRES); + (void)height; + calculate_scaled_size_helper(width, superres_denom); } void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) { @@ -1199,38 +1210,47 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) { // Note: av1_calculate_scaled_superres_size() rounds *up* after division // when the resulting dimensions are odd. So here, we round *down*. 
*width = *width * denom / SCALE_NUMERATOR; -#if CONFIG_HORZONLY_FRAME_SUPERRES (void)height; -#else - *height = *height * denom / SCALE_NUMERATOR; -#endif // CONFIG_HORZONLY_FRAME_SUPERRES } } +// Copy only the config data from 'src' to 'dst'. +static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const dst) { + dst->bit_depth = src->bit_depth; + dst->color_primaries = src->color_primaries; + dst->transfer_characteristics = src->transfer_characteristics; + dst->matrix_coefficients = src->matrix_coefficients; + dst->monochrome = src->monochrome; + dst->chroma_sample_position = src->chroma_sample_position; + dst->color_range = src->color_range; +} + // TODO(afergs): Look for in-place upscaling // TODO(afergs): aom_ vs av1_ functions? Which can I use? // Upscale decoded image. void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { - if (av1_superres_unscaled(cm)) return; + const int num_planes = av1_num_planes(cm); + if (!av1_superres_scaled(cm)) return; YV12_BUFFER_CONFIG copy_buffer; memset(©_buffer, 0, sizeof(copy_buffer)); YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm); - if (aom_alloc_frame_buffer(©_buffer, cm->width, cm->height, + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); + if (aom_alloc_frame_buffer(©_buffer, aligned_width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); - // Copy function assumes the frames are the same size, doesn't copy bit_depth. - aom_yv12_copy_frame(frame_to_show, ©_buffer); - copy_buffer.bit_depth = frame_to_show->bit_depth; - assert(copy_buffer.y_crop_width == cm->width); + // Copy function assumes the frames are the same size. 
+ // Note that it does not copy YV12_BUFFER_CONFIG config data. + aom_yv12_copy_frame(frame_to_show, ©_buffer, num_planes); + + assert(copy_buffer.y_crop_width == aligned_width); assert(copy_buffer.y_crop_height == cm->height); // Realloc the current frame buffer at a higher resolution in place. @@ -1248,48 +1268,43 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { &cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); - if (aom_realloc_frame_buffer( - frame_to_show, cm->superres_upscaled_width, - cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv)) + // aom_realloc_frame_buffer() leaves config data for frame_to_show intact + if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width, + cm->superres_upscaled_height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, fb, cb, cb_priv)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); } else { + // Make a copy of the config data for frame_to_show in copy_buffer + copy_buffer_config(frame_to_show, ©_buffer); + // Don't use callbacks on the encoder. 
+ // aom_alloc_frame_buffer() clears the config data for frame_to_show if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, cm->subsampling_x, - cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH + cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); + + // Restore config data back to frame_to_show + copy_buffer_config(©_buffer, frame_to_show); } // TODO(afergs): verify frame_to_show is correct after realloc // encoder: // decoder: - frame_to_show->bit_depth = copy_buffer.bit_depth; + assert(frame_to_show->y_crop_width == cm->superres_upscaled_width); assert(frame_to_show->y_crop_height == cm->superres_upscaled_height); // Scale up and back into frame_to_show. assert(frame_to_show->y_crop_width != cm->width); - assert(IMPLIES(!CONFIG_HORZONLY_FRAME_SUPERRES, - frame_to_show->y_crop_height != cm->height)); -#if CONFIG_HIGHBITDEPTH - av1_upscale_normative_and_extend_frame(©_buffer, frame_to_show, - (int)cm->bit_depth); -#else - av1_upscale_normative_and_extend_frame(©_buffer, frame_to_show); -#endif // CONFIG_HIGHBITDEPTH + av1_upscale_normative_and_extend_frame(cm, ©_buffer, frame_to_show); // Free the copy buffer aom_free_frame_buffer(©_buffer); } -#endif // CONFIG_FRAME_SUPERRES diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index 66b32c72d..feec3a90e 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -39,7 +39,6 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -#if CONFIG_HIGHBITDEPTH void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int 
bd); @@ -61,25 +60,16 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_HIGHBITDEPTH -void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd); -#else void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_FRAME_SUPERRES -#if CONFIG_HIGHBITDEPTH -void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd); -#else -void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes); + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows); +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_FRAME_SUPERRES YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, @@ -89,7 +79,6 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, // resize scale denominator. void av1_calculate_scaled_size(int *width, int *height, int resize_denom); -#if CONFIG_FRAME_SUPERRES // Similar to above, but calculates scaled dimensions after superres from the // given original dimensions and superres scale denominator. void av1_calculate_scaled_superres_size(int *width, int *height, @@ -102,11 +91,19 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool); -// Returns 1 if a superres upscaled frame is unscaled and 0 otherwise. 
-static INLINE int av1_superres_unscaled(const AV1_COMMON *cm) { - return (cm->superres_scale_denominator == SCALE_NUMERATOR); +// Returns 1 if a superres upscaled frame is scaled and 0 otherwise. +static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { + // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling + // required even though cm->superres_scale_denominator != SCALE_NUMERATOR. + // So, the following check is more accurate. + return !(cm->width == cm->superres_upscaled_width); } -#endif // CONFIG_FRAME_SUPERRES + +#define UPSCALE_NORMATIVE_TAPS 8 +extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS] + [UPSCALE_NORMATIVE_TAPS]; + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c index 00441f072..58a5275ca 100644 --- a/third_party/aom/av1/common/restoration.c +++ b/third_party/aom/av1/common/restoration.c @@ -12,100 +12,130 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_mem/aom_mem.h" #include "av1/common/onyxc_int.h" +#include "av1/common/resize.h" #include "av1/common/restoration.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +// The 's' values are calculated based on original 'r' and 'e' values in the +// spec using GenSgrprojVtable(). +// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). 
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { -#if USE_HIGHPASS_IN_SGRPROJ - // corner, edge, r2, eps2 - { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 }, - { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 }, - { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 }, - { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 } -#else -// r1, eps1, r2, eps2 -#if MAX_RADIUS == 2 - { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 }, - { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 }, - { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 }, - { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 }, -#else - { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 }, - { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 }, - { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 }, - { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 }, -#endif // MAX_RADIUS == 2 -#endif + { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, + { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, + { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, + { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } }, + { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } }, + { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } }, + { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } }, + { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } }, }; -typedef void (*restore_func_type)(uint8_t *data8, int width, int height, - int stride, RestorationInternal *rst, - uint8_t *dst8, int dst_stride); -#if CONFIG_HIGHBITDEPTH -typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height, - int stride, RestorationInternal *rst, - int bit_depth, uint8_t *dst8, - int dst_stride); -#endif // CONFIG_HIGHBITDEPTH - -int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info, - int width, int 
height) { - const int ntiles = av1_get_rest_ntiles( - width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL); - aom_free(rst_info->restoration_type); - CHECK_MEM_ERROR(cm, rst_info->restoration_type, - (RestorationType *)aom_malloc( - sizeof(*rst_info->restoration_type) * ntiles)); - aom_free(rst_info->wiener_info); - CHECK_MEM_ERROR( - cm, rst_info->wiener_info, - (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles)); - memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles); - aom_free(rst_info->sgrproj_info); - CHECK_MEM_ERROR( - cm, rst_info->sgrproj_info, - (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles)); - return ntiles; +AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) { + AV1PixelRect rect; + + int ss_x = is_uv && cm->subsampling_x; + int ss_y = is_uv && cm->subsampling_y; + + rect.top = 0; + rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y); + rect.left = 0; + rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + return rect; +} + +// Count horizontal or vertical units per tile (use a width or height for +// tile_size, respectively). We basically want to divide the tile size by the +// size of a restoration unit. Rather than rounding up unconditionally as you +// might expect, we round to nearest, which models the way a right or bottom +// restoration unit can extend to up to 150% its normal width or height. The +// max with 1 is to deal with tiles that are smaller than half of a restoration +// unit. +int av1_lr_count_units_in_tile(int unit_size, int tile_size) { + return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1); +} + +void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv) { + // We need to allocate enough space for restoration units to cover the + // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the + // top-left and we can use av1_get_tile_rect(). 
With CONFIG_MAX_TILE, we have + // to do the computation ourselves, iterating over the tiles and keeping + // track of the largest width and height, then upscaling. + const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); + const int max_tile_w = tile_rect.right - tile_rect.left; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + // To calculate hpertile and vpertile (horizontal and vertical units per + // tile), we basically want to divide the largest tile width or height by the + // size of a restoration unit. Rather than rounding up unconditionally as you + // might expect, we round to nearest, which models the way a right or bottom + // restoration unit can extend to up to 150% its normal width or height. The + // max with 1 is to deal with tiles that are smaller than half of a + // restoration unit. + const int unit_size = rsi->restoration_unit_size; + const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w); + const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h); + + rsi->units_per_tile = hpertile * vpertile; + rsi->horz_units_per_tile = hpertile; + rsi->vert_units_per_tile = vpertile; + + const int ntiles = 1; + const int nunits = ntiles * rsi->units_per_tile; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * nunits)); } void av1_free_restoration_struct(RestorationInfo *rst_info) { - aom_free(rst_info->restoration_type); - rst_info->restoration_type = NULL; - aom_free(rst_info->wiener_info); - rst_info->wiener_info = NULL; - aom_free(rst_info->sgrproj_info); - rst_info->sgrproj_info = NULL; + aom_free(rst_info->unit_info); + rst_info->unit_info = NULL; } -// TODO(debargha): This table can be substantially reduced since only a few -// values are actually used. 
-int sgrproj_mtable[MAX_EPS][MAX_NELEM]; +#if 0 +// Pair of values for each sgrproj parameter: +// Index 0 corresponds to r[0], e[0] +// Index 1 corresponds to r[1], e[1] +int sgrproj_mtable[SGRPROJ_PARAMS][2]; static void GenSgrprojVtable() { - int e, n; - for (e = 1; e <= MAX_EPS; ++e) - for (n = 1; n <= MAX_NELEM; ++n) { - const int n2e = n * n * e; - sgrproj_mtable[e - 1][n - 1] = - (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + for (int i = 0; i < SGRPROJ_PARAMS; ++i) { + const sgr_params_type *const params = &sgr_params[i]; + for (int j = 0; j < 2; ++j) { + const int e = params->e[j]; + const int r = params->r[j]; + if (r == 0) { // filter is disabled + sgrproj_mtable[i][j] = -1; // mark invalid + } else { // filter is enabled + const int n = (2 * r + 1) * (2 * r + 1); + const int n2e = n * n * e; + assert(n2e != 0); + sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + } } + } } +#endif -void av1_loop_restoration_precal() { GenSgrprojVtable(); } - -static void loop_restoration_init(RestorationInternal *rst, int kf) { - rst->keyframe = kf; +void av1_loop_restoration_precal() { +#if 0 + GenSgrprojVtable(); +#endif } -void extend_frame(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert) { +static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { uint8_t *data_p; int i; for (i = 0; i < height; ++i) { @@ -123,261 +153,297 @@ void extend_frame(uint8_t *data, int width, int height, int stride, } } -#if CONFIG_STRIPED_LOOP_RESTORATION - -// This function setup a processing stripe by replacing the vertical -// stripe boundary (2 lines above and 2 lines below) by data coming -// from the above/below buffers. Before doing so the original -// frame data is saved into a temporary buffer, such that it -// can be restored by the restore_processing_stripe_boundary -// function after the filtering of the processing stripe. 
-// Returns the height of the processing stripe -static int setup_processing_stripe_boundary(int y0, int v_end, int h_start, - int h_end, uint8_t *data, - int stride, - RestorationInternal *rst, - int use_highbd) { - int y, y_stripe_topmost, stripe_index, i; - int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y; - int stripe_height = rst->rsi->procunit_height; - int comp = rst->component; - uint8_t *boundary_above_buf = rst->stripe_boundary_above[comp]; - uint8_t *boundary_below_buf = rst->stripe_boundary_below[comp]; - int boundary_stride = rst->stripe_boundary_stride[comp]; - int x0 = h_start - RESTORATION_EXTRA_HORZ; - int x1 = h_end + RESTORATION_EXTRA_HORZ; - - stripe_index = (y0 + tile_offset) / stripe_height; - y_stripe_topmost = stripe_index * stripe_height - tile_offset; - boundary_above_buf += - ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ) - << use_highbd; - boundary_below_buf += - (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ) - << use_highbd; - - // setup the 2 lines above the stripe - for (i = 0; i < 2; i++) { - y = y_stripe_topmost - 2 + i; - if (y >= 0 && y < y0 && y >= y0 - 2) { - uint8_t *p = data + ((y * stride + x0) << use_highbd); - uint8_t *new_data = - boundary_above_buf + ((i * boundary_stride + x0) << use_highbd); - // printf("above %3d %3d: %08x %08x : %08x %08x\n", y, x0, - // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0], - // ((uint32_t*)new_data)[1]); - // Save old pixels - memcpy(rst->tmp_save_above[i], p, (x1 - x0) << use_highbd); - // Replace width pixels from boundary_above_buf - memcpy(p, new_data, (x1 - x0) << use_highbd); - } +static void extend_frame_highbd(uint16_t *data, int width, int height, + int stride, int border_horz, int border_vert) { + uint16_t *data_p; + int i, j; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = 
data_p[width - 1]; } - // setup the 2 lines below the stripe - for (i = 0; i < 2; i++) { - y = y_stripe_topmost + stripe_height + i; - if (y < v_end + 2) { - uint8_t *p = data + ((y * stride + x0) << use_highbd); - uint8_t *new_data = - boundary_below_buf + ((i * boundary_stride + x0) << use_highbd); - // printf("below %3d %3d: %08x %08x : %08x %08x\n", y, x0, - // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0], - // ((uint32_t*)new_data)[1]); - // Save old pixels - memcpy(rst->tmp_save_below[i], p, (x1 - x0) << use_highbd); - // Replace width pixels from boundary_below_buf - memcpy(p, new_data, (x1 - x0) << use_highbd); - } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, + (width + 2 * border_horz) * sizeof(uint16_t)); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + (width + 2 * border_horz) * sizeof(uint16_t)); } - // Return actual stripe height - return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0; } -// This function restores the boundary lines modified by -// setup_processing_stripe_boundary. 
-static void restore_processing_stripe_boundary(int y0, int v_end, int h_start, - int h_end, uint8_t *data, - int stride, - RestorationInternal *rst, - int use_highbd) { - int y, y_stripe_topmost, i, stripe_index; - int tile_offset = 8 >> rst->subsampling_y; - int stripe_height = rst->rsi->procunit_height; - int x0 = h_start - RESTORATION_EXTRA_HORZ; - int x1 = h_end + RESTORATION_EXTRA_HORZ; - - stripe_index = (y0 + tile_offset) / stripe_height; - y_stripe_topmost = stripe_index * stripe_height - tile_offset; - - // restore the 2 lines above the stripe - for (i = 0; i < 2; i++) { - y = y_stripe_topmost - 2 + i; - if (y >= 0 && y < y0 && y >= y0 - 2) { - uint8_t *p = data + ((y * stride + x0) << use_highbd); - memcpy(p, rst->tmp_save_above[i], (x1 - x0) << use_highbd); - } - } - // restore the 2 lines below the stripe - for (i = 0; i < 2; i++) { - y = y_stripe_topmost + stripe_height + i; - if (y < v_end + 2) { - uint8_t *p = data + ((y * stride + x0) << use_highbd); - memcpy(p, rst->tmp_save_below[i], (x1 - x0) << use_highbd); - } - } +void extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { + if (highbd) + extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, + border_horz, border_vert); + else + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); } -#endif +static void copy_tile_lowbd(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width); +} -static void loop_copy_tile(uint8_t *data, int tile_idx, int width, int height, - int stride, RestorationInternal *rst, uint8_t *dst, - int dst_stride) { - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - 
tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif - for (int i = limits.v_start; i < limits.v_end; ++i) - memcpy(dst + i * dst_stride + limits.h_start, - data + i * stride + limits.h_start, limits.h_end - limits.h_start); +static void copy_tile_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); } -static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert, - int boundary_dist, int istop) { - memcpy(vert, orig, sizeof(InterpKernel)); - switch (boundary_dist) { - case 0: - vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0]; - vert[2] = vert[1] = vert[0] = 0; - break; - case 1: - vert[2] += vert[1] + vert[0]; - vert[1] = vert[0] = 0; - break; - case 2: - vert[1] += vert[0]; - vert[0] = 0; - break; - default: break; - } - if (!istop) { - int tmp; - tmp = vert[0]; - vert[0] = vert[WIENER_WIN - 1]; - vert[WIENER_WIN - 1] = tmp; - tmp = vert[1]; - vert[1] = vert[WIENER_WIN - 2]; - vert[WIENER_WIN - 2] = tmp; - tmp = vert[2]; - vert[2] = vert[WIENER_WIN - 3]; - vert[WIENER_WIN - 3] = tmp; - } +static void copy_tile(int width, int height, const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int highbd) { + if (highbd) + copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride); + else + copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); } -static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width, - int height, int stride, - RestorationInternal *rst, uint8_t *dst, - int dst_stride) { - const int procunit_width = rst->rsi->procunit_width; -#if CONFIG_STRIPED_LOOP_RESTORATION - int procunit_height; -#else - const int procunit_height = rst->rsi->procunit_height; -#endif - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; - 
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride); - return; +#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) + +// With striped loop restoration, the filtering for each 64-pixel stripe gets +// most of its input from the output of CDEF (stored in data8), but we need to +// fill out a border of 3 pixels above/below the stripe according to the +// following +// rules: +// +// * At a frame boundary, we copy the outermost row of CDEF pixels three times. +// This extension is done by a call to extend_frame() at the start of the loop +// restoration process, so the value of copy_above/copy_below doesn't strictly +// matter. +// However, by setting *copy_above = *copy_below = 1 whenever loop filtering +// across tiles is disabled, we can allow +// {setup,restore}_processing_stripe_boundary to assume that the top/bottom +// data has always been copied, simplifying the behaviour at the left and +// right edges of tiles. +// +// * If we're at a tile boundary and loop filtering across tiles is enabled, +// then there is a logical stripe which is 64 pixels high, but which is split +// into an 8px high and a 56px high stripe so that the processing (and +// coefficient set usage) can be aligned to tiles. +// In this case, we use the 3 rows of CDEF output across the boundary for +// context; this corresponds to leaving the frame buffer as-is. +// +// * If we're at a tile boundary and loop filtering across tiles is disabled, +// then we take the outermost row of CDEF pixels *within the current tile* +// and copy it three times. Thus we behave exactly as if the tile were a full +// frame. +// +// * Otherwise, we're at a stripe boundary within a tile. In that case, we +// take 2 rows of deblocked pixels and extend them to 3 rows of context. 
+// +// The distinction between the latter two cases is handled by the +// av1_loop_restoration_save_boundary_lines() function, so here we just need +// to decide if we're overwriting the above/below boundary pixels or not. +static void get_stripe_boundary_info(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int ss_y, + int *copy_above, int *copy_below) { + *copy_above = 1; + *copy_below = 1; + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + const int first_stripe_in_tile = (limits->v_start == tile_rect->top); + const int this_stripe_height = + full_stripe_height - (first_stripe_in_tile ? runit_offset : 0); + const int last_stripe_in_tile = + (limits->v_start + this_stripe_height >= tile_rect->bottom); + + if (first_stripe_in_tile) *copy_above = 0; + if (last_stripe_in_tile) *copy_below = 0; +} + +// Overwrite the border pixels around a processing stripe so that the conditions +// listed above get_stripe_boundary_info() are preserved. +// We save the pixels which get overwritten into a temporary buffer, so that +// they can be restored by restore_processing_stripe_boundary() after we've +// processed the stripe. +// +// limits gives the rectangular limits of the remaining stripes for the current +// restoration unit. rsb is the stored stripe boundaries (taken from either +// deblock or CDEF output as necessary). +// +// tile_rect is the limits of the current tile and tile_stripe0 is the index of +// the first stripe in this tile (needed to convert the tile-relative stripe +// index we get from limits into something we can look up in rsb). +static void setup_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, + int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, + RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { + // Offsets within the line buffers. 
The buffer logically starts at column + // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) + // has column x0 in the buffer. + const int buf_stride = rsb->stripe_boundary_stride; + const int buf_x0_off = limits->h_start; + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + // Replace RESTORATION_BORDER pixels above the top of the stripe + // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above + // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by + // duplicating the topmost of the 2 lines (see the AOMMAX call when + // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). + // + // Special case: If we're at the top of a tile, which isn't on the topmost + // tile row, and we're allowed to loop filter across tiles, then we have a + // logical 64-pixel-high stripe which has been split into an 8-pixel high + // stripe and a 56-pixel high stripe (the current one). So, in this case, + // we want to leave the boundary alone! + if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *buf = + rsb->stripe_boundary_above + (buf_off << use_highbd); + uint8_t *dst8 = data8_tl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], + REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); + } + } + + // Replace RESTORATION_BORDER pixels below the bottom of the stripe. + // The second buffer row is repeated, so src_row gets the values 0, 1, 1 + // for i = 0, 1, 2. 
+ if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *src = + rsb->stripe_boundary_below + (buf_off << use_highbd); + + uint8_t *dst8 = data8_bl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), src, line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only save and overwrite i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, + data8_tl + (-RESTORATION_BORDER + 1) * data_stride), + line_size); + } + + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + // Only save and overwrite i=2 line. 
+ uint8_t *dst8 = data8_bl + 2 * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); + } } - InterpKernel vertical_topbot; - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif +} - // Convolve the whole tile (done in blocks here to match the requirements - // of the vectorized convolve functions, but the result is equivalent) - for (int i = limits.v_start; i < limits.v_end; i += procunit_height) { -#if CONFIG_STRIPED_LOOP_RESTORATION - int h = setup_processing_stripe_boundary( - i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0); - h = ALIGN_POWER_OF_TWO(h, 1); - procunit_height = h; -#else - int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15); -#endif - for (int j = limits.h_start; j < limits.h_end; j += procunit_width) { - int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15); - const uint8_t *data_p = data + i * stride + j; - uint8_t *dst_p = dst + i * dst_stride + j; - // Note h is at least 16 - for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) { - stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter, - vertical_topbot, WIENER_BORDER_VERT + b, 1); -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1); -#else - aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1); -#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride; - dst_p += dst_stride; +// This function restores the boundary lines 
modified by +// setup_processing_stripe_boundary. +// +// Note: We need to be careful when handling the corners of the processing +// unit, because (eg.) the top-left corner is considered to be part of +// both the left and top borders. This means that, depending on the +// loop_filter_across_tiles_enabled flag, the corner pixels might get +// overwritten twice, once as part of the "top" border and once as part +// of the "left" border (or similar for other corners). +// +// Everything works out fine as long as we make sure to reverse the order +// when restoring, ie. we need to restore the left/right borders followed +// by the top/bottom borders. +static void restore_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs, + int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above, + int copy_below, int opt) { + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + uint8_t *dst8 = data8_tl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), + rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size); } -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - rst->rsi->wiener_info[tile_idx].vfilter, 16, w, - h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); -#else - aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - rst->rsi->wiener_info[tile_idx].vfilter, 16, w, - h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); -#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); - dst_p += 
dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); - for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) { - stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter, - vertical_topbot, WIENER_BORDER_VERT + b, 0); -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1); -#else - aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1); -#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride; - dst_p += dst_stride; + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break; + + uint8_t *dst8 = data8_bl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only restore i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size); + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + // Only restore i=2 line. 
+ if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) { + uint8_t *dst8 = data8_bl + 2 * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size); } } -#if CONFIG_STRIPED_LOOP_RESTORATION - restore_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, data, stride, rst, 0); -#endif } } -static void loop_wiener_filter(uint8_t *data, int width, int height, int stride, - RestorationInternal *rst, uint8_t *dst, - int dst_stride) { - int tile_idx; - extend_frame(data, width, height, stride, WIENER_BORDER_HORZ, - WIENER_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst, - dst_stride); +static void wiener_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + (void)tmpbuf; + (void)bit_depth; + assert(bit_depth == 8); + const ConvolveParams conv_params = get_conv_params_wiener(8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src_p = src + j; + uint8_t *dst_p = dst + j; + av1_wiener_convolve_add_src( + src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params); } } @@ -391,6 +457,8 @@ static void loop_wiener_filter(uint8_t *data, int width, int height, int stride, static void boxsum1(int32_t *src, int width, int height, int src_stride, int sqr, int32_t *dst, int dst_stride) { int i, j, a, b, c; + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); // Vertical sum over 3-pixel regions, from src into dst. 
if (!sqr) { @@ -456,6 +524,8 @@ static void boxsum1(int32_t *src, int width, int height, int src_stride, static void boxsum2(int32_t *src, int width, int height, int src_stride, int sqr, int32_t *dst, int dst_stride) { int i, j, a, b, c, d, e; + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); // Vertical sum over 5-pixel regions, from src into dst. if (!sqr) { @@ -540,202 +610,33 @@ static void boxsum2(int32_t *src, int width, int height, int src_stride, } } -static void boxsum3(int32_t *src, int width, int height, int src_stride, - int sqr, int32_t *dst, int dst_stride) { - int i, j, a, b, c, d, e, f, g; - - // Vertical sum over 7-pixel regions, from src into dst. - if (!sqr) { - for (j = 0; j < width; ++j) { - a = src[j]; - b = src[1 * src_stride + j]; - c = src[2 * src_stride + j]; - d = src[3 * src_stride + j]; - e = src[4 * src_stride + j]; - f = src[5 * src_stride + j]; - g = src[6 * src_stride + j]; - - dst[j] = a + b + c + d; - dst[dst_stride + j] = a + b + c + d + e; - dst[2 * dst_stride + j] = a + b + c + d + e + f; - for (i = 3; i < height - 4; ++i) { - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - a = b; - b = c; - c = d; - d = e; - e = f; - f = g; - g = src[(i + 4) * src_stride + j]; - } - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g; - dst[(i + 2) * dst_stride + j] = c + d + e + f + g; - dst[(i + 3) * dst_stride + j] = d + e + f + g; - } - } else { - for (j = 0; j < width; ++j) { - a = src[j] * src[j]; - b = src[1 * src_stride + j] * src[1 * src_stride + j]; - c = src[2 * src_stride + j] * src[2 * src_stride + j]; - d = src[3 * src_stride + j] * src[3 * src_stride + j]; - e = src[4 * src_stride + j] * src[4 * src_stride + j]; - f = src[5 * src_stride + j] * src[5 * src_stride + j]; - g = src[6 * src_stride + j] * src[6 * src_stride + j]; - - dst[j] = a + b + c + d; - dst[dst_stride + j] = a + b + c + d + e; - dst[2 * dst_stride + j] = a + b + 
c + d + e + f; - for (i = 3; i < height - 4; ++i) { - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - a = b; - b = c; - c = d; - d = e; - e = f; - f = g; - g = src[(i + 4) * src_stride + j] * src[(i + 4) * src_stride + j]; - } - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g; - dst[(i + 2) * dst_stride + j] = c + d + e + f + g; - dst[(i + 3) * dst_stride + j] = d + e + f + g; - } - } - - // Horizontal sum over 7-pixel regions of dst - for (i = 0; i < height; ++i) { - a = dst[i * dst_stride]; - b = dst[i * dst_stride + 1]; - c = dst[i * dst_stride + 2]; - d = dst[i * dst_stride + 3]; - e = dst[i * dst_stride + 4]; - f = dst[i * dst_stride + 5]; - g = dst[i * dst_stride + 6]; - - dst[i * dst_stride] = a + b + c + d; - dst[i * dst_stride + 1] = a + b + c + d + e; - dst[i * dst_stride + 2] = a + b + c + d + e + f; - for (j = 3; j < width - 4; ++j) { - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - a = b; - b = c; - c = d; - d = e; - e = f; - f = g; - g = dst[i * dst_stride + (j + 4)]; - } - dst[i * dst_stride + j] = a + b + c + d + e + f + g; - dst[i * dst_stride + (j + 1)] = b + c + d + e + f + g; - dst[i * dst_stride + (j + 2)] = c + d + e + f + g; - dst[i * dst_stride + (j + 3)] = d + e + f + g; - } -} - -// Generic version for any r. To be removed after experiments are done. 
-static void boxsumr(int32_t *src, int width, int height, int src_stride, int r, - int sqr, int32_t *dst, int dst_stride) { - int32_t *tmp = aom_malloc(width * height * sizeof(*tmp)); - int tmp_stride = width; - int i, j; - if (sqr) { - for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j]; - for (j = 0; j < width; ++j) - for (i = 1; i < height; ++i) - tmp[i * tmp_stride + j] = - tmp[(i - 1) * tmp_stride + j] + - src[i * src_stride + j] * src[i * src_stride + j]; - } else { - memcpy(tmp, src, sizeof(*tmp) * width); - for (j = 0; j < width; ++j) - for (i = 1; i < height; ++i) - tmp[i * tmp_stride + j] = - tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j]; - } - for (i = 0; i <= r; ++i) - memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride], - sizeof(*tmp) * width); - for (i = r + 1; i < height - r; ++i) - for (j = 0; j < width; ++j) - dst[i * dst_stride + j] = - tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j]; - for (i = height - r; i < height; ++i) - for (j = 0; j < width; ++j) - dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] - - tmp[(i - r - 1) * tmp_stride + j]; - - for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride]; - for (i = 0; i < height; ++i) - for (j = 1; j < width; ++j) - tmp[i * tmp_stride + j] = - tmp[i * tmp_stride + j - 1] + dst[i * src_stride + j]; - - for (j = 0; j <= r; ++j) - for (i = 0; i < height; ++i) - dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r]; - for (j = r + 1; j < width - r; ++j) - for (i = 0; i < height; ++i) - dst[i * dst_stride + j] = - tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1]; - for (j = width - r; j < width; ++j) - for (i = 0; i < height; ++i) - dst[i * dst_stride + j] = - tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1]; - aom_free(tmp); -} - static void boxsum(int32_t *src, int width, int height, int src_stride, int r, int sqr, int32_t *dst, int dst_stride) { if (r == 1) boxsum1(src, width, height, src_stride, sqr, dst, 
dst_stride); else if (r == 2) boxsum2(src, width, height, src_stride, sqr, dst, dst_stride); - else if (r == 3) - boxsum3(src, width, height, src_stride, sqr, dst, dst_stride); else - boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride); + assert(0 && "Invalid value of r in self-guided filter"); } -static void boxnum(int width, int height, int r, int8_t *num, int num_stride) { - int i, j; - for (i = 0; i <= r; ++i) { - for (j = 0; j <= r; ++j) { - num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j); - num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j]; - num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j]; - num[(height - 1 - i) * num_stride + (width - 1 - j)] = - num[i * num_stride + j]; - } - } - for (j = 0; j <= r; ++j) { - const int val = (2 * r + 1) * (r + 1 + j); - for (i = r + 1; i < height - r; ++i) { - num[i * num_stride + j] = val; - num[i * num_stride + (width - 1 - j)] = val; - } - } - for (i = 0; i <= r; ++i) { - const int val = (2 * r + 1) * (r + 1 + i); - for (j = r + 1; j < width - r; ++j) { - num[i * num_stride + j] = val; - num[(height - 1 - i) * num_stride + j] = val; - } - } - for (i = r + 1; i < height - r; ++i) { - for (j = r + 1; j < width - r; ++j) { - num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1); - } +void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { + if (params->r[0] == 0) { + xq[0] = 0; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; + } else if (params->r[1] == 0) { + xq[0] = xqd[0]; + xq[1] = 0; + } else { + xq[0] = xqd[0]; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1]; } } -void decode_xq(int *xqd, int *xq) { - xq[0] = xqd[0]; - xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1]; -} - const int32_t x_by_xplus1[256] = { - 0, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, + // Special case: Map 0 -> 1 (corresponding to a value of 1/256) + // instead of 0. 
See comments in selfguided_restoration_internal() for why + 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, @@ -758,19 +659,15 @@ const int32_t x_by_xplus1[256] = { const int32_t one_by_x[MAX_NELEM] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, -#if MAX_RADIUS > 2 - 158, 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, - 105, 102, 100, 98, 95, 93, 91, 89, 87, 85, 84 -#endif // MAX_RADIUS > 2 }; -static void av1_selfguided_restoration_internal(int32_t *dgd, int width, - int height, int dgd_stride, - int32_t *dst, int dst_stride, - int bit_depth, int r, int eps) { +static void selfguided_restoration_fast_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - const int num_stride = width_ext; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. 
// We also align the stride to a multiple of 16 bytes, for consistency @@ -780,25 +677,24 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width, int32_t B_[RESTORATION_PROC_UNIT_PELS]; int32_t *A = A_; int32_t *B = B_; - int8_t num_[RESTORATION_PROC_UNIT_PELS]; - int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ; int i, j; - // Don't filter tiles with dimensions < 5 on any axis - if ((width < 5) || (height < 5)) return; + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); - boxnum(width_ext, height_ext, r, num_, num_stride); - assert(r <= 3); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; i += 2) { + for (j = -1; j < width + 1; ++j) { const int k = i * buf_stride + j; - const int n = num[i * num_stride + j]; + const int n = (2 * r + 1) * (2 * r + 1); // a < 2^16 * n < 2^22 regardless of bit depth uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); @@ -807,139 +703,192 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width, // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, // and p itself satisfies p < 2^14 * n^2 < 2^26. + // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. 
// This is an artefact of rounding, and can only happen if all pixels // are (almost) identical, so in this case we saturate to p=0. uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; - uint32_t s = sgrproj_mtable[eps - 1][n - 1]; + + const uint32_t s = params->s[radius_idx]; // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 // (this holds even after accounting for the rounding in s) const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); - A[k] = x_by_xplus1[AOMMIN(z, 255)]; // < 2^8 - - // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n, + // Note: We have to be quite careful about the value of A[k]. + // This is used as a blend factor between individual pixel values and the + // local mean. So it logically has a range of [0, 256], including both + // endpoints. + // + // This is a pain for hardware, as we'd like something which can be stored + // in exactly 8 bits. + // Further, in the calculation of B[k] below, if z == 0 and r == 2, + // then A[k] "should be" 0. But then we can end up setting B[k] to a value + // slightly above 2^(8 + bit depth), due to rounding in the value of + // one_by_x[25-1]. + // + // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. + // This fixes the above issues (256 - A[k] fits in a uint8, and we can't + // overflow), without significantly affecting the final result: z == 0 + // implies that the image is essentially "flat", so the local mean and + // individual pixel values are very similar. + // + // Note that saturating on the other side, ie. requring A[k] <= 255, + // would be a bad idea, as that corresponds to the case where the image + // is very variable, when we want to preserve the local pixel value as + // much as possible. 
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + + // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, // one_by_x[n - 1] = round(2^12 / n) // => the product here is < 2^(20 + bit_depth) <= 2^32, // and B[k] is set to a value < 2^(8 + bit depth) + // This holds even with the rounding in one_by_x and in the overall + // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * (uint32_t)one_by_x[n - 1], SGRPROJ_RECIP_BITS); } } - i = 0; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = - 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1]; - const int32_t b = - 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - i = 0; - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = - 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1]; - const int32_t b = - 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - i = height - 1; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = - 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1]; - const int32_t b = - 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - i = height - 1; - j = width - 1; - { - const int k = i * 
buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = - 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1]; - const int32_t b = - 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - i = 0; - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + - A[k + buf_stride - 1] + A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] + - B[k + buf_stride - 1] + B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - i = height - 1; - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + - A[k - buf_stride - 1] + A[k - buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] + - B[k - buf_stride - 1] + B[k - buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - j = 0; - for (i = 1; i < height - 1; ++i) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = 
ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + // Use the A[] and B[] arrays to calculate the filtered image + assert(r == 2); + for (i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } else { // odd row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 4; + const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; + const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } } - j = width - 1; - for (i = 1; i < height - 1; ++i) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); +} + +static void selfguided_restoration_internal(int32_t *dgd, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, + int 
radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; ++i) { + for (j = -1; j < width + 1; ++j) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. 
+ // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // + // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. + // This is an artefact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; + + const uint32_t s = params->s[radius_idx]; + + // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 + // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); + + // Note: We have to be quite careful about the value of A[k]. + // This is used as a blend factor between individual pixel values and the + // local mean. So it logically has a range of [0, 256], including both + // endpoints. + // + // This is a pain for hardware, as we'd like something which can be stored + // in exactly 8 bits. + // Further, in the calculation of B[k] below, if z == 0 and r == 2, + // then A[k] "should be" 0. But then we can end up setting B[k] to a value + // slightly above 2^(8 + bit depth), due to rounding in the value of + // one_by_x[25-1]. + // + // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. + // This fixes the above issues (256 - A[k] fits in a uint8, and we can't + // overflow), without significantly affecting the final result: z == 0 + // implies that the image is essentially "flat", so the local mean and + // individual pixel values are very similar. + // + // Note that saturating on the other side, ie. requring A[k] <= 255, + // would be a bad idea, as that corresponds to the case where the image + // is very variable, when we want to preserve the local pixel value as + // much as possible. 
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + + // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, + // one_by_x[n - 1] = round(2^12 / n) + // => the product here is < 2^(20 + bit_depth) <= 2^32, + // and B[k] is set to a value < 2^(8 + bit depth) + // This holds even with the rounding in one_by_x and in the overall + // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. + B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * + (uint32_t)B[k] * + (uint32_t)one_by_x[n - 1], + SGRPROJ_RECIP_BITS); + } } - for (i = 1; i < height - 1; ++i) { - for (j = 1; j < width - 1; ++j) { + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { const int k = i * buf_stride + j; const int l = i * dgd_stride + j; const int m = i * dst_stride + j; @@ -962,968 +911,697 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width, } } -void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, - int stride, int32_t *dst, int dst_stride, - int r, int eps) { +void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; - int i, j; - for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { - for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { - dgd32[i * dgd32_stride + j] = dgd[i * stride + j]; - } - } - av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst, - dst_stride, 8, r, eps); -} -void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, - int32_t *dst, int dst_stride, int corner, int edge) { - int i, j; - const int center = (1 << 
SGRPROJ_RST_BITS) - 4 * (corner + edge); - - i = 0; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + - corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); - } - i = 0; - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) + - corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]); - } - i = height - 1; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + - corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); - } - i = height - 1; - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + - corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); - } - i = 0; - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] + - dgd[k + 1]); - } - i = height - 1; - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] + - dgd[k + 1]); - } - j = 0; - for (i = 1; i < height - 1; ++i) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + - dgd[k - stride] + dgd[k + stride]); - } - j = 
width - 1; - for (i = 1; i < height - 1; ++i) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride] + dgd[k + stride]); - } - for (i = 1; i < height - 1; ++i) { - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride + 1] + dgd[k + stride + 1]); + if (highbd) { + const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; + } + } + } else { + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; + } } } + + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (params->r[0] > 0) + selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, + flt0, flt_stride, bit_depth, + sgr_params_idx, 0); + if (params->r[1] > 0) + selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); } -void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, - int stride, int eps, int *xqd, uint8_t *dst, - int dst_stride, int32_t *tmpbuf) { +void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, + int stride, int eps, const int *xqd, + uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth, + int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + + av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width, + eps, bit_depth, highbd); + const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; - int32_t *flt1 = tmpbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int i, j; - assert(width * height <= RESTORATION_TILEPELS_MAX); -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_c(dat, width, height, stride, flt1, width, - sgr_params[eps].corner, sgr_params[eps].edge); -#else - av1_selfguided_restoration_c(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_c(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2); - decode_xq(xqd, xq); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { + decode_xq(xqd, xq, params); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { const int k = i * width + j; - const int l = i * stride + j; - const int m = i * dst_stride + j; - const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[k] - u; - const int32_t f2 = 
(int32_t)flt2[k] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + uint8_t *dst8ij = dst8 + i * dst_stride + j; + const uint8_t *dat8ij = dat8 + i * stride + j; + + const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; + const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; + int32_t v = u << SGRPROJ_PRJ_BITS; + // If params->r == 0 then we skipped the filtering in + // av1_selfguided_restoration_c, i.e. flt[k] == u + if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); const int16_t w = (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - dst[m] = clip_pixel(w); - } - } -} -static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width, - int height, int stride, - RestorationInternal *rst, uint8_t *dst, - int dst_stride) { - const int procunit_width = rst->rsi->procunit_width; -#if CONFIG_STRIPED_LOOP_RESTORATION - int procunit_height; -#else - const int procunit_height = rst->rsi->procunit_height; -#endif - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; - if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride); - return; - } - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif - for (int i = limits.v_start; i < limits.v_end; i += procunit_height) { -#if CONFIG_STRIPED_LOOP_RESTORATION - int h = setup_processing_stripe_boundary( - i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0); - procunit_height = h; -#else - int h = AOMMIN(procunit_height, limits.v_end - i); -#endif - for (int j = limits.h_start; j < limits.h_end; j += procunit_width) { - int w = AOMMIN(procunit_width, limits.h_end - j); - uint8_t *data_p = data + 
i * stride + j; - uint8_t *dst_p = dst + i * dst_stride + j; - apply_selfguided_restoration( - data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep, - rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf); + const uint16_t out = clip_pixel_highbd(w, bit_depth); + if (highbd) + *CONVERT_TO_SHORTPTR(dst8ij) = out; + else + *dst8ij = (uint8_t)out; } -#if CONFIG_STRIPED_LOOP_RESTORATION - restore_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, data, stride, rst, 0); -#endif } } -static void loop_sgrproj_filter(uint8_t *data, int width, int height, - int stride, RestorationInternal *rst, - uint8_t *dst, int dst_stride) { - int tile_idx; - extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ, - SGRPROJ_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst, - dst_stride); +static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + (void)bit_depth; + assert(bit_depth == 8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + apply_selfguided_restoration(src + j, w, stripe_height, src_stride, + rui->sgrproj_info.ep, rui->sgrproj_info.xqd, + dst + j, dst_stride, tmpbuf, bit_depth, 0); } } -static void loop_switchable_filter(uint8_t *data, int width, int height, - int stride, RestorationInternal *rst, - uint8_t *dst, int dst_stride) { - int tile_idx; - extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ, - RESTORATION_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, - dst_stride); - } else if (rst->rsi->restoration_type[tile_idx] 
== RESTORE_WIENER) { - loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst, - dst_stride); - } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) { - loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst, - dst_stride); - } +static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth) { + (void)tmpbuf; + const ConvolveParams conv_params = get_conv_params_wiener(bit_depth); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src8_p = src8 + j; + uint8_t *dst8_p = dst8 + j; + av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, + rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, + stripe_height, &conv_params, bit_depth); } } -#if CONFIG_HIGHBITDEPTH -void extend_frame_highbd(uint16_t *data, int width, int height, int stride, - int border_horz, int border_vert) { - uint16_t *data_p; - int i, j; - for (i = 0; i < height; ++i) { - data_p = data + i * stride; - for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; - for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; - } - data_p = data - border_horz; - for (i = -border_vert; i < 0; ++i) { - memcpy(data_p + i * stride, data_p, - (width + 2 * border_horz) * sizeof(uint16_t)); - } - for (i = height; i < height + border_vert; ++i) { - memcpy(data_p + i * stride, data_p + (height - 1) * stride, - (width + 2 * border_horz) * sizeof(uint16_t)); +static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, + const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + for (int j = 0; j < stripe_width; j += procunit_width) { + 
int w = AOMMIN(procunit_width, stripe_width - j); + apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, + rui->sgrproj_info.ep, rui->sgrproj_info.xqd, + dst8 + j, dst_stride, tmpbuf, bit_depth, 1); } } -static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int width, - int height, int stride, - RestorationInternal *rst, uint16_t *dst, - int dst_stride) { - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif - for (int i = limits.v_start; i < limits.v_end; ++i) - memcpy(dst + i * dst_stride + limits.h_start, - data + i * stride + limits.h_start, - (limits.h_end - limits.h_start) * sizeof(*dst)); -} +typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth); -static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx, - int width, int height, int stride, - RestorationInternal *rst, - int bit_depth, uint16_t *dst, - int dst_stride) { - const int procunit_width = rst->rsi->procunit_width; -#if CONFIG_STRIPED_LOOP_RESTORATION - int procunit_height; -#else - const int procunit_height = rst->rsi->procunit_height; -#endif - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; +#define NUM_STRIPE_FILTERS 4 + +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, + sgrproj_filter_stripe_highbd +}; - if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst, - dst_stride); +// Filter one restoration unit 
+void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y, + int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int optimized_lr) { + RestorationType unit_rtype = rui->restoration_type; + + int unit_h = limits->v_end - limits->v_start; + int unit_w = limits->h_end - limits->h_start; + uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start; + uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; + + if (unit_rtype == RESTORE_NONE) { + copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd); return; } - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif - InterpKernel vertical_topbot; - - // Convolve the whole tile (done in blocks here to match the requirements - // of the vectorized convolve functions, but the result is equivalent) - for (int i = limits.v_start; i < limits.v_end; i += procunit_height) { -#if CONFIG_STRIPED_LOOP_RESTORATION - int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, (uint8_t *)data, - stride, rst, 1); - h = ALIGN_POWER_OF_TWO(h, 1); - procunit_height = h; -#else - int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15); -#endif - for (int j = limits.h_start; j < limits.h_end; j += procunit_width) { - int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15); - const uint16_t *data_p = data + i * stride + j; - uint16_t *dst_p = dst + i * dst_stride + j; - // Note h is at least 16 - for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) { - stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter, 
- vertical_topbot, WIENER_BORDER_VERT + b, 1); -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_highbd_convolve8_add_src_hip( - CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p), - dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1, bit_depth); -#else - aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride, - CONVERT_TO_BYTEPTR(dst_p), dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, - 16, vertical_topbot, 16, w, 1, bit_depth); -#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride; - dst_p += dst_stride; - } -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_highbd_convolve8_add_src_hip( - CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p), - dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16, - rst->rsi->wiener_info[tile_idx].vfilter, 16, w, - h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth); -#else - aom_highbd_convolve8_add_src( - CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p), - dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16, - rst->rsi->wiener_info[tile_idx].vfilter, 16, w, - h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth); -#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); - dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2); - for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) { - stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter, - vertical_topbot, WIENER_BORDER_VERT + b, 0); -#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION - aom_highbd_convolve8_add_src_hip( - CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p), - dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16, - vertical_topbot, 16, w, 1, bit_depth); -#else - aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride, - CONVERT_TO_BYTEPTR(dst_p), dst_stride, - rst->rsi->wiener_info[tile_idx].hfilter, - 16, vertical_topbot, 16, w, 1, 
bit_depth); -#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION - data_p += stride; - dst_p += dst_stride; - } - } -#if CONFIG_STRIPED_LOOP_RESTORATION - restore_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, (uint8_t *)data, stride, - rst, 1); -#endif + + const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ); + assert(filter_idx < NUM_STRIPE_FILTERS); + const stripe_filter_fun stripe_filter = stripe_filters[filter_idx]; + + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + + // Convolve the whole tile one stripe at a time + RestorationTileLimits remaining_stripes = *limits; + int i = 0; + while (i < unit_h) { + int copy_above, copy_below; + remaining_stripes.v_start = limits->v_start + i; + + get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above, + &copy_below); + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + // Work out where this stripe's boundaries are within + // rsb->stripe_boundary_{above,below} + const int tile_stripe = + (remaining_stripes.v_start - tile_rect->top + runit_offset) / + full_stripe_height; + const int frame_stripe = tile_stripe0 + tile_stripe; + const int rsb_row = RESTORATION_CTX_VERT * frame_stripe; + + // Calculate this stripe's height, based on two rules: + // * The topmost stripe in each tile is 8 luma pixels shorter than usual. + // * We can't extend past the end of the current restoration unit + const int nominal_stripe_height = + full_stripe_height - ((tile_stripe == 0) ? 
runit_offset : 0); + const int h = AOMMIN(nominal_stripe_height, + remaining_stripes.v_end - remaining_stripes.v_start); + + setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, + h, data8, stride, rlbs, copy_above, + copy_below, optimized_lr); + + stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, + dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth); + + restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, + data8, stride, copy_above, copy_below, + optimized_lr); + + i += h; } } -static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height, - int stride, RestorationInternal *rst, - int bit_depth, uint8_t *dst8, - int dst_stride) { - uint16_t *data = CONVERT_TO_SHORTPTR(data8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int tile_idx; - extend_frame_highbd(data, width, height, stride, WIENER_BORDER_HORZ, - WIENER_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst, - bit_depth, dst, dst_stride); - } +static void filter_frame_on_tile(int tile_row, int tile_col, void *priv, + AV1_COMMON *cm) { + (void)tile_col; + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + ctxt->tile_stripe0 = (tile_row == 0) ? 
0 : cm->rst_end_stripe[tile_row - 1]; } -void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, - int stride, int32_t *dst, - int dst_stride, int bit_depth, int r, - int eps) { - int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; - const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; - int32_t *dgd32 = - dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; - int i, j; - for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { - for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { - dgd32[i * dgd32_stride + j] = dgd[i * stride + j]; - } - } - av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst, - dst_stride, bit_depth, r, eps); +static void filter_frame_on_unit(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + const RestorationInfo *rsi = ctxt->rsi; + + av1_loop_restoration_filter_unit( + limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect, + ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth, + ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf, + rsi->optimized_lr); } -void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, - int stride, int32_t *dst, int dst_stride, - int corner, int edge) { - int i, j; - const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge); - - i = 0; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + - corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); - } - i = 0; - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) + - corner * (dgd[k + stride - 1] + dgd[k - 
1] + dgd[k + stride] + dgd[k]); - } - i = height - 1; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + - corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); - } - i = height - 1; - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + - corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); - } - i = 0; - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] + - dgd[k + 1]); - } - i = height - 1; - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] + - dgd[k + 1]); - } - j = 0; - for (i = 1; i < height - 1; ++i) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + - dgd[k - stride] + dgd[k + stride]); - } - j = width - 1; - for (i = 1; i < height - 1; ++i) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride] + dgd[k + stride]); - } - for (i = 1; i < height - 1; ++i) { - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 
1] + dgd[k + stride] + dgd[k + 1]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride + 1] + dgd[k + stride + 1]); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + int num_planes) { + const int bit_depth = cm->bit_depth; + const int highbd = cm->use_highbitdepth; + lr_ctxt->dst = &cm->rst_frame; + + const int frame_width = frame->crop_widths[0]; + const int frame_height = frame->crop_heights[0]; + if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL) < 0) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate restoration dst buffer"); + + lr_ctxt->on_rest_unit = filter_frame_on_unit; + lr_ctxt->frame = frame; + for (int plane = 0; plane < num_planes; ++plane) { + RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationType rtype = rsi->frame_restoration_type; + rsi->optimized_lr = optimized_lr; + + if (rtype == RESTORE_NONE) { + continue; } + + const int is_uv = plane > 0; + const int plane_width = frame->crop_widths[is_uv]; + const int plane_height = frame->crop_heights[is_uv]; + FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; + + extend_frame(frame->buffers[plane], plane_width, plane_height, + frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, + highbd); + + lr_plane_ctxt->rsi = rsi; + lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y; + lr_plane_ctxt->highbd = highbd; + lr_plane_ctxt->bit_depth = bit_depth; + lr_plane_ctxt->data8 = frame->buffers[plane]; + lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; + lr_plane_ctxt->data_stride = frame->strides[is_uv]; + lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; + lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); + filter_frame_on_tile(LR_TILE_ROW, 
LR_TILE_COL, lr_plane_ctxt, cm); } } -void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, - int stride, int bit_depth, int eps, - int *xqd, uint16_t *dst, - int dst_stride, int32_t *tmpbuf) { - int xq[2]; - int32_t *flt1 = tmpbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int i, j; - assert(width * height <= RESTORATION_TILEPELS_MAX); -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width, - sgr_params[eps].corner, sgr_params[eps].edge); -#else - av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width, - bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width, - bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2); - decode_xq(xqd, xq); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int k = i * width + j; - const int l = i * stride + j; - const int m = i * dst_stride + j; - const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[k] - u; - const int32_t f2 = (int32_t)flt2[k] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); - const int16_t w = - (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth); - } +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + AV1_COMMON *cm, int num_planes) { + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { + aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v + }; + + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect; + copy_funs[plane](loop_rest_ctxt->dst, 
loop_rest_ctxt->frame, tile_rect.left, + tile_rect.right, tile_rect.top, tile_rect.bottom); } } -static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx, - int width, int height, int stride, - RestorationInternal *rst, - int bit_depth, uint16_t *dst, - int dst_stride) { - const int procunit_width = rst->rsi->procunit_width; -#if CONFIG_STRIPED_LOOP_RESTORATION - int procunit_height; -#else - const int procunit_height = rst->rsi->procunit_height; -#endif - const int tile_width = rst->tile_width; - const int tile_height = rst->tile_height; +static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, + int num_planes) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; - if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst, - dst_stride); - return; - } - RestorationTileLimits limits = - av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - tile_height, width, height, rst->subsampling_y); -#else - tile_height, width, height); -#endif - for (int i = limits.v_start; i < limits.v_end; i += procunit_height) { -#if CONFIG_STRIPED_LOOP_RESTORATION - int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, (uint8_t *)data, - stride, rst, 1); - procunit_height = h; -#else - int h = AOMMIN(procunit_height, limits.v_end - i); -#endif - for (int j = limits.h_start; j < limits.h_end; j += procunit_width) { - int w = AOMMIN(procunit_width, limits.h_end - j); - uint16_t *data_p = data + i * stride + j; - uint16_t *dst_p = dst + i * dst_stride + j; - apply_selfguided_restoration_highbd( - data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep, - rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf); + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) { + continue; } -#if 
CONFIG_STRIPED_LOOP_RESTORATION - restore_processing_stripe_boundary(i, limits.v_end, limits.h_start, - limits.h_end, (uint8_t *)data, stride, - rst, 1); -#endif - } -} -static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height, - int stride, RestorationInternal *rst, - int bit_depth, uint8_t *dst8, - int dst_stride) { - int tile_idx; - uint16_t *data = CONVERT_TO_SHORTPTR(data8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - extend_frame_highbd(data, width, height, stride, SGRPROJ_BORDER_HORZ, - SGRPROJ_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst, - bit_depth, dst, dst_stride); + av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, + &ctxt[plane], &ctxt[plane].tile_rect, + cm->rst_tmpbuf, cm->rlbs); } } -static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height, - int stride, RestorationInternal *rst, - int bit_depth, uint8_t *dst8, - int dst_stride) { - uint16_t *data = CONVERT_TO_SHORTPTR(data8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int tile_idx; - extend_frame_highbd(data, width, height, stride, RESTORATION_BORDER_HORZ, - RESTORATION_BORDER_VERT); - for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { - if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { - loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst, - dst_stride); - } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) { - loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst, - bit_depth, dst, dst_stride); - } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) { - loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, - rst, bit_depth, dst, dst_stride); - } - } +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + void *lr_ctxt) { + assert(!cm->all_lossless); + const int num_planes = 
av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes); + + av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes); } -#endif // CONFIG_HIGHBITDEPTH - -static void loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - int start_mi_row, int end_mi_row, - int components_pattern, RestorationInfo *rsi, - YV12_BUFFER_CONFIG *dst) { - const int ywidth = frame->y_crop_width; - const int yheight = frame->y_crop_height; - const int uvwidth = frame->uv_crop_width; - const int uvheight = frame->uv_crop_height; - const int ystride = frame->y_stride; - const int uvstride = frame->uv_stride; - const int ystart = start_mi_row << MI_SIZE_LOG2; - const int uvstart = ystart >> cm->subsampling_y; - int yend = end_mi_row << MI_SIZE_LOG2; - int uvend = yend >> cm->subsampling_y; - restore_func_type restore_funcs[RESTORE_TYPES] = { - NULL, loop_wiener_filter, loop_sgrproj_filter, loop_switchable_filter - }; -#if CONFIG_HIGHBITDEPTH - restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = { - NULL, loop_wiener_filter_highbd, loop_sgrproj_filter_highbd, - loop_switchable_filter_highbd - }; -#endif // CONFIG_HIGHBITDEPTH - restore_func_type restore_func; -#if CONFIG_HIGHBITDEPTH - restore_func_highbd_type restore_func_highbd; -#endif // CONFIG_HIGHBITDEPTH - YV12_BUFFER_CONFIG dst_; - - yend = AOMMIN(yend, yheight); - uvend = AOMMIN(uvend, uvheight); - if (components_pattern == (1 << AOM_PLANE_Y)) { - // Only y - if (rsi[0].frame_restoration_type == RESTORE_NONE) { - if (dst) aom_yv12_copy_y(frame, dst); - return; - } - } else if (components_pattern == (1 << AOM_PLANE_U)) { - // Only U - if (rsi[1].frame_restoration_type == RESTORE_NONE) { - if (dst) aom_yv12_copy_u(frame, dst); - return; - } - } else if (components_pattern == (1 << AOM_PLANE_V)) { - // Only V - if 
(rsi[2].frame_restoration_type == RESTORE_NONE) { - if (dst) aom_yv12_copy_v(frame, dst); - return; - } - } else if (components_pattern == - ((1 << AOM_PLANE_Y) | (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V))) { - // All components - if (rsi[0].frame_restoration_type == RESTORE_NONE && - rsi[1].frame_restoration_type == RESTORE_NONE && - rsi[2].frame_restoration_type == RESTORE_NONE) { - if (dst) aom_yv12_copy_frame(frame, dst); - return; - } - } - if (!dst) { - dst = &dst_; - memset(dst, 0, sizeof(YV12_BUFFER_CONFIG)); - if (aom_realloc_frame_buffer( - dst, ywidth, yheight, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate restoration dst buffer"); - } +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, const AV1PixelRect *tile_rect, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync) { + const int tile_w = tile_rect->right - tile_rect->left; + const int ext_size = unit_size * 3 / 2; + int x0 = 0, j = 0; + while (x0 < tile_w) { + int remaining_w = tile_w - x0; + int w = (remaining_w < ext_size) ? 
remaining_w : unit_size; - if ((components_pattern >> AOM_PLANE_Y) & 1) { - if (rsi[0].frame_restoration_type != RESTORE_NONE) { - cm->rst_internal.ntiles = av1_get_rest_ntiles( - ywidth, yheight, cm->rst_info[AOM_PLANE_Y].restoration_tilesize, - &cm->rst_internal.tile_width, &cm->rst_internal.tile_height, - &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles); - cm->rst_internal.rsi = &rsi[0]; -#if CONFIG_STRIPED_LOOP_RESTORATION - cm->rst_internal.component = AOM_PLANE_Y; - cm->rst_internal.subsampling_y = 0; -#endif - restore_func = - restore_funcs[cm->rst_internal.rsi->frame_restoration_type]; -#if CONFIG_HIGHBITDEPTH - restore_func_highbd = - restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type]; - if (cm->use_highbitdepth) - restore_func_highbd( - frame->y_buffer + ystart * ystride, ywidth, yend - ystart, ystride, - &cm->rst_internal, cm->bit_depth, - dst->y_buffer + ystart * dst->y_stride, dst->y_stride); - else -#endif // CONFIG_HIGHBITDEPTH - restore_func(frame->y_buffer + ystart * ystride, ywidth, yend - ystart, - ystride, &cm->rst_internal, - dst->y_buffer + ystart * dst->y_stride, dst->y_stride); - } else { - aom_yv12_copy_y(frame, dst); - } - } + limits->h_start = tile_rect->left + x0; + limits->h_end = tile_rect->left + x0 + w; + assert(limits->h_end <= tile_rect->right); - if ((components_pattern >> AOM_PLANE_U) & 1) { - if (rsi[AOM_PLANE_U].frame_restoration_type != RESTORE_NONE) { - cm->rst_internal.ntiles = av1_get_rest_ntiles( - uvwidth, uvheight, cm->rst_info[AOM_PLANE_U].restoration_tilesize, - &cm->rst_internal.tile_width, &cm->rst_internal.tile_height, - &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles); - cm->rst_internal.rsi = &rsi[AOM_PLANE_U]; -#if CONFIG_STRIPED_LOOP_RESTORATION - cm->rst_internal.component = AOM_PLANE_U; - cm->rst_internal.subsampling_y = cm->subsampling_y; -#endif - restore_func = - restore_funcs[cm->rst_internal.rsi->frame_restoration_type]; -#if CONFIG_HIGHBITDEPTH - restore_func_highbd = - 
restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type]; - if (cm->use_highbitdepth) - restore_func_highbd( - frame->u_buffer + uvstart * uvstride, uvwidth, uvend - uvstart, - uvstride, &cm->rst_internal, cm->bit_depth, - dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride); - else -#endif // CONFIG_HIGHBITDEPTH - restore_func(frame->u_buffer + uvstart * uvstride, uvwidth, - uvend - uvstart, uvstride, &cm->rst_internal, - dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride); - } else { - aom_yv12_copy_u(frame, dst); - } - } + const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j; - if ((components_pattern >> AOM_PLANE_V) & 1) { - if (rsi[AOM_PLANE_V].frame_restoration_type != RESTORE_NONE) { - cm->rst_internal.ntiles = av1_get_rest_ntiles( - uvwidth, uvheight, cm->rst_info[AOM_PLANE_V].restoration_tilesize, - &cm->rst_internal.tile_width, &cm->rst_internal.tile_height, - &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles); - cm->rst_internal.rsi = &rsi[AOM_PLANE_V]; -#if CONFIG_STRIPED_LOOP_RESTORATION - cm->rst_internal.component = AOM_PLANE_V; - cm->rst_internal.subsampling_y = cm->subsampling_y; -#endif - restore_func = - restore_funcs[cm->rst_internal.rsi->frame_restoration_type]; -#if CONFIG_HIGHBITDEPTH - restore_func_highbd = - restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type]; - if (cm->use_highbitdepth) - restore_func_highbd( - frame->v_buffer + uvstart * uvstride, uvwidth, uvend - uvstart, - uvstride, &cm->rst_internal, cm->bit_depth, - dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride); - else -#endif // CONFIG_HIGHBITDEPTH - restore_func(frame->v_buffer + uvstart * uvstride, uvwidth, - uvend - uvstart, uvstride, &cm->rst_internal, - dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride); - } else { - aom_yv12_copy_v(frame, dst); - } - } + // No sync for even numbered rows + // For odd numbered rows, Loop Restoration of current block requires the LR + // of top-right and bottom-right blocks to 
be completed + + // top-right sync + on_sync_read(lr_sync, row_number, j, plane); + if ((row_number + 1) < vunits_per_tile) + // bottom-right sync + on_sync_read(lr_sync, row_number + 2, j, plane); + + on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs); + + on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane); - if (dst == &dst_) { - if ((components_pattern >> AOM_PLANE_Y) & 1) aom_yv12_copy_y(dst, frame); - if ((components_pattern >> AOM_PLANE_U) & 1) aom_yv12_copy_u(dst, frame); - if ((components_pattern >> AOM_PLANE_V) & 1) aom_yv12_copy_v(dst, frame); - aom_free_frame_buffer(dst); + x0 += w; + ++j; } } -void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - RestorationInfo *rsi, int components_pattern, - int partial_frame, YV12_BUFFER_CONFIG *dst) { - int start_mi_row, end_mi_row, mi_rows_to_filter; - start_mi_row = 0; -#if CONFIG_FRAME_SUPERRES - mi_rows_to_filter = - ALIGN_POWER_OF_TWO(cm->superres_upscaled_height, 3) >> MI_SIZE_LOG2; -#else - mi_rows_to_filter = cm->mi_rows; -#endif // CONFIG_FRAME_SUPERRES - if (partial_frame && mi_rows_to_filter > 8) { - start_mi_row = mi_rows_to_filter >> 1; - start_mi_row &= 0xfffffff8; - mi_rows_to_filter = AOMMAX(mi_rows_to_filter / 8, 8); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +} + +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +} + +static void foreach_rest_unit_in_tile( + const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols, + int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size, + int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs) { + const int tile_h = tile_rect->bottom - tile_rect->top; + const int ext_size = unit_size * 3 / 2; + + const int tile_idx = 
tile_col + tile_row * tile_cols; + const int unit_idx0 = tile_idx * units_per_tile; + + int y0 = 0, i = 0; + while (y0 < tile_h) { + int remaining_h = tile_h - y0; + int h = (remaining_h < ext_size) ? remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = tile_rect->top + y0; + limits.v_end = tile_rect->top + y0 + h; + assert(limits.v_end <= tile_rect->bottom); + // Offset the tile upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset); + if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset; + + av1_foreach_rest_unit_in_row( + &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0, + hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs, + av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL); + + y0 += h; + ++i; } - end_mi_row = start_mi_row + mi_rows_to_filter; - loop_restoration_init(&cm->rst_internal, cm->frame_type == KEY_FRAME); - loop_restoration_rows(frame, cm, start_mi_row, end_mi_row, components_pattern, - rsi, dst); +} + +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, AV1PixelRect *tile_rect, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->subsampling_y; + + const RestorationInfo *rsi = &cm->rst_info[plane]; + + foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS, + rsi->horz_units_per_tile, rsi->vert_units_per_tile, + rsi->units_per_tile, rsi->restoration_unit_size, + ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs); } int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, - int *rrow1, int *nhtiles) { - assert(rcol0 && rcol1 && rrow0 && rrow1 && nhtiles); - - if (bsize != cm->sb_size) return 0; - -#if CONFIG_FRAME_SUPERRES - 
const int frame_w = cm->superres_upscaled_width; - const int frame_h = cm->superres_upscaled_height; - const int mi_to_px = MI_SIZE * SCALE_NUMERATOR; - const int denom = cm->superres_scale_denominator; -#else - const int frame_w = cm->width; - const int frame_h = cm->height; - const int mi_to_px = MI_SIZE; - const int denom = 1; -#endif // CONFIG_FRAME_SUPERRES - - const int ss_x = plane > 0 && cm->subsampling_x != 0; - const int ss_y = plane > 0 && cm->subsampling_y != 0; - - const int ss_frame_w = (frame_w + ss_x) >> ss_x; - const int ss_frame_h = (frame_h + ss_y) >> ss_y; - - int rtile_w, rtile_h, nvtiles; - av1_get_rest_ntiles(ss_frame_w, ss_frame_h, - cm->rst_info[plane].restoration_tilesize, &rtile_w, - &rtile_h, nhtiles, &nvtiles); - - const int rnd_w = rtile_w * denom - 1; - const int rnd_h = rtile_h * denom - 1; - - // rcol0/rrow0 should be the first column/row of rtiles that doesn't start - // left/below of mi_col/mi_row. For this calculation, we need to round up the - // division (if the sb starts at rtile column 10.1, the first matching rtile - // has column index 11) - *rcol0 = (mi_col * mi_to_px + rnd_w) / (rtile_w * denom); - *rrow0 = (mi_row * mi_to_px + rnd_h) / (rtile_h * denom); - - // rcol1/rrow1 is the equivalent calculation, but for the superblock - // below-right. There are some slightly strange boundary effects. First, we - // need to clamp to nhtiles/nvtiles for the case where it appears there are, - // say, 2.4 restoration tiles horizontally. There we need a maximum mi_row1 - // of 2 because tile 1 gets extended. - // - // Second, if mi_col1 >= cm->mi_cols then we must manually set *rcol1 to - // nhtiles. This is needed whenever the frame's width rounded up to the next - // toplevel superblock is smaller than nhtiles * rtile_w. The same logic is - // needed for rows. 
- const int mi_row1 = mi_row + mi_size_high[bsize]; - const int mi_col1 = mi_col + mi_size_wide[bsize]; - - if (mi_col1 >= cm->mi_cols) - *rcol1 = *nhtiles; - else - *rcol1 = AOMMIN(*nhtiles, (mi_col1 * mi_to_px + rnd_w) / (rtile_w * denom)); + int *rrow1, int *tile_tl_idx) { + assert(rcol0 && rcol1 && rrow0 && rrow1); - if (mi_row1 >= cm->mi_rows) - *rrow1 = nvtiles; - else - *rrow1 = AOMMIN(nvtiles, (mi_row1 * mi_to_px + rnd_h) / (rtile_h * denom)); + if (bsize != cm->seq_params.sb_size) return 0; + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; + + assert(!cm->all_lossless); + + const int is_uv = plane > 0; + + const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); + const int tile_w = tile_rect.right - tile_rect.left; + const int tile_h = tile_rect.bottom - tile_rect.top; + + const int mi_top = 0; + const int mi_left = 0; + + // Compute the mi-unit corners of the superblock relative to the top-left of + // the tile + const int mi_rel_row0 = mi_row - mi_top; + const int mi_rel_col0 = mi_col - mi_left; + const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize]; + const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize]; + + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int size = rsi->restoration_unit_size; + + // Calculate the number of restoration units in this tile (which might be + // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile) + const int horz_units = av1_lr_count_units_in_tile(size, tile_w); + const int vert_units = av1_lr_count_units_in_tile(size, tile_h); + + // The size of an MI-unit on this plane of the image + const int ss_x = is_uv && cm->subsampling_x; + const int ss_y = is_uv && cm->subsampling_y; + const int mi_size_x = MI_SIZE >> ss_x; + const int mi_size_y = MI_SIZE >> ss_y; + + // Write m for the relative mi column or row, D for the superres denominator + // and N for the superres numerator. 
If u is the upscaled pixel offset then + // we can write the downscaled pixel offset in two ways as: + // + // MI_SIZE * m = N / D u + // + // from which we get u = D * MI_SIZE * m / N + const int mi_to_num_x = av1_superres_scaled(cm) + ? mi_size_x * cm->superres_scale_denominator + : mi_size_x; + const int mi_to_num_y = mi_size_y; + const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size; + const int denom_y = size; + + const int rnd_x = denom_x - 1; + const int rnd_y = denom_y - 1; + + // rcol0/rrow0 should be the first column/row of restoration units (relative + // to the top-left of the tile) that doesn't start left/below of + // mi_col/mi_row. For this calculation, we need to round up the division (if + // the sb starts at runit column 10.1, the first matching runit has column + // index 11) + *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x; + *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y; + + // rel_col1/rel_row1 is the equivalent calculation, but for the superblock + // below-right. If we're at the bottom or right of the tile, this restoration + // unit might not exist, in which case we'll clamp accordingly. 
+ *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); + *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); + + const int tile_idx = 0; + *tile_tl_idx = tile_idx * rsi->units_per_tile; return *rcol0 < *rcol1 && *rrow0 < *rrow1; } -#if CONFIG_STRIPED_LOOP_RESTORATION - // Extend to left and right -static void extend_line(uint8_t *buf, int width, int extend, - int use_highbitdepth) { - int i; - if (use_highbitdepth) { - uint16_t val, *buf16 = (uint16_t *)buf; - val = buf16[0]; - for (i = 0; i < extend; i++) buf16[-1 - i] = val; - val = buf16[width - 1]; - for (i = 0; i < extend; i++) buf16[width + i] = val; +static void extend_lines(uint8_t *buf, int width, int height, int stride, + int extend, int use_highbitdepth) { + for (int i = 0; i < height; ++i) { + if (use_highbitdepth) { + uint16_t *buf16 = (uint16_t *)buf; + aom_memset16(buf16 - extend, buf16[0], extend); + aom_memset16(buf16 + width, buf16[width - 1], extend); + } else { + memset(buf - extend, buf[0], extend); + memset(buf + width, buf[width - 1], extend); + } + buf += stride; + } +} + +static void save_deblock_boundary_lines( + const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + + // There is a rare case in which a processing stripe can end 1px above the + // crop border. 
In this case, we do want to use deblocked pixels from below + // the stripe (hence why we ended up in this function), but instead of + // fetching 2 "below" rows we need to fetch one and duplicate it. + // This is equivalent to clamping the sample locations against the crop border + const int lines_to_save = + AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); + assert(lines_to_save == 1 || lines_to_save == 2); + + int upscaled_width; + int line_bytes; + if (av1_superres_scaled(cm)) { + const int ss_x = is_uv && cm->subsampling_x; + upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; + line_bytes = upscaled_width << use_highbd; + if (use_highbd) + av1_upscale_normative_rows( + cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], + CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, + plane, lines_to_save); + else + av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, + boundaries->stripe_boundary_stride, plane, + lines_to_save); } else { - uint8_t val; - val = buf[0]; - for (i = 0; i < extend; i++) buf[-1 - i] = val; - val = buf[width - 1]; - for (i = 0; i < extend; i++) buf[width + i] = val; + upscaled_width = frame->crop_widths[is_uv]; + line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < lines_to_save; i++) { + memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, + line_bytes); + } } + // If we only saved one line, then copy it into the second line buffer + if (lines_to_save == 1) + memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); + + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, + const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int 
src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + const int src_width = frame->crop_widths[is_uv]; + + // At the point where this function is called, we've already applied + // superres. So we don't need to extend the lines here, we can just + // pull directly from the topmost row of the upscaled frame. + const int ss_x = is_uv && cm->subsampling_x; + const int upscaled_width = av1_superres_scaled(cm) + ? (cm->superres_upscaled_width + ss_x) >> ss_x + : src_width; + const int line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < RESTORATION_CTX_VERT; i++) { + // Copy the line at 'row' into both context lines. This is because + // we want to (effectively) extend the outermost row of CDEF data + // from this tile to produce a border, rather than using deblocked + // pixels from the tile above/below. + memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes); + } + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); } -// For each 64 pixel high stripe, save 4 scan lines to be used as boundary in -// the loop restoration process. 
The lines are saved in -// rst_internal.stripe_boundary_lines -void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame, - AV1_COMMON *cm) { - int p, boundary_stride; - int src_width, src_height, src_stride, stripe_height, stripe_offset, stripe_y, - yy; - uint8_t *src_buf, *boundary_below_buf, *boundary_above_buf; - int use_highbitdepth = 0; - for (p = 0; p < MAX_MB_PLANE; ++p) { - if (p == 0) { - src_buf = frame->y_buffer; - src_width = frame->y_crop_width; - src_height = frame->y_crop_height; - src_stride = frame->y_stride; - stripe_height = 64; - stripe_offset = 56 - 2; // offset of first line to copy +static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, + int use_highbd, int plane, + AV1_COMMON *cm, int after_cdef) { + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->subsampling_y; + const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; + + // Get the tile rectangle, with height rounded up to the next multiple of 8 + // luma pixels (only relevant for the bottom tile of the frame) + const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); + const int stripe0 = 0; + + RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries; + + const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y); + + int tile_stripe; + for (tile_stripe = 0;; ++tile_stripe) { + const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off); + const int y0 = tile_rect.top + rel_y0; + if (y0 >= tile_rect.bottom) break; + + const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off; + const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom); + + const int frame_stripe = stripe0 + tile_stripe; + + // In this case, we should only use CDEF pixels at the top + // and bottom of the frame as a whole; internal tile boundaries + // can use deblocked pixels from adjacent tiles for context. 
+ const int use_deblock_above = (frame_stripe > 0); + const int use_deblock_below = (y1 < plane_height); + + if (!after_cdef) { + // Save deblocked context where needed. + if (use_deblock_above) { + save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT, + frame_stripe, use_highbd, 1, boundaries); + } + if (use_deblock_below) { + save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe, + use_highbd, 0, boundaries); + } } else { - src_buf = p == 1 ? frame->u_buffer : frame->v_buffer; - src_width = frame->uv_crop_width; - src_height = frame->uv_crop_height; - src_stride = frame->uv_stride; - stripe_height = 64 >> cm->subsampling_y; - stripe_offset = (56 >> cm->subsampling_y) - 2; - } - boundary_above_buf = cm->rst_internal.stripe_boundary_above[p]; - boundary_below_buf = cm->rst_internal.stripe_boundary_below[p]; - boundary_stride = cm->rst_internal.stripe_boundary_stride[p]; -#if CONFIG_HIGHBITDEPTH - use_highbitdepth = cm->use_highbitdepth; - if (use_highbitdepth) { - src_buf = (uint8_t *)CONVERT_TO_SHORTPTR(src_buf); - } -#endif - src_buf += (stripe_offset * src_stride) << use_highbitdepth; - boundary_above_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth; - boundary_below_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth; - // Loop over stripes - for (stripe_y = stripe_offset; stripe_y < src_height; - stripe_y += stripe_height) { - // Save 2 lines above the LR stripe (offset -9, -10) - for (yy = 0; yy < 2; yy++) { - if (stripe_y + yy < src_height) { - memcpy(boundary_above_buf, src_buf, src_width << use_highbitdepth); - extend_line(boundary_above_buf, src_width, RESTORATION_EXTRA_HORZ, - use_highbitdepth); - src_buf += src_stride << use_highbitdepth; - boundary_above_buf += boundary_stride << use_highbitdepth; - } + // Save CDEF context where needed. Note that we need to save the CDEF + // context for a particular boundary iff we *didn't* save deblocked + // context for that boundary. 
+ // + // In addition, we need to save copies of the outermost line within + // the tile, rather than using data from outside the tile. + if (!use_deblock_above) { + save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd, + 1, boundaries); } - // Save 2 lines below the LR stripe (offset 56,57) - for (yy = 2; yy < 4; yy++) { - if (stripe_y + yy < src_height) { - memcpy(boundary_below_buf, src_buf, src_width << use_highbitdepth); - extend_line(boundary_below_buf, src_width, RESTORATION_EXTRA_HORZ, - use_highbitdepth); - src_buf += src_stride << use_highbitdepth; - boundary_below_buf += boundary_stride << use_highbitdepth; - } + if (!use_deblock_below) { + save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe, + use_highbd, 0, boundaries); } - // jump to next stripe - src_buf += ((stripe_height - 4) * src_stride) << use_highbitdepth; } } } -#endif // CONFIG_STRIPED_LOOP_RESTORATION +// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan +// lines to be used as boundary in the loop restoration process. 
The +// lines are saved in rst_internal.stripe_boundary_lines +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int after_cdef) { + const int num_planes = av1_num_planes(cm); + const int use_highbd = cm->use_highbitdepth; + for (int p = 0; p < num_planes; ++p) { + save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef); + } +} diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h index 23a53879e..0c4017534 100644 --- a/third_party/aom/av1/common/restoration.h +++ b/third_party/aom/av1/common/restoration.h @@ -13,9 +13,10 @@ #define AV1_COMMON_RESTORATION_H_ #include "aom_ports/mem.h" -#include "./aom_config.h" +#include "config/aom_config.h" #include "av1/common/blockd.h" +#include "av1/common/enums.h" #ifdef __cplusplus extern "C" { @@ -26,23 +27,13 @@ extern "C" { #define RESTORATION_PROC_UNIT_SIZE 64 -#if CONFIG_STRIPED_LOOP_RESTORATION // Filter tile grid offset upwards compared to the superblock grid -#define RESTORATION_TILE_OFFSET 8 -#endif +#define RESTORATION_UNIT_OFFSET 8 -#if CONFIG_STRIPED_LOOP_RESTORATION -#define SGRPROJ_BORDER_VERT 2 // Vertical border used for Sgr -#else -#define SGRPROJ_BORDER_VERT 1 // Vertical border used for Sgr -#endif -#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for Sgr +#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr +#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr -#if CONFIG_STRIPED_LOOP_RESTORATION #define WIENER_BORDER_VERT 2 // Vertical border used for Wiener -#else -#define WIENER_BORDER_VERT 1 // Vertical border used for Wiener -#endif #define WIENER_HALFWIN 3 #define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener @@ -61,11 +52,16 @@ extern "C" { #define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ) #endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT -#if CONFIG_STRIPED_LOOP_RESTORATION +// How many border pixels do we need for each processing unit? 
+#define RESTORATION_BORDER 3 + +// How many rows of deblocked pixels do we save above/below each processing +// stripe? +#define RESTORATION_CTX_VERT 2 + // Additional pixels to the left and right in above/below buffers // It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment #define RESTORATION_EXTRA_HORZ 4 -#endif // Pad up to 20 more (may be much less is needed) #define RESTORATION_PADDING 20 @@ -75,30 +71,23 @@ extern "C" { (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \ RESTORATION_PADDING)) -#define RESTORATION_TILESIZE_MAX 256 -#if CONFIG_STRIPED_LOOP_RESTORATION -#define RESTORATION_TILEPELS_HORZ_MAX \ - (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) -#define RESTORATION_TILEPELS_VERT_MAX \ - ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \ - RESTORATION_TILE_OFFSET)) -#define RESTORATION_TILEPELS_MAX \ - (RESTORATION_TILEPELS_HORZ_MAX * RESTORATION_TILEPELS_VERT_MAX) -#else -#define RESTORATION_TILEPELS_MAX \ - ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \ - (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)) -#endif +#define RESTORATION_UNITSIZE_MAX 256 +#define RESTORATION_UNITPELS_HORZ_MAX \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) +#define RESTORATION_UNITPELS_VERT_MAX \ + ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \ + RESTORATION_UNIT_OFFSET)) +#define RESTORATION_UNITPELS_MAX \ + (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX) // Two 32-bit buffers needed for the restored versions from two filters // TODO(debargha, rupert): Refactor to not need the large tilesize to be stored // on the decoder side. 
-#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t)) +#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t)) #define SGRPROJ_EXTBUF_SIZE (0) #define SGRPROJ_PARAMS_BITS 4 #define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS) -#define USE_HIGHPASS_IN_SGRPROJ 0 // Precision bits for projection #define SGRPROJ_PRJ_BITS 7 @@ -108,24 +97,16 @@ extern "C" { #define SGRPROJ_SGR_BITS 8 #define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS) -#if USE_HIGHPASS_IN_SGRPROJ -#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) / 8) -#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1) -#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 2) -#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1) -#else #define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4) #define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1) #define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4) #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1) -#endif // USE_HIGHPASS_IN_SGRPROJ #define SGRPROJ_PRJ_SUBEXP_K 4 #define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS) #define MAX_RADIUS 2 // Only 1, 2, 3 allowed -#define MAX_EPS 80 // Max value of eps #define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) #define SGRPROJ_MTABLE_BITS 20 #define SGRPROJ_RECIP_BITS 12 @@ -143,17 +124,13 @@ extern "C" { #define WIENER_FILT_PREC_BITS 7 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) -// Whether to use high intermediate precision filtering -#define USE_WIENER_HIGH_INTERMEDIATE_PRECISION 1 - // Central values for the taps #define WIENER_FILT_TAP0_MIDV (3) #define WIENER_FILT_TAP1_MIDV (-7) #define WIENER_FILT_TAP2_MIDV (15) -#define WIENER_FILT_TAP3_MIDV \ - (WIENER_FILT_STEP - \ - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \ - WIENER_FILT_TAP2_MIDV)) +#define WIENER_FILT_TAP3_MIDV \ + (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \ + 
WIENER_FILT_TAP2_MIDV)) #define WIENER_FILT_TAP0_BITS 4 #define WIENER_FILT_TAP1_BITS 5 @@ -194,51 +171,64 @@ extern "C" { #error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7" #endif +#define LR_TILE_ROW 0 +#define LR_TILE_COL 0 +#define LR_TILE_COLS 1 + typedef struct { -#if USE_HIGHPASS_IN_SGRPROJ - int corner; - int edge; -#else - int r1; - int e1; -#endif // USE_HIGHPASS_IN_SGRPROJ - int r2; - int e2; + int r[2]; // radii + int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable() } sgr_params_type; typedef struct { - int restoration_tilesize; - int procunit_width, procunit_height; - RestorationType frame_restoration_type; - RestorationType *restoration_type; - // Wiener filter - WienerInfo *wiener_info; - // Selfguided proj filter - SgrprojInfo *sgrproj_info; -} RestorationInfo; + RestorationType restoration_type; + WienerInfo wiener_info; + SgrprojInfo sgrproj_info; +} RestorationUnitInfo; + +// A restoration line buffer needs space for two lines plus a horizontal filter +// margin of RESTORATION_EXTRA_HORZ on each side. 
+#define RESTORATION_LINEBUFFER_WIDTH \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) + +// Similarly, the column buffers (used when we're at a vertical tile edge +// that we can't filter across) need space for one processing unit's worth +// of pixels, plus the top/bottom border width +#define RESTORATION_COLBUFFER_HEIGHT \ + (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER) typedef struct { - RestorationInfo *rsi; - int keyframe; - int ntiles; - int tile_width, tile_height; - int nhtiles, nvtiles; - int32_t *tmpbuf; -#if CONFIG_STRIPED_LOOP_RESTORATION - int component; - int subsampling_y; - uint8_t *stripe_boundary_above[MAX_MB_PLANE]; - uint8_t *stripe_boundary_below[MAX_MB_PLANE]; - int stripe_boundary_stride[MAX_MB_PLANE]; - // Temporary buffers to save/restore 2 lines above/below the restoration - // stripe - // Allow for filter margin to left and right - uint16_t - tmp_save_above[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ]; - uint16_t - tmp_save_below[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ]; -#endif -} RestorationInternal; + // Temporary buffers to save/restore 3 lines above/below the restoration + // stripe. + uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; + uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; +} RestorationLineBuffers; + +typedef struct { + uint8_t *stripe_boundary_above; + uint8_t *stripe_boundary_below; + int stripe_boundary_stride; + int stripe_boundary_size; +} RestorationStripeBoundaries; + +typedef struct { + RestorationType frame_restoration_type; + int restoration_unit_size; + + // Fields below here are allocated and initialised by + // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of + // restoration units in (one row of) the largest tile in the frame. The data + // in unit_info is laid out with units_per_tile entries for each tile, which + // have stride horz_units_per_tile. 
+ // + // Even if there are tiles of different sizes, the data in unit_info is laid + // out as if all tiles are of full size. + int units_per_tile; + int vert_units_per_tile, horz_units_per_tile; + RestorationUnitInfo *unit_info; + RestorationStripeBoundaries boundaries; + int optimized_lr; +} RestorationInfo; static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; @@ -257,91 +247,128 @@ static INLINE void set_default_wiener(WienerInfo *wiener_info) { wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; } -static INLINE int av1_get_rest_ntiles(int width, int height, int tilesize, - int *tile_width, int *tile_height, - int *nhtiles, int *nvtiles) { - int nhtiles_, nvtiles_; - int tile_width_, tile_height_; - tile_width_ = (tilesize < 0) ? width : AOMMIN(tilesize, width); - tile_height_ = (tilesize < 0) ? height : AOMMIN(tilesize, height); - assert(tile_width_ > 0 && tile_height_ > 0); - - nhtiles_ = (width + (tile_width_ >> 1)) / tile_width_; - nvtiles_ = (height + (tile_height_ >> 1)) / tile_height_; - if (tile_width) *tile_width = tile_width_; - if (tile_height) *tile_height = tile_height_; - if (nhtiles) *nhtiles = nhtiles_; - if (nvtiles) *nvtiles = nvtiles_; - return (nhtiles_ * nvtiles_); -} - -typedef struct { int h_start, h_end, v_start, v_end; } RestorationTileLimits; - -static INLINE RestorationTileLimits -av1_get_rest_tile_limits(int tile_idx, int nhtiles, int nvtiles, int tile_width, - int tile_height, int im_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - int im_height, int subsampling_y) { -#else - int im_height) { -#endif - const int htile_idx = tile_idx % nhtiles; - const int vtile_idx = tile_idx / nhtiles; - RestorationTileLimits limits; - limits.h_start = htile_idx * tile_width; - limits.v_start = vtile_idx * tile_height; - limits.h_end = - (htile_idx < nhtiles - 1) ? 
limits.h_start + tile_width : im_width; - limits.v_end = - (vtile_idx < nvtiles - 1) ? limits.v_start + tile_height : im_height; -#if CONFIG_STRIPED_LOOP_RESTORATION - // Offset the tile upwards to align with the restoration processing stripe - limits.v_start -= RESTORATION_TILE_OFFSET >> subsampling_y; - if (limits.v_start < 0) limits.v_start = 0; - if (limits.v_end < im_height) - limits.v_end -= RESTORATION_TILE_OFFSET >> subsampling_y; -#endif - return limits; -} +typedef struct { + int h_start, h_end, v_start, v_end; +} RestorationTileLimits; + +typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + +typedef struct FilterFrameCtxt { + const RestorationInfo *rsi; + int tile_stripe0; + int ss_x, ss_y; + int highbd, bit_depth; + uint8_t *data8, *dst8; + int data_stride, dst_stride; + AV1PixelRect tile_rect; +} FilterFrameCtxt; + +typedef struct AV1LrStruct { + rest_unit_visitor_t on_rest_unit; + FilterFrameCtxt ctxt[MAX_MB_PLANE]; + YV12_BUFFER_CONFIG *frame; + YV12_BUFFER_CONFIG *dst; +} AV1LrStruct; extern const sgr_params_type sgr_params[SGRPROJ_PARAMS]; -extern int sgrproj_mtable[MAX_EPS][MAX_NELEM]; +extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; extern const int32_t x_by_xplus1[256]; extern const int32_t one_by_x[MAX_NELEM]; -int av1_alloc_restoration_struct(struct AV1Common *cm, - RestorationInfo *rst_info, int width, - int height); +void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, + int is_uv); void av1_free_restoration_struct(RestorationInfo *rst_info); void extend_frame(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert); -#if CONFIG_HIGHBITDEPTH -void extend_frame_highbd(uint16_t *data, int width, int height, int stride, - int border_horz, int border_vert); -#endif // CONFIG_HIGHBITDEPTH -void decode_xq(int *xqd, int *xq); -void 
av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - RestorationInfo *rsi, int components_pattern, - int partial_frame, YV12_BUFFER_CONFIG *dst); + int border_horz, int border_vert, int highbd); +void decode_xq(const int *xqd, int *xq, const sgr_params_type *params); + +// Filter a single loop restoration unit. +// +// limits is the limits of the unit. rui gives the mode to use for this unit +// and its coefficients. If striped loop restoration is enabled, rsb contains +// deblocked pixels to use for stripe boundaries; rlbs is just some space to +// use as a scratch buffer. tile_rect gives the limits of the tile containing +// this unit. tile_stripe0 is the index of the first stripe in this tile. +// +// ss_x and ss_y are flags which should be 1 if this is a plane with +// horizontal/vertical subsampling, respectively. highbd is a flag which should +// be 1 in high bit depth mode, in which case bit_depth is the bit depth. +// +// data8 is the frame data (pointing at the top-left corner of the frame, not +// the restoration unit) and stride is its stride. dst8 is the buffer where the +// results will be written and has stride dst_stride. Like data8, dst8 should +// point at the top-left corner of the frame. +// +// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should +// be at least SGRPROJ_TMPBUF_SIZE big. 
+void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y, + int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int optimized_lr); + +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, int optimized_lr, + void *lr_ctxt); void av1_loop_restoration_precal(); +typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col, + void *priv); +struct AV1LrSyncData; + +typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane); + +typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c, + const int sb_cols, int plane); + +// Call on_rest_unit for each loop restoration unit in the plane. +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, AV1PixelRect *tile_rect, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + // Return 1 iff the block at mi_row, mi_col with size bsize is a // top-level superblock containing the top-left corner of at least one -// loop restoration tile. +// loop restoration unit. // // If the block is a top-level superblock, the function writes to -// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of indices given by -// [*rcol0, *rcol1) x [*rrow0, *rrow1) will point at the set of rtiles -// whose top left corners lie in the superblock. Note that the set is -// only nonempty if *rcol0 < *rcol1 and *rrow0 < *rrow1. +// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit +// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) are relative +// to the current tile, whose starting index is returned as +// *tile_tl_idx. 
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, - int *rrow1, int *nhtiles); - -void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame, - struct AV1Common *cm); + int *rrow1, int *tile_tl_idx); + +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int after_cdef); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, int num_planes); +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + struct AV1Common *cm, int num_planes); +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, const AV1PixelRect *tile_rect, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync); +AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv); +int av1_lr_count_units_in_tile(int unit_size, int tile_size); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c index d5ccdfec0..c525fe229 100644 --- a/third_party/aom/av1/common/scale.c +++ b/third_party/aom/av1/common/scale.c @@ -9,7 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/common/filter.h" #include "av1/common/scale.h" #include "aom_dsp/aom_filter.h" @@ -46,12 +48,9 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) { return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; } -static int get_coarse_point_scale_factor(int other_size, int this_size) { - // Calculate scaling factor once for each reference frame - // and use fixed point scaling factors in decoding and encoding routines. - // Hardware implementations can calculate scale factor in device driver - // and use multiplication and shifting on hardware instead of division. - return ((other_size << SCALE_SUBPEL_BITS) + this_size / 2) / this_size; +// Given the fixed point scale, calculate coarse point scale. +static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { + return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); } // Note: x and y are integer precision, mvq4 is q4 precision. 
@@ -64,14 +63,8 @@ MV32 av1_scale_mv(const MV *mvq4, int x, int y, return res; } -#if CONFIG_HIGHBITDEPTH -void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, - int other_h, int this_w, int this_h, - int use_highbd) { -#else void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { -#endif if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { sf->x_scale_fp = REF_INVALID_SCALE; sf->y_scale_fp = REF_INVALID_SCALE; @@ -81,8 +74,8 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - sf->x_step_q4 = get_coarse_point_scale_factor(other_w, this_w); - sf->y_step_q4 = get_coarse_point_scale_factor(other_h, this_h); + sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); + sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); if (av1_is_scaled(sf)) { sf->scale_value_x = scaled_x; @@ -92,95 +85,42 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, sf->scale_value_y = unscaled_value; } - // TODO(agrange): Investigate the best choice of functions to use here - // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what - // to do at full-pel offsets. The current selection, where the filter is - // applied in one direction only, and not at all for 0,0, seems to give the - // best quality, but it may be worth trying an additional mode that does - // do the filtering on full-pel. - if (sf->x_step_q4 == SCALE_SUBPEL_SHIFTS) { - if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) { - // No scaling in either direction. 
- sf->predict[0][0][0] = aom_convolve_copy; - sf->predict[0][0][1] = aom_convolve_avg; - sf->predict[0][1][0] = aom_convolve8_vert; - sf->predict[0][1][1] = aom_convolve8_avg_vert; - sf->predict[1][0][0] = aom_convolve8_horiz; - sf->predict[1][0][1] = aom_convolve8_avg_horiz; - } else { - // No scaling in x direction. Must always scale in the y direction. - sf->predict[0][0][0] = aom_convolve8_vert; - sf->predict[0][0][1] = aom_convolve8_avg_vert; - sf->predict[0][1][0] = aom_convolve8_vert; - sf->predict[0][1][1] = aom_convolve8_avg_vert; - sf->predict[1][0][0] = aom_convolve8; - sf->predict[1][0][1] = aom_convolve8_avg; - } - } else { - if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) { - // No scaling in the y direction. Must always scale in the x direction. - sf->predict[0][0][0] = aom_convolve8_horiz; - sf->predict[0][0][1] = aom_convolve8_avg_horiz; - sf->predict[0][1][0] = aom_convolve8; - sf->predict[0][1][1] = aom_convolve8_avg; - sf->predict[1][0][0] = aom_convolve8_horiz; - sf->predict[1][0][1] = aom_convolve8_avg_horiz; - } else { - // Must always scale in both directions. - sf->predict[0][0][0] = aom_convolve8; - sf->predict[0][0][1] = aom_convolve8_avg; - sf->predict[0][1][0] = aom_convolve8; - sf->predict[0][1][1] = aom_convolve8_avg; - sf->predict[1][0][0] = aom_convolve8; - sf->predict[1][0][1] = aom_convolve8_avg; - } - } - // 2D subpel motion always gets filtered in both directions - sf->predict[1][1][0] = aom_convolve8; - sf->predict[1][1][1] = aom_convolve8_avg; - -#if CONFIG_HIGHBITDEPTH - if (use_highbd) { - if (sf->x_step_q4 == SCALE_SUBPEL_SHIFTS) { - if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) { - // No scaling in either direction. 
- sf->highbd_predict[0][0][0] = aom_highbd_convolve_copy; - sf->highbd_predict[0][0][1] = aom_highbd_convolve_avg; - sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert; - sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert; - sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz; - sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz; - } else { - // No scaling in x direction. Must always scale in the y direction. - sf->highbd_predict[0][0][0] = aom_highbd_convolve8_vert; - sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_vert; - sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert; - sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert; - sf->highbd_predict[1][0][0] = aom_highbd_convolve8; - sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg; - } - } else { - if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) { - // No scaling in the y direction. Must always scale in the x direction. - sf->highbd_predict[0][0][0] = aom_highbd_convolve8_horiz; - sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_horiz; - sf->highbd_predict[0][1][0] = aom_highbd_convolve8; - sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg; - sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz; - sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz; - } else { - // Must always scale in both directions. - sf->highbd_predict[0][0][0] = aom_highbd_convolve8; - sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg; - sf->highbd_predict[0][1][0] = aom_highbd_convolve8; - sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg; - sf->highbd_predict[1][0][0] = aom_highbd_convolve8; - sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg; - } - } - // 2D subpel motion always gets filtered in both directions. 
- sf->highbd_predict[1][1][0] = aom_highbd_convolve8; - sf->highbd_predict[1][1][1] = aom_highbd_convolve8_avg; - } -#endif // CONFIG_HIGHBITDEPTH + // AV1 convolve functions + // Special case convolve functions should produce the same result as + // av1_convolve_2d. + // subpel_x_q4 == 0 && subpel_y_q4 == 0 + sf->convolve[0][0][0] = av1_convolve_2d_copy_sr; + // subpel_x_q4 == 0 + sf->convolve[0][1][0] = av1_convolve_y_sr; + // subpel_y_q4 == 0 + sf->convolve[1][0][0] = av1_convolve_x_sr; + // subpel_x_q4 != 0 && subpel_y_q4 != 0 + sf->convolve[1][1][0] = av1_convolve_2d_sr; + // subpel_x_q4 == 0 && subpel_y_q4 == 0 + sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy; + // subpel_x_q4 == 0 + sf->convolve[0][1][1] = av1_jnt_convolve_y; + // subpel_y_q4 == 0 + sf->convolve[1][0][1] = av1_jnt_convolve_x; + // subpel_x_q4 != 0 && subpel_y_q4 != 0 + sf->convolve[1][1][1] = av1_jnt_convolve_2d; + // AV1 High BD convolve functions + // Special case convolve functions should produce the same result as + // av1_highbd_convolve_2d. 
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0 + sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr; + // subpel_x_q4 == 0 + sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr; + // subpel_y_q4 == 0 + sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr; + // subpel_x_q4 != 0 && subpel_y_q4 != 0 + sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; + // subpel_x_q4 == 0 && subpel_y_q4 == 0 + sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy; + // subpel_x_q4 == 0 + sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y; + // subpel_y_q4 == 0 + sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x; + // subpel_x_q4 != 0 && subpel_y_q4 != 0 + sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d; } diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h index 900e6bf47..5f02fdb81 100644 --- a/third_party/aom/av1/common/scale.h +++ b/third_party/aom/av1/common/scale.h @@ -12,6 +12,7 @@ #ifndef AV1_COMMON_SCALE_H_ #define AV1_COMMON_SCALE_H_ +#include "av1/common/convolve.h" #include "av1/common/mv.h" #include "aom_dsp/aom_convolve.h" @@ -34,22 +35,15 @@ struct scale_factors { int (*scale_value_x)(int val, const struct scale_factors *sf); int (*scale_value_y)(int val, const struct scale_factors *sf); - convolve_fn_t predict[2][2][2]; // horiz, vert, avg -#if CONFIG_HIGHBITDEPTH - highbd_convolve_fn_t highbd_predict[2][2][2]; // horiz, vert, avg -#endif // CONFIG_HIGHBITDEPTH + // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound] + aom_convolve_fn_t convolve[2][2][2]; + aom_highbd_convolve_fn_t highbd_convolve[2][2][2]; }; MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); -#if CONFIG_HIGHBITDEPTH -void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, - int other_h, int this_w, int this_h, - int use_high); -#else void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -#endif // 
CONFIG_HIGHBITDEPTH static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { return sf->x_scale_fp != REF_INVALID_SCALE && diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c index 3c8f3d7ac..31a787b53 100644 --- a/third_party/aom/av1/common/scan.c +++ b/third_party/aom/av1/common/scan.c @@ -14,17 +14,10 @@ #include "av1/common/common_data.h" #include "av1/common/scan.h" -#if CONFIG_CHROMA_2X2 -DECLARE_ALIGNED(16, static const int16_t, default_scan_2x2[4]) = { - 0, 1, 2, 3, -}; -#endif - DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { - 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, }; @@ -32,19 +25,10 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = { - 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, -}; - -DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { - 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, -}; DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = { - 0, 1, 4, 5, 2, 8, 6, 9, 10, 3, 12, 7, 13, 11, 14, 16, - 17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = { @@ -58,8 +42,8 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = { - 0, 1, 8, 9, 2, 16, 10, 17, 18, 3, 24, 11, 25, 19, 26, 4, - 12, 27, 20, 5, 28, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 
31, + 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, + 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = { @@ -73,20 +57,19 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = { - 0, 1, 4, 5, 2, 8, 6, 9, 10, 3, 12, 7, 13, 11, 14, 16, - 17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = { - 0, 1, 16, 17, 2, 32, 18, 33, 34, 3, 48, 19, 49, 35, 50, 4, - 20, 51, 36, 5, 52, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, - 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, - 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, + 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, + 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, + 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -114,7 +97,6 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = { 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 
25, 32, @@ -138,27 +120,26 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { - 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128, - 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38, - 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 9, - 40, 71, 102, 133, 164, 195, 226, 10, 41, 72, 103, 134, 165, 196, 227, - 11, 42, 73, 104, 135, 166, 197, 228, 12, 43, 74, 105, 136, 167, 198, - 229, 13, 44, 75, 106, 137, 168, 199, 230, 14, 45, 76, 107, 138, 169, - 200, 231, 15, 46, 77, 108, 139, 170, 201, 232, 16, 47, 78, 109, 140, - 171, 202, 233, 17, 48, 79, 110, 141, 172, 203, 234, 18, 49, 80, 111, - 142, 173, 204, 235, 19, 50, 81, 112, 143, 174, 205, 236, 20, 51, 82, - 113, 144, 175, 206, 237, 21, 52, 83, 114, 145, 176, 207, 238, 22, 53, - 84, 115, 146, 177, 208, 239, 23, 54, 85, 116, 147, 178, 209, 240, 24, - 55, 86, 117, 148, 179, 210, 241, 25, 56, 87, 118, 149, 180, 211, 242, - 26, 57, 88, 119, 150, 181, 212, 243, 27, 58, 89, 120, 151, 182, 213, - 244, 28, 59, 90, 121, 152, 183, 214, 245, 29, 60, 91, 122, 153, 184, - 215, 246, 30, 61, 92, 123, 154, 185, 216, 247, 31, 62, 93, 124, 155, - 186, 217, 248, 63, 94, 125, 156, 187, 218, 249, 95, 126, 157, 188, 219, - 250, 127, 158, 189, 220, 251, 159, 190, 221, 252, 191, 222, 253, 223, 254, + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 
144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -240,16 +221,14 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { - 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, - 33, 19, 40, 12, 34, 27, 5, 41, 20, 48, 13, 35, 42, 28, 21, 6, - 49, 56, 36, 43, 29, 7, 14, 50, 57, 44, 22, 37, 15, 51, 58, 30, - 45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, @@ -263,21 +242,6 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = { 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 
53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = { - 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, - 26, 41, 11, 56, 19, 34, 4, 49, 27, 42, 12, 35, 20, 57, 50, 28, - 5, 43, 13, 36, 58, 51, 21, 44, 6, 29, 59, 37, 14, 52, 22, 7, - 45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63, -}; - -DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = { - 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, - 19, 26, 32, 6, 13, 20, 33, 27, 7, 34, 40, 21, 28, 41, 14, 35, - 48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51, - 58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63, -}; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, @@ -292,14 +256,14 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = { - 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, - 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, - 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 9, 24, 39, 54, - 69, 84, 99, 114, 10, 25, 40, 55, 70, 85, 100, 115, 11, 26, 41, 56, - 71, 86, 101, 116, 12, 27, 42, 57, 72, 87, 102, 117, 13, 28, 43, 58, - 73, 88, 103, 118, 14, 29, 44, 59, 74, 89, 104, 119, 15, 30, 45, 60, - 75, 90, 105, 120, 31, 46, 61, 76, 91, 106, 121, 47, 62, 77, 92, 107, - 122, 63, 78, 93, 108, 123, 79, 94, 109, 124, 95, 110, 125, 111, 126, 127, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 
46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { @@ -387,41 +351,41 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { - 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128, - 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38, - 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 256, - 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 10, 41, 72, 103, 134, - 165, 196, 227, 258, 289, 320, 11, 42, 73, 104, 135, 166, 197, 228, 259, - 290, 321, 352, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, - 384, 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416, - 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, - 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449, - 480, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, - 450, 481, 17, 48, 79, 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, - 420, 451, 482, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, - 390, 421, 452, 483, 19, 50, 81, 112, 143, 174, 205, 236, 267, 298, 329, - 360, 391, 422, 453, 484, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, - 330, 361, 392, 423, 454, 485, 21, 52, 83, 114, 145, 176, 207, 238, 269, - 300, 331, 362, 393, 424, 455, 486, 22, 53, 84, 115, 146, 177, 208, 239, - 270, 301, 332, 363, 394, 425, 456, 487, 23, 54, 85, 116, 147, 178, 209, - 240, 271, 302, 333, 364, 395, 426, 457, 488, 24, 55, 86, 117, 148, 179, - 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 25, 56, 87, 118, 149, - 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 26, 57, 88, 119, - 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 27, 58, 89, - 120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 28, 59, - 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 
462, 493, 29, - 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494, - 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, - 495, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, - 465, 496, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, - 466, 497, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, - 498, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 159, - 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 191, 222, 253, 284, - 315, 346, 377, 408, 439, 470, 501, 223, 254, 285, 316, 347, 378, 409, 440, - 471, 502, 255, 286, 317, 348, 379, 410, 441, 472, 503, 287, 318, 349, 380, - 411, 442, 473, 504, 319, 350, 381, 412, 443, 474, 505, 351, 382, 413, 444, - 475, 506, 383, 414, 445, 476, 507, 415, 446, 477, 508, 447, 478, 509, 479, - 510, 511, + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 
363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { @@ -574,27 +538,26 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { - 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, - 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, - 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, - 129, 38, 69, 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, - 101, 131, 160, 146, 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, - 176, 162, 87, 56, 25, 133, 118, 177, 148, 72, 103, 41, 
163, 10, 192, - 178, 88, 57, 134, 149, 119, 26, 164, 73, 104, 193, 42, 179, 208, 11, - 135, 89, 165, 120, 150, 58, 194, 180, 27, 74, 209, 105, 151, 136, 43, - 90, 224, 166, 195, 181, 121, 210, 59, 12, 152, 106, 167, 196, 75, 137, - 225, 211, 240, 182, 122, 91, 28, 197, 13, 226, 168, 183, 153, 44, 212, - 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, 242, 76, 213, 154, 45, - 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, 77, 155, 30, 15, - 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, 230, 62, 216, - 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, 63, 232, - 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, 219, - 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, - 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, - 255, + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 
252, 253, 238, 223, 239, 254, + 255 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, @@ -634,51 +597,7 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = { - 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, - 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, - 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, - 68, 115, 21, 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, - 116, 193, 147, 85, 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, - 7, 148, 194, 86, 179, 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, - 195, 118, 149, 71, 180, 24, 87, 226, 134, 165, 211, 40, 103, 56, 72, - 150, 196, 242, 119, 9, 181, 227, 88, 166, 25, 135, 41, 104, 212, 57, - 151, 197, 120, 73, 243, 182, 136, 167, 213, 89, 10, 228, 105, 152, 198, - 26, 42, 121, 183, 244, 168, 58, 137, 229, 74, 214, 90, 153, 199, 184, - 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, 200, 138, 185, 246, 75, - 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, 60, 247, 232, 76, - 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, 233, 171, 61, - 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, 62, 172, - 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, 126, - 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, - 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, - 255, -}; - -DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = { - 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, - 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, - 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, - 83, 
97, 69, 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, - 41, 56, 114, 100, 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, - 116, 14, 87, 130, 102, 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, - 103, 132, 146, 118, 74, 160, 89, 133, 104, 29, 59, 147, 119, 44, 161, - 148, 90, 105, 134, 162, 120, 176, 75, 135, 149, 30, 60, 163, 177, 45, - 121, 91, 106, 164, 178, 150, 192, 136, 165, 179, 31, 151, 193, 76, 122, - 61, 137, 194, 107, 152, 180, 208, 46, 166, 167, 195, 92, 181, 138, 209, - 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, 197, 62, 154, 225, 183, - 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, 124, 155, 199, 78, - 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, 156, 229, 243, - 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, 157, 245, - 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188, - 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, - 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, - 255, -}; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, @@ -837,998 +756,97 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = { 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { - 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, - 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, - 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6, - 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, - 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71, - 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, - 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 
324, 104, - 293, 41, 417, 199, 136, 262, 387, 448, 325, 356, 10, 73, 418, - 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, - 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, 481, 358, - 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, - 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, - 453, 139, 44, 234, 484, 297, 360, 171, 76, 515, 545, 266, 329, - 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, - 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, 608, 14, 299, - 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, - 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, - 238, 48, 143, 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, - 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, - 641, 548, 517, 424, 393, 300, 269, 176, 145, 52, 21, 704, 673, - 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, - 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, - 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, - 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, - 241, 210, 179, 117, 86, 55, 738, 707, 614, 583, 490, 459, 366, - 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, - 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, - 304, 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, - 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, - 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, - 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, - 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, - 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, - 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, - 743, 619, 495, 371, 247, 123, 896, 772, 648, 524, 400, 276, 152, - 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, - 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, 650, - 588, 557, 526, 464, 433, 402, 340, 
309, 278, 216, 185, 154, 92, - 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, - 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, - 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, - 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, 342, 311, - 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, - 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, - 499, 375, 251, 127, 900, 776, 652, 528, 404, 280, 156, 932, 901, - 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, - 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, 530, 468, 437, - 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, - 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, - 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, - 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, - 315, 253, 222, 191, 998, 967, 874, 843, 750, 719, 626, 595, 502, - 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, - 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, - 440, 409, 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, - 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, - 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, - 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, - 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, - 971, 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, - 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, - 692, 661, 568, 537, 444, 413, 972, 941, 910, 848, 817, 786, 724, - 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, - 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, - 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, - 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, - 479, 1007, 883, 759, 635, 511, 912, 788, 664, 540, 944, 913, 820, - 789, 696, 665, 572, 541, 
976, 945, 914, 852, 821, 790, 728, 697, - 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, 760, - 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, - 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, - 607, 1011, 887, 763, 639, 916, 792, 668, 948, 917, 824, 793, 700, - 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, - 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, 951, 889, - 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, - 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, - 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, 893, 862, - 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, - 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, -}; - -// Scan over two rectangular vertical partitions one after the other -DECLARE_ALIGNED(16, static const int16_t, v2_scan_32x32[1024]) = { - 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97, - 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160, - 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194, - 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226, - 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198, - 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230, - 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42, - 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294, - 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355, - 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76, - 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13, - 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390, - 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46, - 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142, - 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481, - 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394, - 238, 455, 175, 301, 425, 485, 512, 513, 270, 456, 514, 207, 486, 
- 364, 395, 515, 333, 426, 516, 239, 487, 302, 457, 517, 396, 271, - 488, 544, 365, 427, 545, 518, 546, 334, 458, 547, 519, 548, 303, - 489, 397, 428, 549, 366, 459, 520, 576, 335, 490, 550, 577, 578, - 579, 521, 429, 551, 398, 460, 580, 367, 491, 581, 552, 522, 582, - 608, 609, 430, 461, 610, 399, 492, 553, 611, 583, 523, 612, 613, - 584, 554, 462, 431, 493, 614, 524, 640, 641, 642, 585, 643, 555, - 615, 644, 463, 494, 586, 525, 616, 645, 556, 646, 672, 617, 673, - 587, 674, 647, 495, 675, 526, 676, 557, 618, 648, 677, 588, 678, - 527, 649, 619, 704, 558, 705, 706, 679, 589, 707, 650, 708, 620, - 680, 709, 559, 590, 710, 651, 681, 736, 621, 737, 711, 738, 739, - 682, 652, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769, - 743, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715, 685, 745, - 774, 655, 775, 800, 801, 716, 746, 802, 803, 686, 776, 804, 747, - 805, 717, 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 835, - 808, 836, 779, 749, 837, 809, 719, 838, 780, 750, 810, 839, 864, - 865, 866, 867, 840, 781, 868, 811, 751, 869, 841, 870, 812, 782, - 842, 871, 896, 897, 898, 872, 899, 813, 843, 900, 783, 901, 873, - 844, 902, 814, 874, 903, 928, 929, 845, 930, 904, 815, 875, 931, - 932, 905, 933, 846, 876, 934, 906, 935, 877, 960, 847, 961, 962, - 907, 936, 963, 964, 937, 878, 965, 908, 966, 938, 967, 909, 879, - 992, 939, 993, 968, 994, 995, 996, 910, 969, 940, 997, 998, 970, - 911, 941, 999, 971, 1000, 942, 1001, 972, 1002, 943, 973, 1003, 974, - 1004, 975, 1005, 1006, 1007, 16, 48, 80, 112, 144, 176, 17, 49, - 208, 81, 113, 145, 240, 177, 272, 18, 50, 209, 82, 114, 304, - 241, 146, 178, 273, 336, 210, 19, 51, 83, 115, 305, 242, 147, - 368, 179, 274, 337, 211, 20, 400, 52, 84, 306, 116, 243, 369, - 148, 338, 180, 275, 432, 401, 212, 21, 53, 307, 85, 370, 244, - 117, 464, 149, 433, 339, 276, 181, 402, 213, 308, 496, 371, 22, - 54, 465, 86, 245, 118, 434, 150, 340, 277, 403, 182, 528, 497, - 214, 466, 372, 309, 23, 55, 435, 87, 246, 119, 341, 404, 151, - 
529, 560, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56, 247, - 561, 88, 530, 592, 342, 120, 405, 499, 152, 279, 468, 184, 374, - 311, 437, 216, 562, 593, 531, 624, 25, 248, 500, 57, 406, 89, - 343, 121, 469, 280, 153, 594, 185, 375, 563, 625, 438, 532, 656, - 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122, 595, 626, - 281, 564, 657, 154, 376, 533, 688, 439, 186, 313, 502, 218, 408, - 627, 596, 658, 250, 345, 471, 27, 59, 565, 689, 91, 123, 282, - 534, 720, 155, 440, 377, 187, 503, 314, 628, 659, 219, 597, 690, - 409, 472, 566, 721, 346, 251, 28, 60, 535, 752, 92, 124, 283, - 441, 378, 156, 660, 504, 629, 691, 598, 722, 188, 315, 567, 753, - 220, 410, 473, 347, 536, 784, 252, 29, 661, 692, 61, 93, 442, - 630, 723, 284, 125, 379, 505, 599, 754, 157, 316, 568, 785, 189, - 474, 411, 221, 537, 816, 693, 348, 662, 724, 253, 631, 755, 443, - 30, 600, 786, 62, 506, 94, 285, 380, 126, 569, 817, 158, 317, - 190, 475, 694, 725, 412, 663, 756, 538, 848, 222, 632, 787, 349, - 254, 601, 818, 444, 507, 31, 63, 381, 286, 95, 570, 849, 726, - 127, 695, 757, 664, 788, 159, 476, 318, 413, 539, 880, 191, 633, - 819, 223, 350, 602, 850, 508, 255, 445, 727, 758, 696, 789, 571, - 881, 382, 287, 665, 820, 477, 634, 851, 540, 912, 319, 414, 603, - 882, 759, 728, 790, 351, 509, 697, 821, 446, 572, 913, 666, 852, - 383, 635, 883, 478, 541, 944, 415, 760, 791, 604, 914, 729, 822, - 698, 853, 510, 667, 884, 447, 573, 945, 636, 915, 792, 761, 823, - 542, 976, 479, 730, 854, 605, 946, 699, 885, 668, 916, 511, 574, - 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978, - 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979, - 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764, - 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671, - 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952, - 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015, - 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986, - 831, 
1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019, - 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023, -}; - -// Scan over two rectangular horizontal partitions one after the other -DECLARE_ALIGNED(16, static const int16_t, h2_scan_32x32[1024]) = { - 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97, - 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160, - 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194, - 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226, - 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198, - 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230, - 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42, - 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294, - 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355, - 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76, - 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13, - 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390, - 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46, - 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142, - 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481, - 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394, - 238, 455, 175, 301, 425, 485, 16, 48, 80, 270, 456, 207, 486, - 112, 364, 395, 333, 426, 144, 239, 487, 302, 457, 176, 396, 17, - 271, 488, 49, 365, 427, 208, 81, 334, 458, 113, 145, 240, 303, - 489, 397, 428, 177, 366, 459, 272, 18, 50, 209, 335, 490, 82, - 114, 304, 241, 429, 146, 398, 460, 367, 491, 178, 273, 336, 210, - 19, 51, 83, 430, 461, 399, 492, 115, 305, 242, 147, 368, 179, - 274, 337, 462, 431, 493, 211, 20, 400, 52, 84, 306, 116, 243, - 369, 148, 463, 494, 338, 180, 275, 432, 401, 212, 21, 53, 307, - 85, 370, 244, 117, 495, 464, 149, 433, 339, 276, 181, 402, 213, - 308, 496, 371, 22, 54, 465, 86, 245, 118, 434, 150, 340, 277, - 403, 182, 497, 214, 466, 372, 309, 23, 55, 435, 87, 246, 119, - 341, 404, 
151, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56, - 247, 88, 342, 120, 405, 499, 152, 279, 468, 184, 374, 311, 437, - 216, 25, 248, 500, 57, 406, 89, 343, 121, 469, 280, 153, 185, - 375, 438, 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122, - 281, 154, 376, 439, 186, 313, 502, 218, 408, 250, 345, 471, 27, - 59, 91, 123, 282, 155, 440, 377, 187, 503, 314, 219, 409, 472, - 346, 251, 28, 60, 92, 124, 283, 441, 378, 156, 504, 188, 315, - 220, 410, 473, 347, 252, 29, 61, 93, 442, 284, 125, 379, 505, - 157, 316, 189, 474, 411, 221, 348, 253, 443, 30, 62, 506, 94, - 285, 380, 126, 158, 317, 190, 475, 412, 222, 349, 254, 444, 507, - 31, 63, 381, 286, 95, 127, 159, 476, 318, 413, 191, 223, 350, - 508, 255, 445, 382, 287, 477, 319, 414, 351, 509, 446, 383, 478, - 415, 510, 447, 479, 511, 512, 513, 514, 515, 516, 517, 544, 545, - 518, 546, 547, 519, 548, 549, 520, 576, 550, 577, 578, 579, 521, - 551, 580, 581, 552, 522, 582, 608, 609, 610, 553, 611, 583, 523, - 612, 613, 584, 554, 614, 524, 640, 641, 642, 585, 643, 555, 615, - 644, 586, 525, 616, 645, 556, 646, 672, 617, 673, 587, 674, 647, - 675, 526, 676, 557, 618, 648, 677, 588, 678, 527, 649, 619, 704, - 558, 705, 706, 679, 589, 707, 650, 708, 620, 680, 709, 528, 559, - 590, 710, 651, 681, 736, 621, 737, 711, 738, 739, 682, 652, 529, - 560, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769, 561, - 743, 530, 592, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715, - 685, 745, 774, 562, 593, 531, 624, 655, 775, 800, 801, 716, 746, - 802, 803, 686, 776, 804, 594, 563, 625, 747, 805, 717, 532, 656, - 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 595, 626, 835, - 564, 657, 808, 836, 533, 688, 779, 749, 837, 809, 719, 838, 780, - 627, 596, 658, 750, 810, 839, 864, 565, 689, 865, 866, 867, 534, - 720, 840, 781, 868, 811, 751, 869, 841, 628, 659, 597, 690, 870, - 812, 782, 566, 721, 842, 871, 896, 535, 752, 897, 898, 872, 899, - 813, 843, 660, 900, 783, 629, 691, 598, 722, 901, 873, 567, 753, - 844, 902, 814, 874, 
536, 784, 903, 661, 692, 928, 929, 630, 723, - 845, 930, 904, 815, 875, 931, 599, 754, 932, 568, 785, 905, 933, - 846, 876, 934, 537, 816, 693, 662, 724, 906, 631, 755, 935, 877, - 600, 786, 960, 847, 961, 962, 907, 936, 963, 569, 817, 964, 937, - 694, 725, 878, 965, 908, 663, 756, 538, 848, 966, 632, 787, 938, - 601, 818, 967, 909, 879, 992, 939, 993, 968, 570, 849, 994, 726, - 695, 757, 995, 664, 788, 996, 910, 969, 539, 880, 940, 633, 819, - 997, 998, 602, 850, 970, 911, 941, 999, 727, 758, 696, 789, 571, - 881, 971, 665, 820, 1000, 634, 851, 942, 540, 912, 1001, 972, 603, - 882, 759, 728, 790, 1002, 697, 821, 943, 973, 572, 913, 666, 852, - 1003, 635, 883, 974, 541, 944, 760, 791, 1004, 604, 914, 729, 822, - 698, 853, 975, 667, 884, 573, 945, 1005, 636, 915, 792, 761, 823, - 542, 976, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 1007, 574, - 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978, - 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979, - 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764, - 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671, - 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952, - 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015, - 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986, - 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019, - 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023, -}; - -// Scan where the top left quarter is scanned first -DECLARE_ALIGNED(16, static const int16_t, qtr_scan_32x32[1024]) = { - 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97, - 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160, - 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194, - 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226, - 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198, - 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230, - 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 
262, 320, 42, - 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294, - 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355, - 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76, - 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13, - 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390, - 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46, - 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142, - 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481, - 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394, - 238, 455, 175, 301, 425, 485, 270, 456, 207, 486, 364, 395, 333, - 426, 239, 487, 302, 457, 396, 271, 488, 365, 427, 334, 458, 303, - 489, 397, 428, 366, 459, 335, 490, 429, 398, 460, 367, 491, 430, - 461, 399, 492, 462, 431, 493, 463, 494, 495, 16, 512, 48, 513, - 80, 514, 112, 515, 144, 516, 176, 517, 17, 544, 49, 545, 208, - 518, 81, 546, 113, 547, 145, 240, 519, 548, 177, 549, 272, 520, - 18, 576, 50, 209, 550, 577, 82, 578, 114, 579, 304, 521, 241, - 551, 146, 580, 178, 581, 273, 552, 336, 522, 210, 582, 19, 608, - 51, 609, 83, 610, 115, 305, 553, 611, 242, 583, 147, 368, 523, - 612, 179, 613, 274, 584, 337, 554, 211, 614, 20, 400, 524, 640, - 52, 641, 84, 642, 306, 585, 116, 643, 243, 369, 555, 615, 148, - 644, 338, 586, 180, 275, 432, 525, 616, 645, 401, 556, 212, 646, - 21, 672, 53, 307, 617, 673, 85, 370, 587, 674, 244, 647, 117, - 675, 464, 526, 149, 676, 433, 557, 339, 618, 276, 648, 181, 677, - 402, 588, 213, 678, 308, 496, 527, 649, 371, 619, 22, 704, 54, - 465, 558, 705, 86, 706, 245, 679, 118, 434, 589, 707, 150, 340, - 650, 708, 277, 403, 620, 680, 182, 709, 528, 497, 559, 214, 466, - 590, 710, 372, 651, 309, 681, 23, 736, 55, 435, 621, 737, 87, - 246, 711, 738, 119, 739, 341, 682, 404, 652, 151, 529, 560, 740, - 278, 712, 498, 591, 183, 741, 467, 622, 373, 683, 215, 310, 713, - 742, 436, 653, 24, 768, 56, 769, 247, 561, 743, 88, 530, 592, - 770, 342, 714, 
120, 405, 684, 771, 499, 623, 152, 772, 279, 744, - 468, 654, 184, 773, 374, 715, 311, 437, 685, 745, 216, 774, 562, - 593, 531, 624, 25, 248, 500, 655, 775, 800, 57, 801, 406, 716, - 89, 343, 746, 802, 121, 803, 469, 686, 280, 776, 153, 804, 594, - 185, 375, 563, 625, 747, 805, 438, 717, 532, 656, 312, 777, 217, - 806, 501, 687, 407, 748, 249, 807, 26, 344, 778, 832, 58, 833, - 90, 470, 718, 834, 122, 595, 626, 835, 281, 564, 657, 808, 154, - 836, 376, 533, 688, 779, 439, 749, 186, 837, 313, 809, 502, 719, - 218, 838, 408, 780, 627, 596, 658, 250, 345, 471, 750, 810, 839, - 27, 864, 59, 565, 689, 865, 91, 866, 123, 867, 282, 534, 720, - 840, 155, 440, 781, 868, 377, 811, 187, 503, 751, 869, 314, 841, - 628, 659, 219, 597, 690, 870, 409, 812, 472, 782, 566, 721, 346, - 842, 251, 871, 28, 896, 60, 535, 752, 897, 92, 898, 124, 283, - 872, 899, 441, 813, 378, 843, 156, 660, 900, 504, 783, 629, 691, - 598, 722, 188, 901, 315, 873, 567, 753, 220, 410, 844, 902, 473, - 814, 347, 874, 536, 784, 252, 903, 29, 661, 692, 928, 61, 929, - 93, 442, 630, 723, 845, 930, 284, 904, 125, 379, 505, 815, 875, - 931, 599, 754, 157, 932, 316, 568, 785, 905, 189, 933, 474, 846, - 411, 876, 221, 934, 537, 816, 693, 348, 662, 724, 906, 253, 631, - 755, 935, 443, 877, 30, 600, 786, 960, 62, 506, 847, 961, 94, - 962, 285, 380, 907, 936, 126, 963, 569, 817, 158, 964, 317, 937, - 190, 475, 694, 725, 878, 965, 412, 908, 663, 756, 538, 848, 222, - 966, 632, 787, 349, 938, 254, 601, 818, 967, 444, 909, 507, 879, - 31, 992, 63, 381, 939, 993, 286, 968, 95, 570, 849, 994, 726, - 127, 695, 757, 995, 664, 788, 159, 996, 476, 910, 318, 969, 413, - 539, 880, 940, 191, 633, 819, 997, 223, 998, 350, 602, 850, 970, - 508, 911, 255, 445, 941, 999, 727, 758, 696, 789, 571, 881, 382, - 971, 287, 665, 820, 1000, 477, 634, 851, 942, 540, 912, 319, 1001, - 414, 972, 603, 882, 759, 728, 790, 351, 1002, 509, 697, 821, 943, - 446, 973, 572, 913, 666, 852, 383, 1003, 635, 883, 478, 974, 541, - 944, 415, 760, 791, 
1004, 604, 914, 729, 822, 698, 853, 510, 975, - 667, 884, 447, 573, 945, 1005, 636, 915, 792, 761, 823, 542, 976, - 479, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 511, 1007, 574, - 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978, - 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979, - 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764, - 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671, - 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952, - 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015, - 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986, - 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019, - 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023, -}; - -#if CONFIG_TX64X64 -DECLARE_ALIGNED(16, static const int16_t, default_scan_32x64[2048]) = { - 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, - 97, 128, 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, - 161, 192, 7, 38, 69, 100, 131, 162, 193, 224, 8, 39, 70, - 101, 132, 163, 194, 225, 256, 9, 40, 71, 102, 133, 164, 195, - 226, 257, 288, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, - 320, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, - 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, - 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, - 416, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, - 386, 417, 448, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, - 325, 356, 387, 418, 449, 480, 16, 47, 78, 109, 140, 171, 202, - 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 17, 48, 79, - 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, - 513, 544, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, - 359, 390, 421, 452, 483, 514, 545, 576, 19, 50, 81, 112, 143, - 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, - 577, 608, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, - 361, 392, 423, 454, 485, 516, 547, 578, 609, 
640, 21, 52, 83, - 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, - 517, 548, 579, 610, 641, 672, 22, 53, 84, 115, 146, 177, 208, - 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, - 642, 673, 704, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, - 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, - 736, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, - 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768, - 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, - 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, - 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, - 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, - 832, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, - 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, - 802, 833, 864, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, - 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, - 741, 772, 803, 834, 865, 896, 29, 60, 91, 122, 153, 184, 215, - 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, - 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 30, 61, 92, - 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, - 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, - 929, 960, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, - 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, - 775, 806, 837, 868, 899, 930, 961, 992, 63, 94, 125, 156, 187, - 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590, - 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, - 1024, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, - 467, 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, - 870, 901, 932, 963, 994, 1025, 1056, 127, 158, 189, 220, 251, 282, - 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, - 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 1026, 1057, 1088, 
- 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 531, - 562, 593, 624, 655, 686, 717, 748, 779, 810, 841, 872, 903, 934, - 965, 996, 1027, 1058, 1089, 1120, 191, 222, 253, 284, 315, 346, 377, - 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, - 811, 842, 873, 904, 935, 966, 997, 1028, 1059, 1090, 1121, 1152, 223, - 254, 285, 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, - 657, 688, 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 1029, - 1060, 1091, 1122, 1153, 1184, 255, 286, 317, 348, 379, 410, 441, 472, - 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, - 906, 937, 968, 999, 1030, 1061, 1092, 1123, 1154, 1185, 1216, 287, 318, - 349, 380, 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, - 752, 783, 814, 845, 876, 907, 938, 969, 1000, 1031, 1062, 1093, 1124, - 1155, 1186, 1217, 1248, 319, 350, 381, 412, 443, 474, 505, 536, 567, - 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908, 939, 970, - 1001, 1032, 1063, 1094, 1125, 1156, 1187, 1218, 1249, 1280, 351, 382, 413, - 444, 475, 506, 537, 568, 599, 630, 661, 692, 723, 754, 785, 816, - 847, 878, 909, 940, 971, 1002, 1033, 1064, 1095, 1126, 1157, 1188, 1219, - 1250, 1281, 1312, 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, - 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1034, 1065, - 1096, 1127, 1158, 1189, 1220, 1251, 1282, 1313, 1344, 415, 446, 477, 508, - 539, 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, - 942, 973, 1004, 1035, 1066, 1097, 1128, 1159, 1190, 1221, 1252, 1283, 1314, - 1345, 1376, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, - 788, 819, 850, 881, 912, 943, 974, 1005, 1036, 1067, 1098, 1129, 1160, - 1191, 1222, 1253, 1284, 1315, 1346, 1377, 1408, 479, 510, 541, 572, 603, - 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, - 1037, 1068, 1099, 1130, 1161, 1192, 1223, 1254, 1285, 1316, 1347, 1378, 1409, - 1440, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, - 883, 914, 945, 
976, 1007, 1038, 1069, 1100, 1131, 1162, 1193, 1224, 1255, - 1286, 1317, 1348, 1379, 1410, 1441, 1472, 543, 574, 605, 636, 667, 698, - 729, 760, 791, 822, 853, 884, 915, 946, 977, 1008, 1039, 1070, 1101, - 1132, 1163, 1194, 1225, 1256, 1287, 1318, 1349, 1380, 1411, 1442, 1473, 1504, - 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, - 978, 1009, 1040, 1071, 1102, 1133, 1164, 1195, 1226, 1257, 1288, 1319, 1350, - 1381, 1412, 1443, 1474, 1505, 1536, 607, 638, 669, 700, 731, 762, 793, - 824, 855, 886, 917, 948, 979, 1010, 1041, 1072, 1103, 1134, 1165, 1196, - 1227, 1258, 1289, 1320, 1351, 1382, 1413, 1444, 1475, 1506, 1537, 1568, 639, - 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1042, - 1073, 1104, 1135, 1166, 1197, 1228, 1259, 1290, 1321, 1352, 1383, 1414, 1445, - 1476, 1507, 1538, 1569, 1600, 671, 702, 733, 764, 795, 826, 857, 888, - 919, 950, 981, 1012, 1043, 1074, 1105, 1136, 1167, 1198, 1229, 1260, 1291, - 1322, 1353, 1384, 1415, 1446, 1477, 1508, 1539, 1570, 1601, 1632, 703, 734, - 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1044, 1075, 1106, 1137, - 1168, 1199, 1230, 1261, 1292, 1323, 1354, 1385, 1416, 1447, 1478, 1509, 1540, - 1571, 1602, 1633, 1664, 735, 766, 797, 828, 859, 890, 921, 952, 983, - 1014, 1045, 1076, 1107, 1138, 1169, 1200, 1231, 1262, 1293, 1324, 1355, 1386, - 1417, 1448, 1479, 1510, 1541, 1572, 1603, 1634, 1665, 1696, 767, 798, 829, - 860, 891, 922, 953, 984, 1015, 1046, 1077, 1108, 1139, 1170, 1201, 1232, - 1263, 1294, 1325, 1356, 1387, 1418, 1449, 1480, 1511, 1542, 1573, 1604, 1635, - 1666, 1697, 1728, 799, 830, 861, 892, 923, 954, 985, 1016, 1047, 1078, - 1109, 1140, 1171, 1202, 1233, 1264, 1295, 1326, 1357, 1388, 1419, 1450, 1481, - 1512, 1543, 1574, 1605, 1636, 1667, 1698, 1729, 1760, 831, 862, 893, 924, - 955, 986, 1017, 1048, 1079, 1110, 1141, 1172, 1203, 1234, 1265, 1296, 1327, - 1358, 1389, 1420, 1451, 1482, 1513, 1544, 1575, 1606, 1637, 1668, 1699, 1730, - 1761, 1792, 863, 894, 925, 956, 987, 1018, 
1049, 1080, 1111, 1142, 1173, - 1204, 1235, 1266, 1297, 1328, 1359, 1390, 1421, 1452, 1483, 1514, 1545, 1576, - 1607, 1638, 1669, 1700, 1731, 1762, 1793, 1824, 895, 926, 957, 988, 1019, - 1050, 1081, 1112, 1143, 1174, 1205, 1236, 1267, 1298, 1329, 1360, 1391, 1422, - 1453, 1484, 1515, 1546, 1577, 1608, 1639, 1670, 1701, 1732, 1763, 1794, 1825, - 1856, 927, 958, 989, 1020, 1051, 1082, 1113, 1144, 1175, 1206, 1237, 1268, - 1299, 1330, 1361, 1392, 1423, 1454, 1485, 1516, 1547, 1578, 1609, 1640, 1671, - 1702, 1733, 1764, 1795, 1826, 1857, 1888, 959, 990, 1021, 1052, 1083, 1114, - 1145, 1176, 1207, 1238, 1269, 1300, 1331, 1362, 1393, 1424, 1455, 1486, 1517, - 1548, 1579, 1610, 1641, 1672, 1703, 1734, 1765, 1796, 1827, 1858, 1889, 1920, - 991, 1022, 1053, 1084, 1115, 1146, 1177, 1208, 1239, 1270, 1301, 1332, 1363, - 1394, 1425, 1456, 1487, 1518, 1549, 1580, 1611, 1642, 1673, 1704, 1735, 1766, - 1797, 1828, 1859, 1890, 1921, 1952, 1023, 1054, 1085, 1116, 1147, 1178, 1209, - 1240, 1271, 1302, 1333, 1364, 1395, 1426, 1457, 1488, 1519, 1550, 1581, 1612, - 1643, 1674, 1705, 1736, 1767, 1798, 1829, 1860, 1891, 1922, 1953, 1984, 1055, - 1086, 1117, 1148, 1179, 1210, 1241, 1272, 1303, 1334, 1365, 1396, 1427, 1458, - 1489, 1520, 1551, 1582, 1613, 1644, 1675, 1706, 1737, 1768, 1799, 1830, 1861, - 1892, 1923, 1954, 1985, 2016, 1087, 1118, 1149, 1180, 1211, 1242, 1273, 1304, - 1335, 1366, 1397, 1428, 1459, 1490, 1521, 1552, 1583, 1614, 1645, 1676, 1707, - 1738, 1769, 1800, 1831, 1862, 1893, 1924, 1955, 1986, 2017, 1119, 1150, 1181, - 1212, 1243, 1274, 1305, 1336, 1367, 1398, 1429, 1460, 1491, 1522, 1553, 1584, - 1615, 1646, 1677, 1708, 1739, 1770, 1801, 1832, 1863, 1894, 1925, 1956, 1987, - 2018, 1151, 1182, 1213, 1244, 1275, 1306, 1337, 1368, 1399, 1430, 1461, 1492, - 1523, 1554, 1585, 1616, 1647, 1678, 1709, 1740, 1771, 1802, 1833, 1864, 1895, - 1926, 1957, 1988, 2019, 1183, 1214, 1245, 1276, 1307, 1338, 1369, 1400, 1431, - 1462, 1493, 1524, 1555, 1586, 1617, 1648, 1679, 1710, 
1741, 1772, 1803, 1834, - 1865, 1896, 1927, 1958, 1989, 2020, 1215, 1246, 1277, 1308, 1339, 1370, 1401, - 1432, 1463, 1494, 1525, 1556, 1587, 1618, 1649, 1680, 1711, 1742, 1773, 1804, - 1835, 1866, 1897, 1928, 1959, 1990, 2021, 1247, 1278, 1309, 1340, 1371, 1402, - 1433, 1464, 1495, 1526, 1557, 1588, 1619, 1650, 1681, 1712, 1743, 1774, 1805, - 1836, 1867, 1898, 1929, 1960, 1991, 2022, 1279, 1310, 1341, 1372, 1403, 1434, - 1465, 1496, 1527, 1558, 1589, 1620, 1651, 1682, 1713, 1744, 1775, 1806, 1837, - 1868, 1899, 1930, 1961, 1992, 2023, 1311, 1342, 1373, 1404, 1435, 1466, 1497, - 1528, 1559, 1590, 1621, 1652, 1683, 1714, 1745, 1776, 1807, 1838, 1869, 1900, - 1931, 1962, 1993, 2024, 1343, 1374, 1405, 1436, 1467, 1498, 1529, 1560, 1591, - 1622, 1653, 1684, 1715, 1746, 1777, 1808, 1839, 1870, 1901, 1932, 1963, 1994, - 2025, 1375, 1406, 1437, 1468, 1499, 1530, 1561, 1592, 1623, 1654, 1685, 1716, - 1747, 1778, 1809, 1840, 1871, 1902, 1933, 1964, 1995, 2026, 1407, 1438, 1469, - 1500, 1531, 1562, 1593, 1624, 1655, 1686, 1717, 1748, 1779, 1810, 1841, 1872, - 1903, 1934, 1965, 1996, 2027, 1439, 1470, 1501, 1532, 1563, 1594, 1625, 1656, - 1687, 1718, 1749, 1780, 1811, 1842, 1873, 1904, 1935, 1966, 1997, 2028, 1471, - 1502, 1533, 1564, 1595, 1626, 1657, 1688, 1719, 1750, 1781, 1812, 1843, 1874, - 1905, 1936, 1967, 1998, 2029, 1503, 1534, 1565, 1596, 1627, 1658, 1689, 1720, - 1751, 1782, 1813, 1844, 1875, 1906, 1937, 1968, 1999, 2030, 1535, 1566, 1597, - 1628, 1659, 1690, 1721, 1752, 1783, 1814, 1845, 1876, 1907, 1938, 1969, 2000, - 2031, 1567, 1598, 1629, 1660, 1691, 1722, 1753, 1784, 1815, 1846, 1877, 1908, - 1939, 1970, 2001, 2032, 1599, 1630, 1661, 1692, 1723, 1754, 1785, 1816, 1847, - 1878, 1909, 1940, 1971, 2002, 2033, 1631, 1662, 1693, 1724, 1755, 1786, 1817, - 1848, 1879, 1910, 1941, 1972, 2003, 2034, 1663, 1694, 1725, 1756, 1787, 1818, - 1849, 1880, 1911, 1942, 1973, 2004, 2035, 1695, 1726, 1757, 1788, 1819, 1850, - 1881, 1912, 1943, 1974, 2005, 2036, 1727, 1758, 1789, 
1820, 1851, 1882, 1913, - 1944, 1975, 2006, 2037, 1759, 1790, 1821, 1852, 1883, 1914, 1945, 1976, 2007, - 2038, 1791, 1822, 1853, 1884, 1915, 1946, 1977, 2008, 2039, 1823, 1854, 1885, - 1916, 1947, 1978, 2009, 2040, 1855, 1886, 1917, 1948, 1979, 2010, 2041, 1887, - 1918, 1949, 1980, 2011, 2042, 1919, 1950, 1981, 2012, 2043, 1951, 1982, 2013, - 2044, 1983, 2014, 2045, 2015, 2046, 2047, -}; - -DECLARE_ALIGNED(16, static const int16_t, default_scan_64x32[2048]) = { - 0, 1, 64, 2, 65, 128, 3, 66, 129, 192, 4, 67, 130, - 193, 256, 5, 68, 131, 194, 257, 320, 6, 69, 132, 195, 258, - 321, 384, 7, 70, 133, 196, 259, 322, 385, 448, 8, 71, 134, - 197, 260, 323, 386, 449, 512, 9, 72, 135, 198, 261, 324, 387, - 450, 513, 576, 10, 73, 136, 199, 262, 325, 388, 451, 514, 577, - 640, 11, 74, 137, 200, 263, 326, 389, 452, 515, 578, 641, 704, - 12, 75, 138, 201, 264, 327, 390, 453, 516, 579, 642, 705, 768, - 13, 76, 139, 202, 265, 328, 391, 454, 517, 580, 643, 706, 769, - 832, 14, 77, 140, 203, 266, 329, 392, 455, 518, 581, 644, 707, - 770, 833, 896, 15, 78, 141, 204, 267, 330, 393, 456, 519, 582, - 645, 708, 771, 834, 897, 960, 16, 79, 142, 205, 268, 331, 394, - 457, 520, 583, 646, 709, 772, 835, 898, 961, 1024, 17, 80, 143, - 206, 269, 332, 395, 458, 521, 584, 647, 710, 773, 836, 899, 962, - 1025, 1088, 18, 81, 144, 207, 270, 333, 396, 459, 522, 585, 648, - 711, 774, 837, 900, 963, 1026, 1089, 1152, 19, 82, 145, 208, 271, - 334, 397, 460, 523, 586, 649, 712, 775, 838, 901, 964, 1027, 1090, - 1153, 1216, 20, 83, 146, 209, 272, 335, 398, 461, 524, 587, 650, - 713, 776, 839, 902, 965, 1028, 1091, 1154, 1217, 1280, 21, 84, 147, - 210, 273, 336, 399, 462, 525, 588, 651, 714, 777, 840, 903, 966, - 1029, 1092, 1155, 1218, 1281, 1344, 22, 85, 148, 211, 274, 337, 400, - 463, 526, 589, 652, 715, 778, 841, 904, 967, 1030, 1093, 1156, 1219, - 1282, 1345, 1408, 23, 86, 149, 212, 275, 338, 401, 464, 527, 590, - 653, 716, 779, 842, 905, 968, 1031, 1094, 1157, 1220, 1283, 1346, 1409, - 1472, 24, 
87, 150, 213, 276, 339, 402, 465, 528, 591, 654, 717, - 780, 843, 906, 969, 1032, 1095, 1158, 1221, 1284, 1347, 1410, 1473, 1536, - 25, 88, 151, 214, 277, 340, 403, 466, 529, 592, 655, 718, 781, - 844, 907, 970, 1033, 1096, 1159, 1222, 1285, 1348, 1411, 1474, 1537, 1600, - 26, 89, 152, 215, 278, 341, 404, 467, 530, 593, 656, 719, 782, - 845, 908, 971, 1034, 1097, 1160, 1223, 1286, 1349, 1412, 1475, 1538, 1601, - 1664, 27, 90, 153, 216, 279, 342, 405, 468, 531, 594, 657, 720, - 783, 846, 909, 972, 1035, 1098, 1161, 1224, 1287, 1350, 1413, 1476, 1539, - 1602, 1665, 1728, 28, 91, 154, 217, 280, 343, 406, 469, 532, 595, - 658, 721, 784, 847, 910, 973, 1036, 1099, 1162, 1225, 1288, 1351, 1414, - 1477, 1540, 1603, 1666, 1729, 1792, 29, 92, 155, 218, 281, 344, 407, - 470, 533, 596, 659, 722, 785, 848, 911, 974, 1037, 1100, 1163, 1226, - 1289, 1352, 1415, 1478, 1541, 1604, 1667, 1730, 1793, 1856, 30, 93, 156, - 219, 282, 345, 408, 471, 534, 597, 660, 723, 786, 849, 912, 975, - 1038, 1101, 1164, 1227, 1290, 1353, 1416, 1479, 1542, 1605, 1668, 1731, 1794, - 1857, 1920, 31, 94, 157, 220, 283, 346, 409, 472, 535, 598, 661, - 724, 787, 850, 913, 976, 1039, 1102, 1165, 1228, 1291, 1354, 1417, 1480, - 1543, 1606, 1669, 1732, 1795, 1858, 1921, 1984, 32, 95, 158, 221, 284, - 347, 410, 473, 536, 599, 662, 725, 788, 851, 914, 977, 1040, 1103, - 1166, 1229, 1292, 1355, 1418, 1481, 1544, 1607, 1670, 1733, 1796, 1859, 1922, - 1985, 33, 96, 159, 222, 285, 348, 411, 474, 537, 600, 663, 726, - 789, 852, 915, 978, 1041, 1104, 1167, 1230, 1293, 1356, 1419, 1482, 1545, - 1608, 1671, 1734, 1797, 1860, 1923, 1986, 34, 97, 160, 223, 286, 349, - 412, 475, 538, 601, 664, 727, 790, 853, 916, 979, 1042, 1105, 1168, - 1231, 1294, 1357, 1420, 1483, 1546, 1609, 1672, 1735, 1798, 1861, 1924, 1987, - 35, 98, 161, 224, 287, 350, 413, 476, 539, 602, 665, 728, 791, - 854, 917, 980, 1043, 1106, 1169, 1232, 1295, 1358, 1421, 1484, 1547, 1610, - 1673, 1736, 1799, 1862, 1925, 1988, 36, 99, 162, 225, 288, 351, 
414, - 477, 540, 603, 666, 729, 792, 855, 918, 981, 1044, 1107, 1170, 1233, - 1296, 1359, 1422, 1485, 1548, 1611, 1674, 1737, 1800, 1863, 1926, 1989, 37, - 100, 163, 226, 289, 352, 415, 478, 541, 604, 667, 730, 793, 856, - 919, 982, 1045, 1108, 1171, 1234, 1297, 1360, 1423, 1486, 1549, 1612, 1675, - 1738, 1801, 1864, 1927, 1990, 38, 101, 164, 227, 290, 353, 416, 479, - 542, 605, 668, 731, 794, 857, 920, 983, 1046, 1109, 1172, 1235, 1298, - 1361, 1424, 1487, 1550, 1613, 1676, 1739, 1802, 1865, 1928, 1991, 39, 102, - 165, 228, 291, 354, 417, 480, 543, 606, 669, 732, 795, 858, 921, - 984, 1047, 1110, 1173, 1236, 1299, 1362, 1425, 1488, 1551, 1614, 1677, 1740, - 1803, 1866, 1929, 1992, 40, 103, 166, 229, 292, 355, 418, 481, 544, - 607, 670, 733, 796, 859, 922, 985, 1048, 1111, 1174, 1237, 1300, 1363, - 1426, 1489, 1552, 1615, 1678, 1741, 1804, 1867, 1930, 1993, 41, 104, 167, - 230, 293, 356, 419, 482, 545, 608, 671, 734, 797, 860, 923, 986, - 1049, 1112, 1175, 1238, 1301, 1364, 1427, 1490, 1553, 1616, 1679, 1742, 1805, - 1868, 1931, 1994, 42, 105, 168, 231, 294, 357, 420, 483, 546, 609, - 672, 735, 798, 861, 924, 987, 1050, 1113, 1176, 1239, 1302, 1365, 1428, - 1491, 1554, 1617, 1680, 1743, 1806, 1869, 1932, 1995, 43, 106, 169, 232, - 295, 358, 421, 484, 547, 610, 673, 736, 799, 862, 925, 988, 1051, - 1114, 1177, 1240, 1303, 1366, 1429, 1492, 1555, 1618, 1681, 1744, 1807, 1870, - 1933, 1996, 44, 107, 170, 233, 296, 359, 422, 485, 548, 611, 674, - 737, 800, 863, 926, 989, 1052, 1115, 1178, 1241, 1304, 1367, 1430, 1493, - 1556, 1619, 1682, 1745, 1808, 1871, 1934, 1997, 45, 108, 171, 234, 297, - 360, 423, 486, 549, 612, 675, 738, 801, 864, 927, 990, 1053, 1116, - 1179, 1242, 1305, 1368, 1431, 1494, 1557, 1620, 1683, 1746, 1809, 1872, 1935, - 1998, 46, 109, 172, 235, 298, 361, 424, 487, 550, 613, 676, 739, - 802, 865, 928, 991, 1054, 1117, 1180, 1243, 1306, 1369, 1432, 1495, 1558, - 1621, 1684, 1747, 1810, 1873, 1936, 1999, 47, 110, 173, 236, 299, 362, - 425, 488, 551, 
614, 677, 740, 803, 866, 929, 992, 1055, 1118, 1181, - 1244, 1307, 1370, 1433, 1496, 1559, 1622, 1685, 1748, 1811, 1874, 1937, 2000, - 48, 111, 174, 237, 300, 363, 426, 489, 552, 615, 678, 741, 804, - 867, 930, 993, 1056, 1119, 1182, 1245, 1308, 1371, 1434, 1497, 1560, 1623, - 1686, 1749, 1812, 1875, 1938, 2001, 49, 112, 175, 238, 301, 364, 427, - 490, 553, 616, 679, 742, 805, 868, 931, 994, 1057, 1120, 1183, 1246, - 1309, 1372, 1435, 1498, 1561, 1624, 1687, 1750, 1813, 1876, 1939, 2002, 50, - 113, 176, 239, 302, 365, 428, 491, 554, 617, 680, 743, 806, 869, - 932, 995, 1058, 1121, 1184, 1247, 1310, 1373, 1436, 1499, 1562, 1625, 1688, - 1751, 1814, 1877, 1940, 2003, 51, 114, 177, 240, 303, 366, 429, 492, - 555, 618, 681, 744, 807, 870, 933, 996, 1059, 1122, 1185, 1248, 1311, - 1374, 1437, 1500, 1563, 1626, 1689, 1752, 1815, 1878, 1941, 2004, 52, 115, - 178, 241, 304, 367, 430, 493, 556, 619, 682, 745, 808, 871, 934, - 997, 1060, 1123, 1186, 1249, 1312, 1375, 1438, 1501, 1564, 1627, 1690, 1753, - 1816, 1879, 1942, 2005, 53, 116, 179, 242, 305, 368, 431, 494, 557, - 620, 683, 746, 809, 872, 935, 998, 1061, 1124, 1187, 1250, 1313, 1376, - 1439, 1502, 1565, 1628, 1691, 1754, 1817, 1880, 1943, 2006, 54, 117, 180, - 243, 306, 369, 432, 495, 558, 621, 684, 747, 810, 873, 936, 999, - 1062, 1125, 1188, 1251, 1314, 1377, 1440, 1503, 1566, 1629, 1692, 1755, 1818, - 1881, 1944, 2007, 55, 118, 181, 244, 307, 370, 433, 496, 559, 622, - 685, 748, 811, 874, 937, 1000, 1063, 1126, 1189, 1252, 1315, 1378, 1441, - 1504, 1567, 1630, 1693, 1756, 1819, 1882, 1945, 2008, 56, 119, 182, 245, - 308, 371, 434, 497, 560, 623, 686, 749, 812, 875, 938, 1001, 1064, - 1127, 1190, 1253, 1316, 1379, 1442, 1505, 1568, 1631, 1694, 1757, 1820, 1883, - 1946, 2009, 57, 120, 183, 246, 309, 372, 435, 498, 561, 624, 687, - 750, 813, 876, 939, 1002, 1065, 1128, 1191, 1254, 1317, 1380, 1443, 1506, - 1569, 1632, 1695, 1758, 1821, 1884, 1947, 2010, 58, 121, 184, 247, 310, - 373, 436, 499, 562, 625, 688, 751, 
814, 877, 940, 1003, 1066, 1129, - 1192, 1255, 1318, 1381, 1444, 1507, 1570, 1633, 1696, 1759, 1822, 1885, 1948, - 2011, 59, 122, 185, 248, 311, 374, 437, 500, 563, 626, 689, 752, - 815, 878, 941, 1004, 1067, 1130, 1193, 1256, 1319, 1382, 1445, 1508, 1571, - 1634, 1697, 1760, 1823, 1886, 1949, 2012, 60, 123, 186, 249, 312, 375, - 438, 501, 564, 627, 690, 753, 816, 879, 942, 1005, 1068, 1131, 1194, - 1257, 1320, 1383, 1446, 1509, 1572, 1635, 1698, 1761, 1824, 1887, 1950, 2013, - 61, 124, 187, 250, 313, 376, 439, 502, 565, 628, 691, 754, 817, - 880, 943, 1006, 1069, 1132, 1195, 1258, 1321, 1384, 1447, 1510, 1573, 1636, - 1699, 1762, 1825, 1888, 1951, 2014, 62, 125, 188, 251, 314, 377, 440, - 503, 566, 629, 692, 755, 818, 881, 944, 1007, 1070, 1133, 1196, 1259, - 1322, 1385, 1448, 1511, 1574, 1637, 1700, 1763, 1826, 1889, 1952, 2015, 63, - 126, 189, 252, 315, 378, 441, 504, 567, 630, 693, 756, 819, 882, - 945, 1008, 1071, 1134, 1197, 1260, 1323, 1386, 1449, 1512, 1575, 1638, 1701, - 1764, 1827, 1890, 1953, 2016, 127, 190, 253, 316, 379, 442, 505, 568, - 631, 694, 757, 820, 883, 946, 1009, 1072, 1135, 1198, 1261, 1324, 1387, - 1450, 1513, 1576, 1639, 1702, 1765, 1828, 1891, 1954, 2017, 191, 254, 317, - 380, 443, 506, 569, 632, 695, 758, 821, 884, 947, 1010, 1073, 1136, - 1199, 1262, 1325, 1388, 1451, 1514, 1577, 1640, 1703, 1766, 1829, 1892, 1955, - 2018, 255, 318, 381, 444, 507, 570, 633, 696, 759, 822, 885, 948, - 1011, 1074, 1137, 1200, 1263, 1326, 1389, 1452, 1515, 1578, 1641, 1704, 1767, - 1830, 1893, 1956, 2019, 319, 382, 445, 508, 571, 634, 697, 760, 823, - 886, 949, 1012, 1075, 1138, 1201, 1264, 1327, 1390, 1453, 1516, 1579, 1642, - 1705, 1768, 1831, 1894, 1957, 2020, 383, 446, 509, 572, 635, 698, 761, - 824, 887, 950, 1013, 1076, 1139, 1202, 1265, 1328, 1391, 1454, 1517, 1580, - 1643, 1706, 1769, 1832, 1895, 1958, 2021, 447, 510, 573, 636, 699, 762, - 825, 888, 951, 1014, 1077, 1140, 1203, 1266, 1329, 1392, 1455, 1518, 1581, - 1644, 1707, 1770, 1833, 1896, 
1959, 2022, 511, 574, 637, 700, 763, 826, - 889, 952, 1015, 1078, 1141, 1204, 1267, 1330, 1393, 1456, 1519, 1582, 1645, - 1708, 1771, 1834, 1897, 1960, 2023, 575, 638, 701, 764, 827, 890, 953, - 1016, 1079, 1142, 1205, 1268, 1331, 1394, 1457, 1520, 1583, 1646, 1709, 1772, - 1835, 1898, 1961, 2024, 639, 702, 765, 828, 891, 954, 1017, 1080, 1143, - 1206, 1269, 1332, 1395, 1458, 1521, 1584, 1647, 1710, 1773, 1836, 1899, 1962, - 2025, 703, 766, 829, 892, 955, 1018, 1081, 1144, 1207, 1270, 1333, 1396, - 1459, 1522, 1585, 1648, 1711, 1774, 1837, 1900, 1963, 2026, 767, 830, 893, - 956, 1019, 1082, 1145, 1208, 1271, 1334, 1397, 1460, 1523, 1586, 1649, 1712, - 1775, 1838, 1901, 1964, 2027, 831, 894, 957, 1020, 1083, 1146, 1209, 1272, - 1335, 1398, 1461, 1524, 1587, 1650, 1713, 1776, 1839, 1902, 1965, 2028, 895, - 958, 1021, 1084, 1147, 1210, 1273, 1336, 1399, 1462, 1525, 1588, 1651, 1714, - 1777, 1840, 1903, 1966, 2029, 959, 1022, 1085, 1148, 1211, 1274, 1337, 1400, - 1463, 1526, 1589, 1652, 1715, 1778, 1841, 1904, 1967, 2030, 1023, 1086, 1149, - 1212, 1275, 1338, 1401, 1464, 1527, 1590, 1653, 1716, 1779, 1842, 1905, 1968, - 2031, 1087, 1150, 1213, 1276, 1339, 1402, 1465, 1528, 1591, 1654, 1717, 1780, - 1843, 1906, 1969, 2032, 1151, 1214, 1277, 1340, 1403, 1466, 1529, 1592, 1655, - 1718, 1781, 1844, 1907, 1970, 2033, 1215, 1278, 1341, 1404, 1467, 1530, 1593, - 1656, 1719, 1782, 1845, 1908, 1971, 2034, 1279, 1342, 1405, 1468, 1531, 1594, - 1657, 1720, 1783, 1846, 1909, 1972, 2035, 1343, 1406, 1469, 1532, 1595, 1658, - 1721, 1784, 1847, 1910, 1973, 2036, 1407, 1470, 1533, 1596, 1659, 1722, 1785, - 1848, 1911, 1974, 2037, 1471, 1534, 1597, 1660, 1723, 1786, 1849, 1912, 1975, - 2038, 1535, 1598, 1661, 1724, 1787, 1850, 1913, 1976, 2039, 1599, 1662, 1725, - 1788, 1851, 1914, 1977, 2040, 1663, 1726, 1789, 1852, 1915, 1978, 2041, 1727, - 1790, 1853, 1916, 1979, 2042, 1791, 1854, 1917, 1980, 2043, 1855, 1918, 1981, - 2044, 1919, 1982, 2045, 1983, 2046, 2047, -}; - 
-DECLARE_ALIGNED(16, static const int16_t, default_scan_64x64[4096]) = { - 0, 1, 64, 65, 2, 128, 66, 129, 130, 3, 192, 67, 193, - 131, 194, 4, 256, 68, 257, 195, 132, 258, 5, 196, 259, 320, - 69, 321, 133, 322, 260, 197, 323, 6, 384, 70, 385, 134, 386, - 261, 324, 198, 387, 7, 448, 71, 325, 449, 262, 388, 135, 450, - 199, 451, 326, 389, 8, 512, 72, 263, 452, 513, 136, 514, 390, - 200, 515, 327, 453, 264, 516, 9, 576, 73, 577, 137, 391, 454, - 578, 328, 517, 201, 579, 265, 580, 455, 10, 392, 518, 640, 74, - 641, 138, 642, 329, 581, 202, 643, 456, 519, 266, 644, 393, 582, - 11, 704, 75, 705, 139, 330, 645, 706, 520, 203, 457, 583, 707, - 394, 646, 267, 708, 12, 768, 76, 521, 584, 769, 331, 709, 140, - 770, 458, 647, 204, 771, 395, 710, 268, 772, 585, 522, 648, 13, - 332, 773, 832, 77, 459, 711, 833, 141, 834, 205, 835, 396, 774, - 586, 649, 269, 523, 712, 836, 460, 775, 333, 837, 14, 896, 78, - 897, 142, 650, 898, 587, 713, 206, 397, 838, 899, 524, 776, 270, - 900, 461, 839, 334, 651, 714, 901, 15, 588, 777, 960, 79, 961, - 143, 962, 398, 902, 525, 840, 207, 963, 271, 964, 715, 652, 778, - 462, 903, 335, 589, 841, 965, 16, 1024, 80, 1025, 144, 526, 904, - 1026, 399, 966, 208, 716, 779, 1027, 653, 842, 272, 1028, 463, 967, - 590, 905, 336, 1029, 780, 17, 527, 968, 1088, 81, 717, 843, 1089, - 400, 1030, 145, 1090, 654, 906, 209, 1091, 273, 464, 1031, 1092, 591, - 969, 781, 844, 337, 1093, 718, 907, 528, 1032, 18, 1152, 82, 401, - 655, 970, 1094, 1153, 146, 1154, 210, 1155, 592, 1033, 465, 845, 1095, - 274, 782, 908, 1156, 719, 971, 338, 1157, 529, 1096, 656, 1034, 402, - 1158, 19, 1216, 83, 1217, 147, 846, 909, 1218, 783, 972, 211, 593, - 1097, 1219, 466, 1159, 275, 720, 1035, 1220, 339, 1221, 530, 1160, 657, - 1098, 910, 847, 973, 403, 1222, 20, 784, 1036, 1280, 84, 1281, 148, - 1282, 594, 1161, 212, 1283, 467, 721, 1099, 1223, 276, 1284, 911, 974, - 658, 1162, 340, 531, 848, 1037, 1224, 1285, 785, 1100, 404, 1286, 21, - 1344, 85, 595, 1225, 1345, 149, 722, 1163, 
1346, 468, 1287, 213, 975, - 1347, 912, 1038, 277, 1348, 849, 1101, 659, 1226, 532, 1288, 341, 1349, - 786, 1164, 405, 1350, 596, 976, 1039, 1289, 723, 1227, 22, 1408, 86, - 913, 1102, 1409, 150, 1410, 469, 1351, 214, 850, 1165, 1411, 278, 660, - 1290, 1412, 533, 787, 1228, 1352, 342, 1413, 1040, 977, 1103, 406, 914, - 1166, 1414, 724, 1291, 597, 1353, 23, 1472, 87, 851, 1229, 1473, 151, - 470, 1415, 1474, 215, 1475, 661, 1354, 788, 1292, 279, 1041, 1104, 1476, - 534, 1416, 978, 1167, 343, 1477, 915, 1230, 725, 1355, 407, 598, 1417, - 1478, 852, 1293, 24, 1536, 88, 1537, 471, 1105, 1479, 152, 1042, 1168, - 1538, 662, 1418, 216, 789, 1356, 1539, 979, 1231, 280, 1540, 535, 1480, - 916, 1294, 344, 1541, 726, 1419, 599, 853, 1357, 1481, 408, 1542, 1106, - 1169, 1043, 1232, 25, 472, 980, 1295, 1543, 1600, 89, 1601, 790, 1420, - 153, 663, 1482, 1602, 217, 1603, 917, 1358, 536, 1544, 281, 1604, 1170, - 345, 727, 1107, 1233, 1483, 1605, 854, 1421, 1044, 1296, 600, 1545, 409, - 1606, 981, 1359, 791, 1484, 473, 1607, 26, 664, 1546, 1664, 90, 1665, - 154, 918, 1422, 1666, 218, 1171, 1234, 1667, 537, 1108, 1297, 1608, 282, - 1668, 728, 1045, 1360, 1547, 855, 1485, 346, 1669, 601, 1609, 982, 1423, - 410, 1670, 792, 1548, 1235, 1172, 1298, 474, 665, 919, 1486, 1610, 1671, - 27, 1728, 91, 1109, 1361, 1729, 155, 1730, 219, 1731, 538, 1046, 1424, - 1672, 283, 856, 1549, 1732, 729, 1611, 347, 983, 1487, 1733, 602, 1673, - 1236, 1299, 411, 1173, 1362, 1734, 793, 1612, 920, 1550, 1110, 1425, 666, - 1674, 475, 1735, 28, 1792, 92, 1047, 1488, 1793, 156, 1794, 220, 539, - 1736, 1795, 857, 1613, 730, 1675, 284, 1300, 1796, 984, 1551, 1237, 1363, - 1174, 1426, 348, 1797, 603, 1737, 1111, 1489, 412, 794, 1676, 1798, 921, - 1614, 667, 1738, 1048, 1552, 476, 1799, 29, 1301, 1364, 1856, 93, 1857, - 157, 858, 1238, 1427, 1677, 1858, 540, 1800, 221, 731, 985, 1615, 1739, - 1859, 1175, 1490, 285, 1860, 604, 1112, 1553, 1801, 349, 1861, 922, 1678, - 795, 1740, 413, 1862, 1049, 1616, 1365, 668, 
1302, 1428, 1802, 477, 1239, - 1491, 1863, 859, 1741, 30, 1176, 1554, 1920, 94, 986, 1679, 1921, 158, - 1922, 541, 732, 1803, 1864, 222, 1923, 1113, 1617, 286, 1924, 605, 1865, - 350, 923, 1366, 1429, 1742, 1925, 796, 1804, 1303, 1492, 1050, 1680, 414, - 1926, 1240, 1555, 669, 1866, 478, 1177, 1618, 1927, 860, 1805, 987, 1743, - 31, 1984, 95, 733, 1867, 1985, 542, 1928, 159, 1114, 1681, 1986, 1430, - 223, 1367, 1493, 1987, 1304, 1556, 287, 1988, 924, 1806, 606, 1929, 797, - 1051, 1744, 1868, 351, 1241, 1619, 1989, 415, 1990, 670, 1178, 1682, 1930, - 988, 1807, 479, 861, 1869, 1991, 1431, 1494, 1368, 1557, 1115, 1745, 734, - 1931, 32, 2048, 96, 543, 1305, 1620, 1992, 2049, 160, 2050, 224, 2051, - 925, 1242, 1683, 1870, 288, 1052, 1808, 2052, 607, 1993, 798, 1932, 352, - 2053, 1179, 1746, 1495, 416, 1432, 1558, 2054, 671, 1994, 989, 1369, 1621, - 1871, 862, 1933, 480, 1116, 1809, 2055, 1306, 1684, 735, 1995, 544, 2056, - 33, 2112, 97, 1243, 1747, 2113, 161, 2114, 926, 1934, 1053, 1872, 225, - 2115, 289, 608, 799, 1496, 1559, 1996, 2057, 2116, 1180, 1810, 1433, 1622, - 353, 2117, 1370, 1685, 672, 2058, 417, 990, 1935, 2118, 1307, 1748, 863, - 1117, 1873, 1997, 481, 2119, 736, 1244, 1811, 2059, 1560, 545, 2120, 1497, - 1623, 34, 1054, 1936, 2176, 98, 927, 1998, 2177, 162, 1434, 1686, 2178, - 226, 1181, 1874, 2179, 800, 2060, 609, 1371, 1749, 2121, 290, 2180, 354, - 2181, 1308, 1812, 991, 1999, 673, 1118, 1937, 2122, 418, 2182, 864, 2061, - 1561, 1624, 1245, 1875, 482, 1498, 1687, 2183, 737, 2123, 1435, 1750, 1055, - 2000, 546, 928, 2062, 2184, 1182, 1938, 35, 1372, 1813, 2240, 99, 2241, - 163, 2242, 801, 2124, 227, 2243, 610, 2185, 291, 1309, 1876, 2244, 992, - 2063, 355, 1119, 1625, 2001, 2245, 1562, 1688, 674, 2186, 865, 1499, 1751, - 2125, 419, 1246, 1939, 2246, 1436, 1814, 483, 2247, 738, 2187, 1056, 2064, - 1373, 1877, 929, 1183, 2002, 2126, 547, 2248, 36, 2304, 100, 2305, 164, - 802, 1310, 1940, 2188, 2306, 1626, 1689, 228, 1563, 1752, 2307, 611, 2249, - 292, 
2308, 1120, 1500, 1815, 2065, 993, 2127, 356, 2309, 1247, 2003, 675, - 866, 1437, 1878, 2189, 2250, 420, 2310, 1374, 1941, 484, 1057, 2128, 2311, - 739, 2251, 1184, 2066, 930, 1690, 2190, 1627, 1753, 548, 1564, 1816, 2312, - 1311, 2004, 37, 803, 2252, 2368, 101, 1501, 1879, 2369, 165, 2370, 612, - 2313, 229, 1121, 2129, 2371, 994, 2191, 1438, 1942, 293, 1248, 2067, 2372, - 357, 867, 2253, 2373, 676, 2314, 1375, 2005, 421, 1691, 1754, 2374, 1628, - 1817, 1058, 2192, 1185, 2130, 740, 1565, 1880, 2315, 485, 2375, 931, 2254, - 1312, 2068, 1502, 1943, 549, 2376, 804, 2316, 38, 2432, 102, 1122, 1439, - 2006, 2193, 2433, 166, 2434, 613, 995, 1249, 2131, 2255, 2377, 230, 2435, - 1755, 294, 1692, 1818, 2436, 868, 1376, 2069, 2317, 1629, 1881, 358, 677, - 2378, 2437, 1566, 1944, 422, 1186, 2194, 2438, 1059, 2256, 1313, 2132, 741, - 1503, 2007, 2379, 932, 2318, 486, 2439, 550, 1440, 2070, 2440, 805, 1756, - 1819, 2380, 1123, 2257, 1250, 1693, 1882, 2195, 39, 996, 2319, 2496, 103, - 2497, 167, 614, 1630, 1945, 2441, 2498, 231, 1377, 2133, 2499, 295, 1567, - 2008, 2500, 869, 2381, 678, 2442, 359, 2501, 1187, 2258, 1060, 2320, 1504, - 2071, 1314, 2196, 423, 2502, 742, 933, 2382, 2443, 1820, 487, 1757, 1883, - 2503, 1441, 2134, 1694, 1946, 551, 1124, 2321, 2504, 1251, 1631, 2009, 2259, - 806, 2444, 997, 2383, 1378, 2197, 40, 1568, 2072, 2560, 104, 2561, 615, - 2505, 168, 2562, 232, 2563, 870, 2445, 296, 2564, 1505, 2135, 1188, 2322, - 679, 2506, 360, 1061, 1315, 1821, 1884, 2260, 2384, 2565, 1758, 1947, 424, - 2566, 1695, 2010, 934, 1442, 2198, 2446, 743, 2507, 488, 1632, 2073, 2567, - 1252, 2323, 1125, 2385, 552, 2568, 807, 1569, 2136, 2508, 1379, 2261, 998, - 2447, 41, 616, 2569, 2624, 105, 1885, 2625, 1822, 1948, 169, 1506, 2199, - 2626, 233, 871, 1759, 2011, 2509, 2627, 1189, 2386, 1316, 2324, 297, 2628, - 680, 1062, 1696, 2074, 2448, 2570, 361, 2629, 1443, 2262, 1633, 2137, 425, - 935, 2510, 2630, 744, 2571, 489, 1253, 2387, 2631, 1570, 2200, 1126, 2449, - 1380, 2325, 1886, 
1949, 808, 2572, 553, 1823, 2012, 2632, 999, 2511, 1760, - 2075, 1507, 2263, 617, 2633, 42, 2688, 106, 1697, 2138, 2689, 170, 1190, - 2450, 2690, 872, 1317, 2388, 2573, 234, 2691, 1063, 2512, 298, 1444, 2326, - 2692, 681, 1634, 2201, 2634, 362, 2693, 936, 2574, 426, 1950, 2694, 1571, - 2264, 745, 1887, 2013, 2635, 1254, 2451, 1824, 2076, 1127, 1381, 2389, 2513, - 490, 2695, 1761, 2139, 809, 1000, 1508, 2327, 2575, 2636, 554, 2696, 1698, - 2202, 1318, 2452, 618, 1191, 2514, 2697, 43, 2752, 107, 873, 1635, 2265, - 2637, 2753, 171, 1445, 2390, 2754, 1064, 2576, 235, 2755, 1951, 2014, 682, - 2698, 299, 1888, 2077, 2756, 1572, 2328, 1825, 2140, 363, 2757, 937, 2638, - 1255, 2515, 427, 746, 1382, 1762, 2203, 2453, 2699, 2758, 1128, 2577, 491, - 1509, 2391, 2759, 1699, 2266, 1001, 2639, 810, 2700, 555, 2760, 1319, 1636, - 2329, 2516, 2015, 1192, 1952, 2078, 2578, 1446, 2454, 619, 1889, 2141, 2761, - 874, 2701, 44, 2816, 108, 1065, 2640, 2817, 172, 1826, 2204, 2818, 236, - 1573, 2392, 2819, 683, 2762, 300, 2820, 1763, 2267, 938, 2702, 364, 1256, - 2579, 2821, 1383, 2517, 747, 1129, 2641, 2763, 428, 1700, 2330, 2822, 1510, - 2455, 492, 2016, 2079, 2823, 1002, 1953, 2142, 2703, 811, 2764, 1637, 2393, - 1890, 2205, 556, 1320, 2580, 2824, 1193, 1447, 2518, 2642, 1827, 2268, 620, - 2825, 875, 2765, 1066, 1574, 2456, 2704, 45, 1764, 2331, 2880, 109, 2881, - 173, 2882, 237, 2883, 684, 2826, 301, 1384, 2581, 2884, 1257, 2643, 939, - 1701, 2394, 2766, 2080, 365, 1511, 2017, 2143, 2519, 2885, 1130, 2705, 1954, - 2206, 748, 2827, 429, 2886, 1891, 2269, 1638, 2457, 493, 1003, 2767, 2887, - 812, 1828, 2332, 2828, 1321, 2644, 1448, 2582, 1194, 2706, 557, 2888, 1575, - 2520, 1765, 2395, 876, 1067, 2768, 2829, 621, 2889, 2081, 2144, 46, 2944, - 110, 2018, 2207, 2945, 174, 1702, 2458, 2946, 1385, 2645, 238, 685, 1258, - 1955, 2270, 2707, 2890, 2947, 1512, 2583, 302, 940, 2830, 2948, 1892, 2333, - 1131, 2769, 366, 2949, 749, 1639, 2521, 2891, 430, 2950, 1829, 2396, 1004, - 2831, 1322, 2708, 
494, 1449, 2646, 2951, 813, 2892, 1195, 1766, 2459, 2770, - 1576, 2584, 2145, 558, 2082, 2208, 2952, 2019, 2271, 1068, 2832, 877, 2893, - 1956, 2334, 622, 1703, 2522, 2953, 1386, 2709, 47, 3008, 111, 1259, 1513, - 1893, 2397, 2647, 2771, 3009, 175, 3010, 686, 2954, 239, 3011, 941, 2894, - 303, 1132, 1640, 2585, 2833, 3012, 1830, 2460, 367, 3013, 750, 2955, 431, - 2146, 2209, 3014, 1450, 2710, 1323, 2083, 2272, 2772, 1005, 1767, 2523, 2895, - 1577, 2020, 2335, 2648, 495, 3015, 814, 1196, 2834, 2956, 1957, 2398, 559, - 3016, 1704, 2586, 1069, 2896, 878, 1894, 2461, 2957, 623, 1387, 2773, 3017, - 1514, 2711, 1260, 2835, 48, 3072, 112, 1831, 2524, 3073, 1641, 2649, 176, - 3074, 687, 3018, 942, 2210, 2958, 240, 3075, 1133, 2147, 2273, 2897, 304, - 2084, 2336, 3076, 368, 1768, 2587, 3077, 751, 2021, 2399, 3019, 1451, 2774, - 1324, 2836, 432, 1578, 2712, 3078, 1006, 2959, 1958, 2462, 1197, 2898, 496, - 815, 3020, 3079, 1705, 2650, 1895, 2525, 560, 3080, 1070, 2960, 1388, 2837, - 879, 1515, 2775, 3021, 2211, 2274, 1832, 2588, 624, 2148, 2337, 3081, 1261, - 2899, 1642, 2713, 2085, 2400, 49, 3136, 113, 3137, 688, 3082, 177, 943, - 1134, 2022, 2463, 2961, 3022, 3138, 241, 1769, 2651, 3139, 305, 3140, 1452, - 2838, 1959, 2526, 752, 1325, 1579, 2776, 2900, 3083, 369, 3141, 1007, 3023, - 433, 3142, 1198, 1706, 2714, 2962, 1896, 2589, 816, 3084, 497, 2275, 3143, - 2212, 2338, 2149, 2401, 561, 1071, 1516, 1833, 2652, 2839, 3024, 3144, 1389, - 2901, 2086, 2464, 880, 3085, 1643, 2777, 1262, 2963, 625, 2023, 2527, 3145, - 1770, 2715, 1135, 3025, 50, 944, 1960, 2590, 3086, 3200, 114, 689, 3146, - 3201, 178, 3202, 242, 1453, 2902, 3203, 1580, 2840, 306, 1326, 2964, 3204, - 2276, 2339, 753, 1897, 2653, 3147, 370, 1707, 2213, 2402, 2778, 3205, 1008, - 3087, 1199, 2150, 2465, 3026, 434, 3206, 817, 2087, 2528, 3148, 1834, 2716, - 498, 3207, 1517, 2903, 1390, 2965, 1072, 3088, 1644, 2024, 2591, 2841, 562, - 3208, 881, 1263, 3027, 3149, 1771, 2779, 626, 1961, 2654, 3209, 2340, 1136, - 3089, 
2277, 2403, 945, 3150, 690, 1454, 2214, 2466, 2966, 3210, 51, 1581, - 2904, 3264, 115, 3265, 179, 1898, 2717, 3266, 1327, 3028, 243, 2151, 2529, - 3267, 1708, 2842, 307, 3268, 754, 3211, 2088, 2592, 371, 1009, 3151, 3269, - 1200, 3090, 1835, 2780, 435, 3270, 2025, 2655, 818, 3212, 1518, 2967, 499, - 1391, 1645, 2905, 3029, 3271, 1073, 3152, 1962, 2718, 563, 1264, 1772, 2341, - 2404, 2843, 3091, 3272, 882, 2278, 2467, 3213, 2215, 2530, 627, 3273, 2152, - 2593, 1137, 1899, 2781, 3153, 1582, 2968, 1455, 3030, 946, 3214, 691, 1709, - 2906, 3274, 52, 1328, 3092, 3328, 116, 2089, 2656, 3329, 180, 3330, 244, - 3331, 308, 1836, 2844, 3332, 755, 3275, 1010, 1201, 2026, 2719, 3154, 3215, - 372, 3333, 1519, 2405, 3031, 436, 2342, 2468, 3334, 1646, 2969, 819, 1392, - 3093, 3276, 2279, 2531, 1963, 2782, 500, 3335, 1773, 2907, 1074, 2216, 2594, - 3216, 1265, 3155, 564, 3336, 883, 2153, 2657, 3277, 1900, 2845, 628, 1583, - 3032, 3337, 1456, 2090, 2720, 3094, 1138, 3217, 1710, 2970, 947, 3278, 1329, - 3156, 692, 3338, 53, 1837, 2908, 3392, 117, 2027, 2783, 3393, 181, 2406, - 2469, 3394, 2343, 2532, 245, 3395, 1202, 3218, 309, 756, 2280, 2595, 3339, - 3396, 1011, 3279, 1520, 3095, 373, 1647, 3033, 3397, 1964, 2846, 2217, 2658, - 1393, 3157, 437, 1774, 2971, 3398, 820, 3340, 2154, 2721, 1075, 3280, 501, - 3399, 1266, 3219, 1901, 2909, 565, 884, 2091, 2784, 3341, 3400, 1584, 3096, - 1457, 1711, 3034, 3158, 2470, 629, 1139, 2407, 2533, 3281, 3401, 2344, 2596, - 2028, 2847, 948, 1330, 1838, 2972, 3220, 3342, 2281, 2659, 693, 3402, 54, - 3456, 118, 3457, 182, 2218, 2722, 3458, 246, 1203, 1965, 2910, 3282, 3459, - 1012, 1648, 3097, 3343, 757, 1521, 3159, 3403, 310, 3460, 1775, 2155, 2785, - 3035, 374, 1394, 3221, 3461, 438, 3462, 821, 3404, 1902, 2973, 1076, 2092, - 2848, 3344, 1267, 3283, 502, 2471, 2534, 3463, 2408, 2597, 1585, 2345, 2660, - 3160, 885, 3405, 566, 1712, 3098, 3464, 1458, 3222, 2029, 2911, 2282, 2723, - 1140, 1839, 3036, 3345, 630, 3465, 1331, 3284, 949, 2219, 2786, 
3406, 694, - 1966, 2974, 3466, 55, 2156, 2849, 3520, 119, 1649, 3161, 3521, 1204, 3346, - 183, 1522, 3223, 3522, 1776, 3099, 247, 1013, 3407, 3523, 758, 3467, 311, - 3524, 1395, 2535, 3285, 2472, 2598, 2093, 2912, 375, 1903, 2409, 2661, 3037, - 3525, 822, 2346, 2724, 3468, 439, 3526, 1077, 1268, 3347, 3408, 503, 2283, - 2787, 3527, 1586, 3224, 1713, 2030, 2975, 3162, 886, 1459, 3286, 3469, 1840, - 3100, 567, 3528, 2220, 2850, 1141, 3409, 1332, 3348, 631, 3529, 1967, 3038, - 950, 3470, 2157, 2913, 2536, 2599, 695, 1650, 2473, 2662, 3225, 3530, 1523, - 1777, 3163, 3287, 1205, 2410, 2725, 3410, 56, 3584, 120, 3585, 184, 2094, - 2976, 3586, 1014, 3471, 248, 1396, 1904, 2347, 2788, 3101, 3349, 3587, 759, - 3531, 312, 3588, 376, 2284, 2851, 3589, 823, 3532, 1269, 2031, 3039, 3411, - 440, 1078, 3472, 3590, 1714, 3226, 1587, 3288, 2221, 2914, 504, 1841, 3164, - 3591, 1460, 3350, 887, 3533, 568, 2600, 3592, 2537, 2663, 1968, 3102, 1142, - 2158, 2977, 3473, 2474, 2726, 1333, 3412, 632, 3593, 2411, 2789, 951, 3534, - 1651, 3289, 1778, 3227, 2348, 2852, 1524, 2095, 3040, 3351, 696, 3594, 1206, - 3474, 1905, 3165, 57, 3648, 121, 1015, 1397, 2285, 2915, 3413, 3535, 3649, - 185, 3650, 760, 3595, 249, 3651, 313, 2032, 3103, 3652, 2222, 2978, 377, - 3653, 1270, 1715, 3290, 3475, 824, 1588, 3352, 3596, 1079, 2601, 2664, 3536, - 1842, 3228, 441, 2538, 2727, 3654, 1461, 2475, 2790, 3414, 505, 2159, 3041, - 3655, 1969, 3166, 888, 2412, 2853, 3597, 569, 3656, 1143, 3537, 1334, 3476, - 2349, 2916, 2096, 3104, 1652, 3353, 633, 1779, 3291, 3657, 952, 3598, 1525, - 3415, 1906, 2286, 2979, 3229, 697, 1207, 3538, 3658, 1398, 3477, 1016, 3599, - 2033, 2665, 3167, 58, 2602, 2728, 3712, 122, 2223, 3042, 3713, 186, 3714, - 761, 2539, 2791, 3659, 250, 3715, 314, 1716, 2476, 2854, 3354, 3716, 1589, - 1843, 3292, 3416, 1271, 3539, 378, 3717, 1080, 3600, 825, 2160, 3105, 3660, - 2413, 2917, 442, 1462, 1970, 3230, 3478, 3718, 2350, 2980, 506, 3719, 889, - 3661, 1144, 1335, 2097, 3168, 3540, 3601, 570, 
3720, 1780, 3355, 1653, 2287, - 3043, 3417, 1907, 3293, 634, 953, 1526, 2666, 2729, 3479, 3662, 3721, 2603, - 2792, 2540, 2855, 1208, 2224, 3106, 3602, 2034, 3231, 698, 3722, 1399, 3541, - 2477, 2918, 1017, 3663, 59, 3776, 123, 3777, 187, 762, 1717, 2414, 2981, - 3418, 3723, 3778, 1844, 3356, 251, 2161, 3169, 3779, 1590, 3480, 315, 1272, - 3603, 3780, 1971, 3294, 1081, 2351, 3044, 3664, 379, 3781, 826, 3724, 1463, - 3542, 443, 3782, 2098, 3232, 2730, 2288, 3107, 507, 2667, 2793, 3783, 890, - 3725, 1336, 2604, 2856, 3604, 1145, 1781, 3419, 3665, 1654, 3481, 571, 1908, - 3357, 3784, 2541, 2919, 1527, 3543, 2225, 3170, 954, 2478, 2982, 3726, 635, - 2035, 3295, 3785, 1209, 3666, 1400, 3605, 2415, 3045, 699, 3786, 1018, 2162, - 3233, 3727, 1718, 3482, 1845, 3420, 60, 2352, 3108, 3840, 124, 1591, 3544, - 3841, 763, 3787, 188, 1972, 3358, 3842, 252, 3843, 1273, 3667, 2731, 2794, - 316, 3844, 2668, 2857, 1082, 1464, 3606, 3728, 380, 827, 2099, 2605, 2920, - 3296, 3788, 3845, 2289, 3171, 444, 3846, 2542, 2983, 1782, 3483, 508, 1337, - 3668, 3847, 891, 1655, 1909, 3421, 3545, 3789, 1146, 2479, 3046, 3729, 2226, - 3234, 572, 3848, 1528, 2036, 3359, 3607, 2416, 3109, 955, 3790, 636, 3849, - 1210, 3730, 1401, 2163, 3297, 3669, 2353, 3172, 2795, 700, 1846, 2732, 2858, - 3484, 3850, 1719, 3546, 1019, 2669, 2921, 3791, 1973, 3422, 1592, 3608, 2606, - 2984, 61, 764, 3851, 3904, 125, 3905, 189, 1274, 2290, 3235, 3731, 3906, - 2100, 3360, 253, 2543, 3047, 3907, 1465, 3670, 317, 1083, 3792, 3908, 828, - 3852, 381, 3909, 2480, 3110, 1783, 3547, 445, 1910, 2227, 3298, 3485, 3910, - 1656, 3609, 1338, 3732, 892, 3853, 509, 1147, 2037, 2417, 3173, 3423, 3793, - 3911, 1529, 3671, 573, 2796, 2859, 3912, 2733, 2922, 2164, 3361, 956, 2354, - 3236, 3854, 2670, 2985, 637, 3913, 1211, 1402, 3733, 3794, 1847, 2607, 3048, - 3548, 1720, 3610, 1974, 3486, 701, 3914, 1020, 1593, 2544, 3111, 3672, 3855, - 2291, 3299, 2101, 3424, 765, 1275, 3795, 3915, 62, 3968, 126, 2481, 3174, - 3969, 190, 1466, 3734, 
3970, 254, 3971, 1084, 3856, 318, 2228, 3362, 3972, - 829, 1784, 3611, 3916, 1911, 3549, 382, 2418, 3237, 3973, 2860, 1657, 2797, - 2923, 3673, 2038, 3487, 446, 2734, 2986, 3974, 1339, 3796, 1148, 3857, 893, - 2671, 3049, 3917, 510, 1530, 3735, 3975, 2355, 3300, 2165, 3425, 2608, 3112, - 574, 3976, 957, 3918, 1848, 3612, 1403, 2545, 3175, 3797, 1212, 3858, 638, - 1721, 1975, 3550, 3674, 3977, 2292, 3363, 1594, 2102, 3488, 3736, 702, 2482, - 3238, 3978, 1021, 3919, 1276, 2861, 2924, 3859, 766, 1467, 2229, 2798, 2987, - 3426, 3798, 3979, 63, 4032, 127, 2419, 3301, 4033, 191, 2735, 3050, 4034, - 1085, 1912, 3613, 3920, 255, 1785, 3675, 4035, 319, 2672, 3113, 4036, 2039, - 3551, 830, 3980, 1658, 3737, 383, 4037, 1340, 2356, 3364, 3860, 2609, 3176, - 447, 2166, 3489, 4038, 1149, 1531, 3799, 3921, 894, 3981, 511, 4039, 2546, - 3239, 575, 1849, 3676, 4040, 2293, 3427, 1976, 3614, 958, 1722, 3738, 3982, - 1404, 3861, 1213, 2483, 3302, 3922, 2103, 3552, 639, 2925, 4041, 2862, 2988, - 1595, 3800, 2799, 3051, 2736, 3114, 703, 1022, 3983, 4042, 2230, 3490, 2420, - 3365, 1277, 2673, 3177, 3923, 1468, 3862, 767, 1913, 3677, 4043, 1786, 3739, - 2040, 3615, 1086, 2610, 3240, 3984, 2357, 3428, 1659, 3801, 831, 4044, 2167, - 3553, 1341, 3924, 2547, 3303, 1532, 3863, 1150, 3985, 895, 4045, 2294, 2926, - 2989, 3491, 2863, 3052, 1850, 2484, 3366, 3740, 1977, 3678, 2800, 3115, 1723, - 3802, 2104, 3616, 1405, 3925, 959, 2737, 3178, 4046, 1214, 3986, 1596, 3864, - 2421, 3429, 2231, 2674, 3241, 3554, 1023, 4047, 2611, 3304, 1278, 1469, 1914, - 3741, 3926, 3987, 1787, 2041, 3679, 3803, 2358, 3492, 1087, 1660, 2168, 2548, - 3367, 3617, 3865, 4048, 2990, 2927, 3053, 2864, 3116, 1342, 3988, 1533, 2295, - 2801, 3179, 3555, 3927, 2485, 3430, 1151, 4049, 1978, 2738, 3242, 3742, 1851, - 3804, 2105, 3680, 1724, 3866, 2675, 3305, 1406, 2422, 3493, 3989, 2232, 3618, - 1215, 4050, 1597, 3928, 2612, 3368, 2359, 3556, 1915, 3805, 2042, 2991, 3054, - 3743, 1470, 3990, 1788, 2928, 3117, 3867, 1279, 2549, 
3431, 4051, 2865, 3180, - 2169, 3681, 1661, 3929, 2802, 3243, 2486, 3494, 2296, 3619, 2739, 3306, 1343, - 4052, 1534, 3991, 1979, 3806, 1852, 3868, 2676, 3369, 2106, 3744, 2423, 3557, - 1725, 3930, 2233, 3682, 2613, 3432, 1407, 4053, 3055, 1598, 2992, 3118, 3992, - 2929, 3181, 2360, 3620, 2866, 3244, 2550, 3495, 1916, 3869, 2043, 3807, 1789, - 2803, 3307, 3931, 1471, 2170, 3745, 4054, 2740, 3370, 1662, 2487, 3558, 3993, - 2297, 3683, 2677, 3433, 1535, 4055, 1980, 3870, 1853, 2107, 2424, 3621, 3808, - 3932, 3056, 3119, 2614, 3496, 2993, 3182, 1726, 2234, 3746, 3994, 2930, 3245, - 2867, 3308, 1599, 2361, 3684, 4056, 2551, 3559, 2804, 3371, 2044, 3871, 1917, - 3933, 2171, 3809, 1790, 2741, 3434, 3995, 2488, 3622, 2298, 3747, 1663, 4057, - 2678, 3497, 3120, 3057, 3183, 2994, 3246, 2425, 3685, 1981, 3934, 2108, 3872, - 2615, 3560, 2931, 3309, 1854, 3996, 2235, 3810, 2868, 3372, 1727, 4058, 2552, - 3623, 2805, 3435, 2362, 3748, 2742, 3498, 2045, 3935, 1918, 3997, 2172, 3873, - 2489, 3686, 1791, 4059, 3121, 3184, 2299, 2679, 3561, 3811, 3058, 3247, 2995, - 3310, 2932, 3373, 2426, 3749, 2616, 3624, 1982, 3998, 2109, 2869, 3436, 3936, - 1855, 4060, 2236, 3874, 2806, 3499, 2553, 3687, 2363, 3812, 2743, 3562, 3185, - 3122, 3248, 2046, 3999, 2490, 3750, 1919, 2173, 3059, 3311, 3937, 4061, 2680, - 3625, 2996, 3374, 2300, 3875, 2933, 3437, 2617, 3688, 2427, 3813, 2870, 3500, - 2110, 4000, 1983, 4062, 2807, 3563, 2237, 3938, 2554, 3751, 2364, 3876, 2744, - 3626, 3186, 3249, 3123, 3312, 3060, 3375, 2491, 2997, 3438, 3814, 2047, 2681, - 3689, 4063, 2174, 4001, 2934, 3501, 2301, 3939, 2871, 3564, 2618, 3752, 2428, - 3877, 2808, 3627, 2111, 4064, 2238, 3250, 4002, 2555, 3187, 3313, 3815, 3124, - 3376, 2745, 3690, 2365, 3940, 3061, 3439, 2998, 3502, 2492, 3878, 2682, 3753, - 2935, 3565, 2175, 4065, 2302, 4003, 2872, 3628, 2619, 3816, 2429, 3941, 2809, - 3691, 3251, 3314, 3188, 3377, 3125, 3440, 2556, 3879, 2239, 3062, 3503, 4066, - 2746, 3754, 2366, 4004, 2999, 3566, 2936, 3629, 2683, 
3817, 2493, 3942, 2873, - 3692, 2303, 4067, 2620, 3880, 3315, 3252, 3378, 3189, 3441, 2430, 2810, 3755, - 4005, 3126, 3504, 3063, 3567, 2557, 3943, 2747, 3818, 3000, 3630, 2367, 4068, - 2937, 3693, 2684, 3881, 2494, 4006, 2874, 3756, 3316, 3379, 3253, 3442, 3190, - 3505, 2621, 3944, 3127, 3568, 2811, 3819, 2431, 4069, 3064, 3631, 2748, 3882, - 2558, 3001, 3694, 4007, 2938, 3757, 2685, 3945, 3380, 3317, 3443, 2495, 4070, - 3254, 3506, 2875, 3820, 3191, 3569, 3128, 3632, 2622, 4008, 2812, 3883, 3065, - 3695, 3002, 3758, 2749, 3946, 2559, 4071, 2939, 3821, 3381, 3444, 3318, 3507, - 2686, 3255, 3570, 4009, 2876, 3884, 3192, 3633, 3129, 3696, 2623, 4072, 2813, - 3947, 3066, 3759, 3003, 3822, 2750, 4010, 3445, 3382, 3508, 2940, 3885, 3319, - 3571, 3256, 3634, 2687, 3193, 3697, 4073, 2877, 3948, 3130, 3760, 3067, 3823, - 2814, 4011, 3004, 3886, 3446, 3509, 3383, 3572, 2751, 4074, 3320, 3635, 2941, - 3949, 3257, 3698, 3194, 3761, 2878, 4012, 3131, 3824, 3068, 3887, 2815, 4075, - 3510, 3447, 3573, 3005, 3950, 3384, 3636, 3321, 3699, 3258, 3762, 2942, 4013, - 3195, 3825, 3132, 3888, 2879, 4076, 3069, 3951, 3511, 3574, 3448, 3637, 3385, - 3700, 3006, 4014, 3322, 3763, 3259, 3826, 2943, 4077, 3196, 3889, 3133, 3952, - 3575, 3512, 3638, 3070, 4015, 3449, 3701, 3386, 3764, 3323, 3827, 3007, 4078, - 3260, 3890, 3197, 3953, 3134, 4016, 3576, 3639, 3513, 3702, 3450, 3765, 3071, - 4079, 3387, 3828, 3324, 3891, 3261, 3954, 3198, 4017, 3640, 3135, 4080, 3577, - 3703, 3514, 3766, 3451, 3829, 3388, 3892, 3325, 3955, 3262, 4018, 3199, 4081, - 3641, 3704, 3578, 3767, 3515, 3830, 3452, 3893, 3389, 3956, 3326, 4019, 3263, - 4082, 3705, 3642, 3768, 3579, 3831, 3516, 3894, 3453, 3957, 3390, 4020, 3327, - 4083, 3706, 3769, 3643, 3832, 3580, 3895, 3517, 3958, 3454, 4021, 3391, 4084, - 3770, 3707, 3833, 3644, 3896, 3581, 3959, 3518, 4022, 3455, 4085, 3771, 3834, - 3708, 3897, 3645, 3960, 3582, 4023, 3519, 4086, 3835, 3772, 3898, 3709, 3961, - 3646, 4024, 3583, 4087, 3836, 3899, 3773, 3962, 3710, 
4025, 3647, 4088, 3900, - 3837, 3963, 3774, 4026, 3711, 4089, 3901, 3964, 3838, 4027, 3775, 4090, 3965, - 3902, 4028, 3839, 4091, 3966, 4029, 3903, 4092, 4030, 3967, 4093, 4031, 4094, - 4095, -}; -#endif // CONFIG_TX64X64 - -#if CONFIG_CHROMA_2X2 -DECLARE_ALIGNED(16, static const int16_t, - default_scan_2x2_neighbors[5 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, + 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, + 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194, + 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, + 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, + 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, + 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, + 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, + 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, + 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, + 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, + 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, + 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, + 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143, + 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, + 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, + 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, + 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, + 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518, + 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, + 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, + 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, + 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, + 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, + 25, 56, 87, 118, 149, 180, 211, 
242, 273, 304, 335, 366, 397, + 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, + 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, + 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, + 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, + 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, + 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, + 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, + 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, + 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, + 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, + 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, + 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, + 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, + 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, + 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, + 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, + 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, + 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344, + 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, + 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, + 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, + 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, + 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285, + 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, + 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, + 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, + 503, 472, 441, 410, 379, 348, 317, 286, 255, 
287, 318, 349, 380, + 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, + 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, + 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, + 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, + 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, + 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, + 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, + 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, + 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, + 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, + 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791, + 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, + 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, + 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949, + 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, + 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920, + 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, + 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, + 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, + 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, + 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 }; -#endif // Neighborhood 2-tuples for various scans and blocksizes, // in {top, left} order for each position in corresponding scan order. 
DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 0, 1, 4, 4, 5, 5, 1, 8, 8, 5, 8, 2, - 2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5, + 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, @@ -1840,19 +858,6 @@ DECLARE_ALIGNED(16, static const int16_t, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, - col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 4, 0, 8, 8, 1, 4, 5, 8, 5, 1, 9, - 12, 2, 5, 6, 9, 6, 2, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 1, 1, 1, 1, 4, 2, 2, 2, 5, 4, 5, 5, - 8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0, -}; DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { @@ -1924,7 +929,6 @@ DECLARE_ALIGNED(16, static const int16_t, 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, @@ -1968,7 +972,6 @@ DECLARE_ALIGNED(16, static const int16_t, 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13, 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { @@ -2048,7 +1051,6 @@ DECLARE_ALIGNED(16, static const int16_t, 223, 254, 0, 0 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32_neighbors[257 * 
MAX_NEIGHBORS]) = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, @@ -2202,20 +1204,7 @@ DECLARE_ALIGNED(16, static const int16_t, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 0, 0 }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, - col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 8, 0, 16, 16, 1, 8, 24, 24, 9, 16, 9, 1, 32, - 32, 17, 24, 2, 9, 25, 32, 10, 17, 40, 40, 10, 2, 18, 25, 33, 40, 3, 10, - 48, 48, 11, 18, 26, 33, 11, 3, 41, 48, 19, 26, 34, 41, 4, 11, 27, 34, 12, - 19, 49, 56, 42, 49, 20, 27, 12, 4, 35, 42, 5, 12, 28, 35, 50, 57, 43, 50, - 13, 20, 36, 43, 13, 5, 21, 28, 51, 58, 29, 36, 6, 13, 44, 51, 14, 21, 14, - 6, 37, 44, 52, 59, 22, 29, 7, 14, 30, 37, 45, 52, 15, 22, 38, 45, 23, 30, - 53, 60, 31, 38, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0, -}; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, @@ -2237,28 +1226,16 @@ DECLARE_ALIGNED(16, static const int16_t, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, - row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 0, 1, 1, 8, 2, 2, 8, 9, 2, 9, 3, 3, 9, - 16, 3, 10, 16, 17, 4, 4, 10, 17, 17, 24, 4, 11, 11, 18, 18, 25, 24, 25, - 5, 5, 5, 12, 12, 19, 25, 32, 19, 26, 6, 6, 26, 33, 32, 33, 13, 20, 20, - 27, 33, 40, 6, 13, 27, 34, 40, 41, 34, 41, 21, 28, 28, 35, 41, 48, 14, 21, - 35, 42, 7, 14, 48, 49, 29, 36, 42, 49, 36, 43, 22, 29, 49, 56, 15, 22, 43, - 50, 50, 57, 37, 44, 30, 37, 44, 51, 23, 30, 51, 58, 45, 52, 38, 45, 52, 59, - 31, 38, 53, 60, 39, 46, 46, 53, 47, 54, 54, 61, 55, 62, 0, 0, -}; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 0, 8, 8, 1, 8, 9, 1, 9, 16, 16, 17, 
2, 9, 10, - 2, 10, 17, 17, 24, 24, 25, 3, 10, 11, 3, 18, 25, 25, 32, 11, 18, 32, 33, - 4, 11, 26, 33, 19, 26, 12, 4, 33, 40, 12, 19, 40, 41, 5, 12, 27, 34, 34, - 41, 20, 27, 13, 20, 13, 5, 41, 48, 48, 49, 28, 35, 35, 42, 21, 28, 6, 6, - 6, 13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, - 29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, - 31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0, + 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16, + 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25, + 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6, + 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49, + 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43, + 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52, + 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0 }; DECLARE_ALIGNED(16, static const int16_t, @@ -2829,7 +1806,6 @@ DECLARE_ALIGNED(16, static const int16_t, 478, 509, 479, 510, 0, 0 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, @@ -2907,126 +1883,46 @@ DECLARE_ALIGNED(16, static const int16_t, 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253, 239, 254, 0, 0, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, - col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 16, 0, 48, 48, 1, 16, 64, - 64, 17, 32, 80, 80, 33, 48, 17, 1, 49, 64, 96, 96, 2, 17, - 65, 80, 18, 33, 112, 112, 34, 49, 81, 96, 18, 2, 50, 65, 128, - 128, 3, 18, 97, 112, 19, 34, 66, 81, 144, 144, 82, 97, 35, 50, - 113, 128, 19, 3, 51, 66, 160, 160, 4, 19, 98, 113, 129, 144, 67, - 82, 20, 35, 83, 98, 114, 129, 36, 51, 176, 176, 20, 4, 145, 160, - 52, 67, 99, 114, 5, 20, 130, 145, 68, 83, 192, 192, 161, 176, 21, - 36, 
115, 130, 84, 99, 37, 52, 146, 161, 208, 208, 53, 68, 21, 5, - 100, 115, 177, 192, 131, 146, 69, 84, 6, 21, 224, 224, 116, 131, 22, - 37, 162, 177, 85, 100, 147, 162, 38, 53, 193, 208, 101, 116, 54, 69, - 22, 6, 132, 147, 178, 193, 70, 85, 163, 178, 209, 224, 7, 22, 117, - 132, 23, 38, 148, 163, 23, 7, 86, 101, 194, 209, 225, 240, 39, 54, - 179, 194, 102, 117, 133, 148, 55, 70, 164, 179, 8, 23, 71, 86, 210, - 225, 118, 133, 149, 164, 195, 210, 24, 39, 87, 102, 40, 55, 56, 71, - 134, 149, 180, 195, 226, 241, 103, 118, 24, 8, 165, 180, 211, 226, 72, - 87, 150, 165, 9, 24, 119, 134, 25, 40, 88, 103, 196, 211, 41, 56, - 135, 150, 181, 196, 104, 119, 57, 72, 227, 242, 166, 181, 120, 135, 151, - 166, 197, 212, 73, 88, 25, 9, 212, 227, 89, 104, 136, 151, 182, 197, - 10, 25, 26, 41, 105, 120, 167, 182, 228, 243, 152, 167, 42, 57, 121, - 136, 213, 228, 58, 73, 198, 213, 74, 89, 137, 152, 183, 198, 168, 183, - 26, 10, 90, 105, 229, 244, 11, 26, 106, 121, 214, 229, 153, 168, 27, - 42, 199, 214, 43, 58, 184, 199, 122, 137, 169, 184, 230, 245, 59, 74, - 27, 11, 75, 90, 138, 153, 200, 215, 215, 230, 91, 106, 12, 27, 28, - 43, 185, 200, 107, 122, 154, 169, 44, 59, 231, 246, 216, 231, 60, 75, - 123, 138, 28, 12, 76, 91, 201, 216, 170, 185, 232, 247, 139, 154, 92, - 107, 13, 28, 108, 123, 29, 44, 186, 201, 217, 232, 155, 170, 45, 60, - 29, 13, 61, 76, 124, 139, 14, 14, 233, 248, 77, 92, 14, 29, 171, - 186, 140, 155, 202, 217, 30, 45, 93, 108, 109, 124, 46, 61, 156, 171, - 62, 77, 187, 202, 15, 30, 125, 140, 218, 233, 78, 93, 31, 46, 172, - 187, 47, 62, 141, 156, 94, 109, 234, 249, 203, 218, 63, 78, 110, 125, - 188, 203, 157, 172, 126, 141, 79, 94, 173, 188, 95, 110, 219, 234, 142, - 157, 204, 219, 235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235, - 143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205, 221, 236, 175, - 190, 237, 252, 206, 221, 222, 237, 191, 206, 238, 253, 207, 222, 223, 238, - 239, 254, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - 
row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 16, 3, 3, 2, - 17, 16, 17, 4, 4, 17, 32, 3, 18, 5, 5, 18, 33, 32, 33, - 4, 19, 33, 48, 6, 6, 19, 34, 5, 20, 34, 49, 48, 49, 7, - 7, 20, 35, 49, 64, 6, 21, 35, 50, 21, 36, 64, 65, 8, 8, - 50, 65, 36, 51, 7, 22, 22, 37, 65, 80, 51, 66, 9, 9, 37, - 52, 8, 23, 66, 81, 52, 67, 80, 81, 23, 38, 10, 10, 38, 53, - 67, 82, 81, 96, 53, 68, 9, 24, 82, 97, 68, 83, 24, 39, 96, - 97, 39, 54, 11, 11, 54, 69, 83, 98, 97, 112, 69, 84, 10, 25, - 25, 40, 40, 55, 98, 113, 84, 99, 12, 12, 55, 70, 112, 113, 70, - 85, 11, 26, 99, 114, 85, 100, 113, 128, 26, 41, 41, 56, 56, 71, - 100, 115, 13, 13, 71, 86, 114, 129, 86, 101, 128, 129, 57, 72, 115, - 130, 101, 116, 12, 27, 42, 57, 14, 14, 72, 87, 27, 42, 129, 144, - 87, 102, 116, 131, 130, 145, 102, 117, 58, 73, 144, 145, 73, 88, 117, - 132, 88, 103, 13, 28, 43, 58, 131, 146, 103, 118, 28, 43, 145, 160, - 132, 147, 74, 89, 89, 104, 118, 133, 146, 161, 104, 119, 160, 161, 59, - 74, 119, 134, 133, 148, 14, 29, 44, 59, 147, 162, 161, 176, 29, 44, - 105, 120, 75, 90, 90, 105, 148, 163, 162, 177, 134, 149, 176, 177, 120, - 135, 149, 164, 163, 178, 15, 30, 135, 150, 177, 192, 60, 75, 106, 121, - 45, 60, 121, 136, 178, 193, 91, 106, 136, 151, 164, 179, 192, 193, 30, - 45, 150, 165, 151, 166, 179, 194, 76, 91, 165, 180, 122, 137, 193, 208, - 107, 122, 137, 152, 208, 209, 180, 195, 61, 76, 152, 167, 194, 209, 166, - 181, 224, 224, 92, 107, 181, 196, 46, 61, 138, 153, 209, 224, 167, 182, - 153, 168, 195, 210, 31, 46, 123, 138, 77, 92, 168, 183, 210, 225, 196, - 211, 225, 240, 182, 197, 154, 169, 108, 123, 139, 154, 183, 198, 62, 77, - 197, 212, 169, 184, 93, 108, 211, 226, 184, 199, 47, 62, 212, 227, 226, - 241, 124, 139, 198, 213, 155, 170, 170, 185, 140, 155, 213, 228, 227, 242, - 109, 124, 78, 93, 185, 200, 228, 243, 199, 214, 200, 215, 214, 229, 125, - 140, 171, 186, 186, 201, 63, 78, 156, 171, 94, 109, 141, 156, 229, 244, - 201, 216, 215, 230, 79, 94, 230, 245, 
216, 231, 110, 125, 187, 202, 231, - 246, 217, 232, 157, 172, 202, 217, 126, 141, 95, 110, 142, 157, 172, 187, - 232, 247, 111, 126, 218, 233, 203, 218, 233, 248, 173, 188, 188, 203, 127, - 142, 158, 173, 143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174, - 174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235, 191, 206, 221, - 236, 236, 251, 206, 221, 237, 252, 207, 222, 222, 237, 223, 238, 238, 253, - 239, 254, 0, 0, -}; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 0, 16, 16, 1, 16, 17, 1, 32, 32, 17, - 32, 2, 17, 18, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, - 64, 65, 34, 49, 19, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, - 81, 35, 50, 20, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 97, - 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 21, 5, 52, - 67, 112, 113, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 22, 6, - 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 129, 114, 129, 69, - 84, 38, 53, 7, 22, 23, 7, 129, 144, 23, 38, 54, 69, 100, 115, - 85, 100, 115, 130, 144, 145, 130, 145, 39, 54, 70, 85, 8, 23, 55, - 70, 116, 131, 101, 116, 145, 160, 24, 39, 24, 8, 86, 101, 131, 146, - 160, 161, 146, 161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161, - 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 25, 9, 176, 177, - 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, 10, 25, 148, - 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 193, 26, 10, - 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, 164, - 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42, - 74, 89, 208, 209, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, - 58, 27, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, - 209, 224, 195, 210, 224, 225, 166, 181, 106, 121, 75, 90, 12, 27, 181, - 196, 28, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, - 122, 137, 91, 106, 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, - 183, 211, 226, 153, 168, 226, 241, 60, 75, 
197, 212, 138, 153, 29, 44, - 76, 91, 29, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154, - 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 30, 14, - 184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77, 92, 30, 45, 170, - 185, 155, 170, 185, 200, 93, 108, 124, 139, 214, 229, 46, 61, 200, 215, - 229, 244, 15, 30, 109, 124, 62, 77, 140, 155, 215, 230, 31, 46, 171, - 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140, 47, 62, 216, 231, - 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217, 187, 202, 110, - 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141, 203, 218, - 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234, - 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250, - 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236, - 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238, - 239, 254, 0, 0, + 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2, + 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3, + 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65, + 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21, + 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97, + 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7, + 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, + 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99, + 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10, + 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130, + 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116, + 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26, + 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, + 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192, + 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103, + 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13, + 14, 14, 14, 29, 29, 44, 44, 59, 
59, 74, 74, 89, 89, 104, 104, + 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224, + 224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135, + 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45, + 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121, + 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241, + 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122, + 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93, + 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, + 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184, + 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110, + 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230, + 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141, + 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, + 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173, + 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234, + 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206, + 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253, + 239, 254, 0, 0 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, @@ -3328,1899 +2224,162 @@ DECLARE_ALIGNED(16, static const int16_t, 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020, 990, 1021, 991, 1022, 0, 0, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 0, 32, 32, 1, 32, 33, 1, 64, 64, - 33, 64, 2, 33, 96, 96, 34, 2, 65, 96, 34, 65, 128, 128, - 97, 128, 3, 34, 66, 97, 35, 3, 35, 66, 98, 129, 129, 160, - 160, 161, 4, 35, 67, 98, 192, 192, 
36, 4, 130, 161, 161, 192, - 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 225, - 131, 162, 37, 68, 100, 131, 37, 5, 194, 225, 225, 256, 256, 257, - 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 38, 6, 195, 226, - 257, 288, 101, 132, 288, 289, 38, 69, 164, 195, 133, 164, 258, 289, - 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 321, 39, 7, - 165, 196, 39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, - 352, 353, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322, - 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 40, 8, - 384, 385, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103, - 261, 292, 9, 40, 385, 416, 167, 198, 104, 135, 230, 261, 355, 386, - 416, 417, 293, 324, 324, 355, 41, 9, 41, 72, 386, 417, 199, 230, - 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262, - 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 449, 42, 73, - 294, 325, 200, 231, 42, 10, 357, 388, 137, 168, 263, 294, 388, 419, - 74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326, 169, 200, - 11, 42, 106, 137, 480, 481, 450, 481, 358, 389, 264, 295, 201, 232, - 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 43, 11, 481, 512, - 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 513, - 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, 452, 483, - 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265, - 297, 328, 422, 453, 44, 12, 391, 422, 171, 202, 76, 107, 514, 545, - 453, 484, 544, 545, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, - 140, 171, 515, 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, - 45, 76, 172, 203, 330, 361, 576, 577, 45, 13, 267, 298, 546, 577, - 77, 108, 204, 235, 455, 486, 577, 608, 299, 330, 109, 140, 547, 578, - 14, 45, 46, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204, - 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, 142, 173, - 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142, - 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 49, 
17, 207, 238, - 49, 80, 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, - 51, 82, 83, 114, 608, 609, 484, 515, 360, 391, 236, 267, 112, 143, - 51, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392, - 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 52, 20, 672, 672, - 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, 393, 424, - 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, 145, - 52, 83, 21, 52, 53, 21, 704, 704, 673, 704, 642, 673, 611, 642, - 580, 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, - 363, 394, 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, - 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 54, 22, 705, 736, - 674, 705, 643, 674, 581, 612, 550, 581, 519, 550, 457, 488, 426, 457, - 395, 426, 333, 364, 302, 333, 271, 302, 209, 240, 178, 209, 147, 178, - 85, 116, 54, 85, 23, 54, 706, 737, 675, 706, 582, 613, 551, 582, - 458, 489, 427, 458, 334, 365, 303, 334, 210, 241, 179, 210, 86, 117, - 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, 87, 118, - 736, 737, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 55, 23, - 768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, - 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 56, 24, - 800, 800, 769, 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, - 521, 552, 490, 521, 428, 459, 397, 428, 366, 397, 304, 335, 273, 304, - 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 57, 25, - 832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708, 646, 677, - 615, 646, 584, 615, 553, 584, 522, 553, 491, 522, 460, 491, 429, 460, - 398, 429, 367, 398, 336, 367, 305, 336, 274, 305, 243, 274, 212, 243, - 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 58, 26, - 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, 616, - 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337, - 275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, - 834, 865, 803, 834, 710, 741, 679, 
710, 586, 617, 555, 586, 462, 493, - 431, 462, 338, 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, - 835, 866, 711, 742, 587, 618, 463, 494, 339, 370, 215, 246, 91, 122, - 864, 865, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151, - 59, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648, - 524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276, 152, 183, - 121, 152, 28, 59, 60, 28, 928, 928, 897, 928, 866, 897, 804, 835, - 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, 525, 556, - 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, 277, - 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 61, 29, 960, 960, - 929, 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, - 712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, - 495, 526, 464, 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, - 278, 309, 247, 278, 216, 247, 185, 216, 154, 185, 123, 154, 92, 123, - 61, 92, 30, 61, 62, 30, 961, 992, 930, 961, 899, 930, 837, 868, - 806, 837, 775, 806, 713, 744, 682, 713, 651, 682, 589, 620, 558, 589, - 527, 558, 465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310, - 217, 248, 186, 217, 155, 186, 93, 124, 62, 93, 31, 62, 962, 993, - 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621, 559, 590, - 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94, 125, - 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, - 219, 250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, - 248, 279, 124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, - 621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, - 156, 187, 125, 156, 932, 963, 901, 932, 870, 901, 808, 839, 777, 808, - 746, 777, 684, 715, 653, 684, 622, 653, 560, 591, 529, 560, 498, 529, - 436, 467, 405, 436, 374, 405, 312, 343, 281, 312, 250, 281, 188, 219, - 157, 188, 126, 157, 964, 995, 933, 964, 902, 933, 871, 902, 840, 871, - 809, 840, 778, 809, 747, 778, 716, 747, 685, 716, 
654, 685, 623, 654, - 592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468, 406, 437, - 375, 406, 344, 375, 313, 344, 282, 313, 251, 282, 220, 251, 189, 220, - 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841, 872, 810, 841, - 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593, 531, 562, - 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221, 252, - 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749, - 687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, - 222, 253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, - 347, 378, 223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, - 252, 283, 904, 935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, - 532, 563, 501, 532, 408, 439, 377, 408, 284, 315, 253, 284, 936, 967, - 905, 936, 874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688, - 626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440, 378, 409, - 316, 347, 285, 316, 254, 285, 968, 999, 937, 968, 906, 937, 875, 906, - 844, 875, 813, 844, 782, 813, 751, 782, 720, 751, 689, 720, 658, 689, - 627, 658, 596, 627, 565, 596, 534, 565, 503, 534, 472, 503, 441, 472, - 410, 441, 379, 410, 348, 379, 317, 348, 286, 317, 255, 286, 969, 1000, - 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721, - 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473, 411, 442, - 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815, 846, - 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381, - 319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, - 876, 907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, - 784, 815, 753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, - 381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, - 692, 723, 661, 692, 630, 661, 568, 599, 537, 568, 506, 537, 444, 475, - 413, 444, 382, 413, 972, 1003, 941, 972, 910, 941, 879, 910, 848, 879, - 817, 848, 786, 817, 755, 786, 
724, 755, 693, 724, 662, 693, 631, 662, - 600, 631, 569, 600, 538, 569, 507, 538, 476, 507, 445, 476, 414, 445, - 383, 414, 973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818, - 725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570, 477, 508, - 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881, 819, 850, 726, 757, - 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975, 1006, 851, 882, - 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663, 508, 539, - 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540, 571, - 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789, - 696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, - 945, 976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, - 728, 759, 697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, - 511, 542, 977, 1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, - 729, 760, 698, 729, 667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, - 947, 978, 854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606, - 979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791, 636, 667, - 916, 947, 885, 916, 792, 823, 761, 792, 668, 699, 637, 668, 948, 979, - 917, 948, 886, 917, 824, 855, 793, 824, 762, 793, 700, 731, 669, 700, - 638, 669, 980, 1011, 949, 980, 918, 949, 887, 918, 856, 887, 825, 856, - 794, 825, 763, 794, 732, 763, 701, 732, 670, 701, 639, 670, 981, 1012, - 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733, - 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765, 703, 734, - 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889, 920, - 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828, - 766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, - 798, 829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, - 799, 830, 986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, - 892, 923, 924, 955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 
- 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, - 959, 990, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - v2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33, - 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66, - 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67, - 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160, - 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160, - 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101, - 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133, - 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196, - 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71, - 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135, - 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198, - 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136, - 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41, - 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322, - 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10, - 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353, - 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325, - 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295, - 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326, - 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296, - 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44, - 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418, - 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359, - 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416, - 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360, - 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141, - 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 
452, - 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78, - 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482, - 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454, - 143, 174, 269, 300, 393, 424, 453, 484, 480, 480, 481, 512, 238, 269, - 424, 455, 482, 513, 175, 206, 454, 485, 332, 363, 363, 394, 483, 514, - 301, 332, 394, 425, 484, 515, 207, 238, 455, 486, 270, 301, 425, 456, - 485, 516, 364, 395, 239, 270, 456, 487, 512, 512, 333, 364, 395, 426, - 513, 544, 486, 517, 514, 545, 302, 333, 426, 457, 515, 546, 487, 518, - 516, 547, 271, 302, 457, 488, 365, 396, 396, 427, 517, 548, 334, 365, - 427, 458, 488, 519, 544, 544, 303, 334, 458, 489, 518, 549, 545, 576, - 546, 577, 547, 578, 489, 520, 397, 428, 519, 550, 366, 397, 428, 459, - 548, 579, 335, 366, 459, 490, 549, 580, 520, 551, 490, 521, 550, 581, - 576, 576, 577, 608, 398, 429, 429, 460, 578, 609, 367, 398, 460, 491, - 521, 552, 579, 610, 551, 582, 491, 522, 580, 611, 581, 612, 552, 583, - 522, 553, 430, 461, 399, 430, 461, 492, 582, 613, 492, 523, 608, 608, - 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643, - 431, 462, 462, 493, 554, 585, 493, 524, 584, 615, 613, 644, 524, 555, - 614, 645, 640, 640, 585, 616, 641, 672, 555, 586, 642, 673, 615, 646, - 463, 494, 643, 674, 494, 525, 644, 675, 525, 556, 586, 617, 616, 647, - 645, 676, 556, 587, 646, 677, 495, 526, 617, 648, 587, 618, 672, 672, - 526, 557, 673, 704, 674, 705, 647, 678, 557, 588, 675, 706, 618, 649, - 676, 707, 588, 619, 648, 679, 677, 708, 527, 558, 558, 589, 678, 709, - 619, 650, 649, 680, 704, 704, 589, 620, 705, 736, 679, 710, 706, 737, - 707, 738, 650, 681, 620, 651, 708, 739, 680, 711, 559, 590, 709, 740, - 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768, - 711, 742, 738, 769, 682, 713, 652, 683, 739, 770, 591, 622, 740, 771, - 712, 743, 622, 653, 741, 772, 683, 714, 653, 684, 713, 744, 742, 773, - 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 
745, 770, 801, - 771, 802, 654, 685, 744, 775, 772, 803, 715, 746, 773, 804, 685, 716, - 745, 776, 774, 805, 655, 686, 716, 747, 775, 806, 746, 777, 800, 800, - 801, 832, 686, 717, 802, 833, 803, 834, 776, 807, 804, 835, 747, 778, - 717, 748, 805, 836, 777, 808, 687, 718, 806, 837, 748, 779, 718, 749, - 778, 809, 807, 838, 832, 832, 833, 864, 834, 865, 835, 866, 808, 839, - 749, 780, 836, 867, 779, 810, 719, 750, 837, 868, 809, 840, 838, 869, - 780, 811, 750, 781, 810, 841, 839, 870, 864, 864, 865, 896, 866, 897, - 840, 871, 867, 898, 781, 812, 811, 842, 868, 899, 751, 782, 869, 900, - 841, 872, 812, 843, 870, 901, 782, 813, 842, 873, 871, 902, 896, 896, - 897, 928, 813, 844, 898, 929, 872, 903, 783, 814, 843, 874, 899, 930, - 900, 931, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933, 874, 905, - 903, 934, 845, 876, 928, 928, 815, 846, 929, 960, 930, 961, 875, 906, - 904, 935, 931, 962, 932, 963, 905, 936, 846, 877, 933, 964, 876, 907, - 934, 965, 906, 937, 935, 966, 877, 908, 847, 878, 960, 960, 907, 938, - 961, 992, 936, 967, 962, 993, 963, 994, 964, 995, 878, 909, 937, 968, - 908, 939, 965, 996, 966, 997, 938, 969, 879, 910, 909, 940, 967, 998, - 939, 970, 968, 999, 910, 941, 969, 1000, 940, 971, 970, 1001, 911, 942, - 941, 972, 971, 1002, 942, 973, 972, 1003, 943, 974, 973, 1004, 974, 1005, - 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, - 16, 16, 17, 48, 176, 207, 49, 80, 81, 112, 113, 144, 208, 239, - 145, 176, 240, 271, 17, 17, 18, 49, 177, 208, 50, 81, 82, 113, - 272, 303, 209, 240, 114, 145, 146, 177, 241, 272, 304, 335, 178, 209, - 18, 18, 19, 50, 51, 82, 83, 114, 273, 304, 210, 241, 115, 146, - 336, 367, 147, 178, 242, 273, 305, 336, 179, 210, 19, 19, 368, 399, - 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147, - 306, 337, 148, 179, 243, 274, 400, 431, 369, 400, 180, 211, 20, 20, - 21, 52, 275, 306, 53, 84, 338, 369, 212, 243, 85, 116, 432, 463, - 117, 148, 401, 432, 307, 338, 244, 275, 149, 180, 370, 401, 181, 212, - 
276, 307, 464, 495, 339, 370, 21, 21, 22, 53, 433, 464, 54, 85, - 213, 244, 86, 117, 402, 433, 118, 149, 308, 339, 245, 276, 371, 402, - 150, 181, 496, 527, 465, 496, 182, 213, 434, 465, 340, 371, 277, 308, - 22, 22, 23, 54, 403, 434, 55, 86, 214, 245, 87, 118, 309, 340, - 372, 403, 119, 150, 497, 528, 528, 559, 246, 277, 466, 497, 151, 182, - 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55, - 215, 246, 529, 560, 56, 87, 498, 529, 560, 591, 310, 341, 88, 119, - 373, 404, 467, 498, 120, 151, 247, 278, 436, 467, 152, 183, 342, 373, - 279, 310, 405, 436, 184, 215, 530, 561, 561, 592, 499, 530, 592, 623, - 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342, - 89, 120, 437, 468, 248, 279, 121, 152, 562, 593, 153, 184, 343, 374, - 531, 562, 593, 624, 406, 437, 500, 531, 624, 655, 280, 311, 185, 216, - 469, 500, 375, 406, 217, 248, 25, 25, 312, 343, 26, 57, 58, 89, - 438, 469, 90, 121, 563, 594, 594, 625, 249, 280, 532, 563, 625, 656, - 122, 153, 344, 375, 501, 532, 656, 687, 407, 438, 154, 185, 281, 312, - 470, 501, 186, 217, 376, 407, 595, 626, 564, 595, 626, 657, 218, 249, - 313, 344, 439, 470, 26, 26, 27, 58, 533, 564, 657, 688, 59, 90, - 91, 122, 250, 281, 502, 533, 688, 719, 123, 154, 408, 439, 345, 376, - 155, 186, 471, 502, 282, 313, 596, 627, 627, 658, 187, 218, 565, 596, - 658, 689, 377, 408, 440, 471, 534, 565, 689, 720, 314, 345, 219, 250, - 27, 27, 28, 59, 503, 534, 720, 751, 60, 91, 92, 123, 251, 282, - 409, 440, 346, 377, 124, 155, 628, 659, 472, 503, 597, 628, 659, 690, - 566, 597, 690, 721, 156, 187, 283, 314, 535, 566, 721, 752, 188, 219, - 378, 409, 441, 472, 315, 346, 504, 535, 752, 783, 220, 251, 28, 28, - 629, 660, 660, 691, 29, 60, 61, 92, 410, 441, 598, 629, 691, 722, - 252, 283, 93, 124, 347, 378, 473, 504, 567, 598, 722, 753, 125, 156, - 284, 315, 536, 567, 753, 784, 157, 188, 442, 473, 379, 410, 189, 220, - 505, 536, 784, 815, 661, 692, 316, 347, 630, 661, 692, 723, 221, 252, - 599, 630, 723, 754, 411, 442, 29, 29, 
568, 599, 754, 785, 30, 61, - 474, 505, 62, 93, 253, 284, 348, 379, 94, 125, 537, 568, 785, 816, - 126, 157, 285, 316, 158, 189, 443, 474, 662, 693, 693, 724, 380, 411, - 631, 662, 724, 755, 506, 537, 816, 847, 190, 221, 600, 631, 755, 786, - 317, 348, 222, 253, 569, 600, 786, 817, 412, 443, 475, 506, 30, 30, - 31, 62, 349, 380, 254, 285, 63, 94, 538, 569, 817, 848, 694, 725, - 95, 126, 663, 694, 725, 756, 632, 663, 756, 787, 127, 158, 444, 475, - 286, 317, 381, 412, 507, 538, 848, 879, 159, 190, 601, 632, 787, 818, - 191, 222, 318, 349, 570, 601, 818, 849, 476, 507, 223, 254, 413, 444, - 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381, - 255, 286, 633, 664, 788, 819, 445, 476, 602, 633, 819, 850, 508, 539, - 880, 911, 287, 318, 382, 413, 571, 602, 850, 881, 727, 758, 696, 727, - 758, 789, 319, 350, 477, 508, 665, 696, 789, 820, 414, 445, 540, 571, - 881, 912, 634, 665, 820, 851, 351, 382, 603, 634, 851, 882, 446, 477, - 509, 540, 912, 943, 383, 414, 728, 759, 759, 790, 572, 603, 882, 913, - 697, 728, 790, 821, 666, 697, 821, 852, 478, 509, 635, 666, 852, 883, - 415, 446, 541, 572, 913, 944, 604, 635, 883, 914, 760, 791, 729, 760, - 791, 822, 510, 541, 944, 975, 447, 478, 698, 729, 822, 853, 573, 604, - 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 542, 573, - 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854, - 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699, - 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793, - 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917, - 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825, - 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010, - 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888, - 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733, - 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920, - 765, 796, 920, 951, 734, 765, 
951, 982, 703, 734, 982, 1013, 859, 890, - 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766, - 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984, - 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985, - 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862, - 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957, - 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990, - 990, 1021, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - h2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33, - 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66, - 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67, - 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160, - 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160, - 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101, - 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133, - 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196, - 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71, - 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135, - 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198, - 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136, - 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41, - 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322, - 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10, - 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353, - 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325, - 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295, - 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326, - 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296, - 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44, - 203, 234, 
327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418, - 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359, - 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416, - 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360, - 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141, - 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452, - 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78, - 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482, - 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454, - 143, 174, 269, 300, 393, 424, 453, 484, 15, 15, 16, 47, 48, 79, - 238, 269, 424, 455, 175, 206, 454, 485, 80, 111, 332, 363, 363, 394, - 301, 332, 394, 425, 112, 143, 207, 238, 455, 486, 270, 301, 425, 456, - 144, 175, 364, 395, 16, 16, 239, 270, 456, 487, 17, 48, 333, 364, - 395, 426, 176, 207, 49, 80, 302, 333, 426, 457, 81, 112, 113, 144, - 208, 239, 271, 302, 457, 488, 365, 396, 396, 427, 145, 176, 334, 365, - 427, 458, 240, 271, 17, 17, 18, 49, 177, 208, 303, 334, 458, 489, - 50, 81, 82, 113, 272, 303, 209, 240, 397, 428, 114, 145, 366, 397, - 428, 459, 335, 366, 459, 490, 146, 177, 241, 272, 304, 335, 178, 209, - 18, 18, 19, 50, 51, 82, 398, 429, 429, 460, 367, 398, 460, 491, - 83, 114, 273, 304, 210, 241, 115, 146, 336, 367, 147, 178, 242, 273, - 305, 336, 430, 461, 399, 430, 461, 492, 179, 210, 19, 19, 368, 399, - 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147, - 431, 462, 462, 493, 306, 337, 148, 179, 243, 274, 400, 431, 369, 400, - 180, 211, 20, 20, 21, 52, 275, 306, 53, 84, 338, 369, 212, 243, - 85, 116, 463, 494, 432, 463, 117, 148, 401, 432, 307, 338, 244, 275, - 149, 180, 370, 401, 181, 212, 276, 307, 464, 495, 339, 370, 21, 21, - 22, 53, 433, 464, 54, 85, 213, 244, 86, 117, 402, 433, 118, 149, - 308, 339, 245, 276, 371, 402, 150, 181, 465, 496, 182, 213, 434, 465, - 340, 371, 277, 308, 22, 22, 23, 54, 403, 434, 55, 86, 214, 
245, - 87, 118, 309, 340, 372, 403, 119, 150, 246, 277, 466, 497, 151, 182, - 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55, - 215, 246, 56, 87, 310, 341, 88, 119, 373, 404, 467, 498, 120, 151, - 247, 278, 436, 467, 152, 183, 342, 373, 279, 310, 405, 436, 184, 215, - 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342, - 89, 120, 437, 468, 248, 279, 121, 152, 153, 184, 343, 374, 406, 437, - 280, 311, 185, 216, 469, 500, 375, 406, 217, 248, 25, 25, 312, 343, - 26, 57, 58, 89, 438, 469, 90, 121, 249, 280, 122, 153, 344, 375, - 407, 438, 154, 185, 281, 312, 470, 501, 186, 217, 376, 407, 218, 249, - 313, 344, 439, 470, 26, 26, 27, 58, 59, 90, 91, 122, 250, 281, - 123, 154, 408, 439, 345, 376, 155, 186, 471, 502, 282, 313, 187, 218, - 377, 408, 440, 471, 314, 345, 219, 250, 27, 27, 28, 59, 60, 91, - 92, 123, 251, 282, 409, 440, 346, 377, 124, 155, 472, 503, 156, 187, - 283, 314, 188, 219, 378, 409, 441, 472, 315, 346, 220, 251, 28, 28, - 29, 60, 61, 92, 410, 441, 252, 283, 93, 124, 347, 378, 473, 504, - 125, 156, 284, 315, 157, 188, 442, 473, 379, 410, 189, 220, 316, 347, - 221, 252, 411, 442, 29, 29, 30, 61, 474, 505, 62, 93, 253, 284, - 348, 379, 94, 125, 126, 157, 285, 316, 158, 189, 443, 474, 380, 411, - 190, 221, 317, 348, 222, 253, 412, 443, 475, 506, 30, 30, 31, 62, - 349, 380, 254, 285, 63, 94, 95, 126, 127, 158, 444, 475, 286, 317, - 381, 412, 159, 190, 191, 222, 318, 349, 476, 507, 223, 254, 413, 444, - 350, 381, 255, 286, 445, 476, 287, 318, 382, 413, 319, 350, 477, 508, - 414, 445, 351, 382, 446, 477, 383, 414, 478, 509, 415, 446, 447, 478, - 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516, - 512, 512, 513, 544, 486, 517, 514, 545, 515, 546, 487, 518, 516, 547, - 517, 548, 488, 519, 544, 544, 518, 549, 545, 576, 546, 577, 547, 578, - 489, 520, 519, 550, 548, 579, 549, 580, 520, 551, 490, 521, 550, 581, - 576, 576, 577, 608, 578, 609, 521, 552, 579, 610, 551, 582, 491, 522, - 580, 611, 581, 612, 552, 583, 
522, 553, 582, 613, 492, 523, 608, 608, - 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643, - 554, 585, 493, 524, 584, 615, 613, 644, 524, 555, 614, 645, 640, 640, - 585, 616, 641, 672, 555, 586, 642, 673, 615, 646, 643, 674, 494, 525, - 644, 675, 525, 556, 586, 617, 616, 647, 645, 676, 556, 587, 646, 677, - 495, 526, 617, 648, 587, 618, 672, 672, 526, 557, 673, 704, 674, 705, - 647, 678, 557, 588, 675, 706, 618, 649, 676, 707, 588, 619, 648, 679, - 677, 708, 496, 527, 527, 558, 558, 589, 678, 709, 619, 650, 649, 680, - 704, 704, 589, 620, 705, 736, 679, 710, 706, 737, 707, 738, 650, 681, - 620, 651, 497, 528, 528, 559, 708, 739, 680, 711, 559, 590, 709, 740, - 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768, - 529, 560, 711, 742, 498, 529, 560, 591, 738, 769, 682, 713, 652, 683, - 739, 770, 591, 622, 740, 771, 712, 743, 622, 653, 741, 772, 683, 714, - 653, 684, 713, 744, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623, - 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745, 770, 801, - 771, 802, 654, 685, 744, 775, 772, 803, 562, 593, 531, 562, 593, 624, - 715, 746, 773, 804, 685, 716, 500, 531, 624, 655, 745, 776, 774, 805, - 655, 686, 716, 747, 775, 806, 746, 777, 800, 800, 801, 832, 686, 717, - 802, 833, 563, 594, 594, 625, 803, 834, 532, 563, 625, 656, 776, 807, - 804, 835, 501, 532, 656, 687, 747, 778, 717, 748, 805, 836, 777, 808, - 687, 718, 806, 837, 748, 779, 595, 626, 564, 595, 626, 657, 718, 749, - 778, 809, 807, 838, 832, 832, 533, 564, 657, 688, 833, 864, 834, 865, - 835, 866, 502, 533, 688, 719, 808, 839, 749, 780, 836, 867, 779, 810, - 719, 750, 837, 868, 809, 840, 596, 627, 627, 658, 565, 596, 658, 689, - 838, 869, 780, 811, 750, 781, 534, 565, 689, 720, 810, 841, 839, 870, - 864, 864, 503, 534, 720, 751, 865, 896, 866, 897, 840, 871, 867, 898, - 781, 812, 811, 842, 628, 659, 868, 899, 751, 782, 597, 628, 659, 690, - 566, 597, 690, 721, 869, 900, 841, 872, 535, 566, 721, 752, 812, 843, - 870, 901, 
782, 813, 842, 873, 504, 535, 752, 783, 871, 902, 629, 660, - 660, 691, 896, 896, 897, 928, 598, 629, 691, 722, 813, 844, 898, 929, - 872, 903, 783, 814, 843, 874, 899, 930, 567, 598, 722, 753, 900, 931, - 536, 567, 753, 784, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933, - 505, 536, 784, 815, 661, 692, 630, 661, 692, 723, 874, 905, 599, 630, - 723, 754, 903, 934, 845, 876, 568, 599, 754, 785, 928, 928, 815, 846, - 929, 960, 930, 961, 875, 906, 904, 935, 931, 962, 537, 568, 785, 816, - 932, 963, 905, 936, 662, 693, 693, 724, 846, 877, 933, 964, 876, 907, - 631, 662, 724, 755, 506, 537, 816, 847, 934, 965, 600, 631, 755, 786, - 906, 937, 569, 600, 786, 817, 935, 966, 877, 908, 847, 878, 960, 960, - 907, 938, 961, 992, 936, 967, 538, 569, 817, 848, 962, 993, 694, 725, - 663, 694, 725, 756, 963, 994, 632, 663, 756, 787, 964, 995, 878, 909, - 937, 968, 507, 538, 848, 879, 908, 939, 601, 632, 787, 818, 965, 996, - 966, 997, 570, 601, 818, 849, 938, 969, 879, 910, 909, 940, 967, 998, - 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 939, 970, - 633, 664, 788, 819, 968, 999, 602, 633, 819, 850, 910, 941, 508, 539, - 880, 911, 969, 1000, 940, 971, 571, 602, 850, 881, 727, 758, 696, 727, - 758, 789, 970, 1001, 665, 696, 789, 820, 911, 942, 941, 972, 540, 571, - 881, 912, 634, 665, 820, 851, 971, 1002, 603, 634, 851, 882, 942, 973, - 509, 540, 912, 943, 728, 759, 759, 790, 972, 1003, 572, 603, 882, 913, - 697, 728, 790, 821, 666, 697, 821, 852, 943, 974, 635, 666, 852, 883, - 541, 572, 913, 944, 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760, - 791, 822, 510, 541, 944, 975, 974, 1005, 698, 729, 822, 853, 573, 604, - 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 975, 1006, 542, 573, - 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854, - 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699, - 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793, - 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 
731, 886, 917, - 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825, - 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010, - 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888, - 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733, - 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920, - 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890, - 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766, - 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984, - 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985, - 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862, - 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957, - 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990, - 990, 1021, 991, 1022, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - qtr_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33, - 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66, - 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67, - 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160, - 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160, - 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101, - 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133, - 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196, - 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71, - 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135, - 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198, - 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136, - 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41, - 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322, - 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10, - 320, 320, 11, 
42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353, - 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325, - 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295, - 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326, - 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296, - 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44, - 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418, - 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359, - 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416, - 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360, - 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141, - 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452, - 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78, - 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482, - 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454, - 143, 174, 269, 300, 393, 424, 453, 484, 238, 269, 424, 455, 175, 206, - 454, 485, 332, 363, 363, 394, 301, 332, 394, 425, 207, 238, 455, 486, - 270, 301, 425, 456, 364, 395, 239, 270, 456, 487, 333, 364, 395, 426, - 302, 333, 426, 457, 271, 302, 457, 488, 365, 396, 396, 427, 334, 365, - 427, 458, 303, 334, 458, 489, 397, 428, 366, 397, 428, 459, 335, 366, - 459, 490, 398, 429, 429, 460, 367, 398, 460, 491, 430, 461, 399, 430, - 461, 492, 431, 462, 462, 493, 463, 494, 15, 15, 480, 480, 16, 47, - 481, 512, 48, 79, 482, 513, 80, 111, 483, 514, 112, 143, 484, 515, - 144, 175, 485, 516, 16, 16, 512, 512, 17, 48, 513, 544, 176, 207, - 486, 517, 49, 80, 514, 545, 81, 112, 515, 546, 113, 144, 208, 239, - 487, 518, 516, 547, 145, 176, 517, 548, 240, 271, 488, 519, 17, 17, - 544, 544, 18, 49, 177, 208, 518, 549, 545, 576, 50, 81, 546, 577, - 82, 113, 547, 578, 272, 303, 489, 520, 209, 240, 519, 550, 114, 145, - 548, 579, 146, 177, 549, 580, 241, 272, 520, 551, 
304, 335, 490, 521, - 178, 209, 550, 581, 18, 18, 576, 576, 19, 50, 577, 608, 51, 82, - 578, 609, 83, 114, 273, 304, 521, 552, 579, 610, 210, 241, 551, 582, - 115, 146, 336, 367, 491, 522, 580, 611, 147, 178, 581, 612, 242, 273, - 552, 583, 305, 336, 522, 553, 179, 210, 582, 613, 19, 19, 368, 399, - 492, 523, 608, 608, 20, 51, 609, 640, 52, 83, 610, 641, 274, 305, - 553, 584, 84, 115, 611, 642, 211, 242, 337, 368, 523, 554, 583, 614, - 116, 147, 612, 643, 306, 337, 554, 585, 148, 179, 243, 274, 400, 431, - 493, 524, 584, 615, 613, 644, 369, 400, 524, 555, 180, 211, 614, 645, - 20, 20, 640, 640, 21, 52, 275, 306, 585, 616, 641, 672, 53, 84, - 338, 369, 555, 586, 642, 673, 212, 243, 615, 646, 85, 116, 643, 674, - 432, 463, 494, 525, 117, 148, 644, 675, 401, 432, 525, 556, 307, 338, - 586, 617, 244, 275, 616, 647, 149, 180, 645, 676, 370, 401, 556, 587, - 181, 212, 646, 677, 276, 307, 464, 495, 495, 526, 617, 648, 339, 370, - 587, 618, 21, 21, 672, 672, 22, 53, 433, 464, 526, 557, 673, 704, - 54, 85, 674, 705, 213, 244, 647, 678, 86, 117, 402, 433, 557, 588, - 675, 706, 118, 149, 308, 339, 618, 649, 676, 707, 245, 276, 371, 402, - 588, 619, 648, 679, 150, 181, 677, 708, 496, 527, 465, 496, 527, 558, - 182, 213, 434, 465, 558, 589, 678, 709, 340, 371, 619, 650, 277, 308, - 649, 680, 22, 22, 704, 704, 23, 54, 403, 434, 589, 620, 705, 736, - 55, 86, 214, 245, 679, 710, 706, 737, 87, 118, 707, 738, 309, 340, - 650, 681, 372, 403, 620, 651, 119, 150, 497, 528, 528, 559, 708, 739, - 246, 277, 680, 711, 466, 497, 559, 590, 151, 182, 709, 740, 435, 466, - 590, 621, 341, 372, 651, 682, 183, 214, 278, 309, 681, 712, 710, 741, - 404, 435, 621, 652, 23, 23, 736, 736, 24, 55, 737, 768, 215, 246, - 529, 560, 711, 742, 56, 87, 498, 529, 560, 591, 738, 769, 310, 341, - 682, 713, 88, 119, 373, 404, 652, 683, 739, 770, 467, 498, 591, 622, - 120, 151, 740, 771, 247, 278, 712, 743, 436, 467, 622, 653, 152, 183, - 741, 772, 342, 373, 683, 714, 279, 310, 405, 436, 653, 684, 713, 744, - 
184, 215, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623, 24, 24, - 216, 247, 468, 499, 623, 654, 743, 774, 768, 768, 25, 56, 769, 800, - 374, 405, 684, 715, 57, 88, 311, 342, 714, 745, 770, 801, 89, 120, - 771, 802, 437, 468, 654, 685, 248, 279, 744, 775, 121, 152, 772, 803, - 562, 593, 153, 184, 343, 374, 531, 562, 593, 624, 715, 746, 773, 804, - 406, 437, 685, 716, 500, 531, 624, 655, 280, 311, 745, 776, 185, 216, - 774, 805, 469, 500, 655, 686, 375, 406, 716, 747, 217, 248, 775, 806, - 25, 25, 312, 343, 746, 777, 800, 800, 26, 57, 801, 832, 58, 89, - 438, 469, 686, 717, 802, 833, 90, 121, 563, 594, 594, 625, 803, 834, - 249, 280, 532, 563, 625, 656, 776, 807, 122, 153, 804, 835, 344, 375, - 501, 532, 656, 687, 747, 778, 407, 438, 717, 748, 154, 185, 805, 836, - 281, 312, 777, 808, 470, 501, 687, 718, 186, 217, 806, 837, 376, 407, - 748, 779, 595, 626, 564, 595, 626, 657, 218, 249, 313, 344, 439, 470, - 718, 749, 778, 809, 807, 838, 26, 26, 832, 832, 27, 58, 533, 564, - 657, 688, 833, 864, 59, 90, 834, 865, 91, 122, 835, 866, 250, 281, - 502, 533, 688, 719, 808, 839, 123, 154, 408, 439, 749, 780, 836, 867, - 345, 376, 779, 810, 155, 186, 471, 502, 719, 750, 837, 868, 282, 313, - 809, 840, 596, 627, 627, 658, 187, 218, 565, 596, 658, 689, 838, 869, - 377, 408, 780, 811, 440, 471, 750, 781, 534, 565, 689, 720, 314, 345, - 810, 841, 219, 250, 839, 870, 27, 27, 864, 864, 28, 59, 503, 534, - 720, 751, 865, 896, 60, 91, 866, 897, 92, 123, 251, 282, 840, 871, - 867, 898, 409, 440, 781, 812, 346, 377, 811, 842, 124, 155, 628, 659, - 868, 899, 472, 503, 751, 782, 597, 628, 659, 690, 566, 597, 690, 721, - 156, 187, 869, 900, 283, 314, 841, 872, 535, 566, 721, 752, 188, 219, - 378, 409, 812, 843, 870, 901, 441, 472, 782, 813, 315, 346, 842, 873, - 504, 535, 752, 783, 220, 251, 871, 902, 28, 28, 629, 660, 660, 691, - 896, 896, 29, 60, 897, 928, 61, 92, 410, 441, 598, 629, 691, 722, - 813, 844, 898, 929, 252, 283, 872, 903, 93, 124, 347, 378, 473, 504, - 783, 814, 843, 
874, 899, 930, 567, 598, 722, 753, 125, 156, 900, 931, - 284, 315, 536, 567, 753, 784, 873, 904, 157, 188, 901, 932, 442, 473, - 814, 845, 379, 410, 844, 875, 189, 220, 902, 933, 505, 536, 784, 815, - 661, 692, 316, 347, 630, 661, 692, 723, 874, 905, 221, 252, 599, 630, - 723, 754, 903, 934, 411, 442, 845, 876, 29, 29, 568, 599, 754, 785, - 928, 928, 30, 61, 474, 505, 815, 846, 929, 960, 62, 93, 930, 961, - 253, 284, 348, 379, 875, 906, 904, 935, 94, 125, 931, 962, 537, 568, - 785, 816, 126, 157, 932, 963, 285, 316, 905, 936, 158, 189, 443, 474, - 662, 693, 693, 724, 846, 877, 933, 964, 380, 411, 876, 907, 631, 662, - 724, 755, 506, 537, 816, 847, 190, 221, 934, 965, 600, 631, 755, 786, - 317, 348, 906, 937, 222, 253, 569, 600, 786, 817, 935, 966, 412, 443, - 877, 908, 475, 506, 847, 878, 30, 30, 960, 960, 31, 62, 349, 380, - 907, 938, 961, 992, 254, 285, 936, 967, 63, 94, 538, 569, 817, 848, - 962, 993, 694, 725, 95, 126, 663, 694, 725, 756, 963, 994, 632, 663, - 756, 787, 127, 158, 964, 995, 444, 475, 878, 909, 286, 317, 937, 968, - 381, 412, 507, 538, 848, 879, 908, 939, 159, 190, 601, 632, 787, 818, - 965, 996, 191, 222, 966, 997, 318, 349, 570, 601, 818, 849, 938, 969, - 476, 507, 879, 910, 223, 254, 413, 444, 909, 940, 967, 998, 695, 726, - 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381, 939, 970, - 255, 286, 633, 664, 788, 819, 968, 999, 445, 476, 602, 633, 819, 850, - 910, 941, 508, 539, 880, 911, 287, 318, 969, 1000, 382, 413, 940, 971, - 571, 602, 850, 881, 727, 758, 696, 727, 758, 789, 319, 350, 970, 1001, - 477, 508, 665, 696, 789, 820, 911, 942, 414, 445, 941, 972, 540, 571, - 881, 912, 634, 665, 820, 851, 351, 382, 971, 1002, 603, 634, 851, 882, - 446, 477, 942, 973, 509, 540, 912, 943, 383, 414, 728, 759, 759, 790, - 972, 1003, 572, 603, 882, 913, 697, 728, 790, 821, 666, 697, 821, 852, - 478, 509, 943, 974, 635, 666, 852, 883, 415, 446, 541, 572, 913, 944, - 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760, 791, 822, 510, 541, - 944, 
975, 447, 478, 974, 1005, 698, 729, 822, 853, 573, 604, 914, 945, - 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 975, 1006, 542, 573, - 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854, - 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699, - 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793, - 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917, - 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825, - 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010, - 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888, - 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733, - 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920, - 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890, - 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766, - 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984, - 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985, - 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862, - 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957, - 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990, + 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2, + 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34, + 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, + 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5, + 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, + 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100, + 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101, + 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288, + 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102, + 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103, + 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320, + 320, 320, 352, 
352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228, + 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11, + 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198, + 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384, + 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261, + 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44, + 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, + 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, + 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418, + 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201, + 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16, + 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233, + 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450, + 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482, + 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265, + 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48, + 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173, + 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, 328, 359, 359, 390, + 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576, + 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453, + 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236, + 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19, + 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, + 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, + 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640, + 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548, + 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331, + 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114, + 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115, + 115, 146, 
146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332, + 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549, + 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736, + 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550, + 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333, + 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116, + 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117, + 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, + 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551, + 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768, + 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676, + 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459, + 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242, + 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25, + 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212, + 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429, + 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 615, 646, + 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832, + 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709, + 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492, + 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275, + 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58, + 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, + 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400, + 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617, + 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834, + 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866, + 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649, + 587, 618, 556, 587, 525, 556, 494, 
525, 463, 494, 432, 463, 401, 432, + 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215, + 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30, + 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247, + 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464, + 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681, + 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898, + 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899, + 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682, + 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465, + 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248, + 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94, + 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311, + 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528, + 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745, + 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962, + 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839, + 777, 808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622, + 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405, + 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188, + 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282, + 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499, + 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716, + 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933, + 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872, + 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655, + 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438, + 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221, + 159, 190, 191, 222, 222, 253, 253, 284, 
284, 315, 315, 346, 346, 377, + 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594, + 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811, + 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998, + 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781, + 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564, + 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347, + 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379, + 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596, + 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813, + 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000, + 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783, + 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566, + 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349, + 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505, + 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722, + 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 908, 939, + 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878, + 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661, + 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444, + 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538, + 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755, + 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972, + 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849, + 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632, + 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478, + 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695, + 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912, + 912, 943, 943, 
974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913, + 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696, + 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542, + 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759, + 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976, + 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853, + 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636, + 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730, + 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947, + 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886, + 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669, + 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825, + 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012, + 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795, + 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827, + 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014, + 952, 983, 921, 952, 890, 921, 859, 890, 828, 859, 797, 828, 766, 797, + 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953, + 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892, + 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986, + 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926, + 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990, 990, 1021, 991, 1022, 0, 0 }; -#if CONFIG_TX64X64 -DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x64_neighbors[2049 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, - 2, 2, 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, - 65, 96, 96, 96, 4, 4, 4, 35, 35, 66, 66, 97, 97, - 128, 128, 128, 5, 5, 5, 36, 36, 67, 67, 98, 98, 129, - 129, 160, 160, 160, 6, 6, 6, 37, 37, 68, 68, 99, 99, - 130, 130, 
161, 161, 192, 192, 192, 7, 7, 7, 38, 38, 69, - 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224, 8, - 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, - 194, 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, - 102, 102, 133, 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, - 288, 288, 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, - 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320, 320, 320, - 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, - 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352, 352, 352, - 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, - 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, - 384, 384, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, - 168, 168, 199, 199, 230, 230, 261, 261, 292, 292, 323, 323, 354, - 354, 385, 385, 416, 416, 416, 14, 14, 14, 45, 45, 76, 76, - 107, 107, 138, 138, 169, 169, 200, 200, 231, 231, 262, 262, 293, - 293, 324, 324, 355, 355, 386, 386, 417, 417, 448, 448, 448, 15, - 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170, 201, - 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, - 418, 418, 449, 449, 480, 480, 480, 16, 16, 16, 47, 47, 78, - 78, 109, 109, 140, 140, 171, 171, 202, 202, 233, 233, 264, 264, - 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450, 450, 481, - 481, 512, 512, 512, 17, 17, 17, 48, 48, 79, 79, 110, 110, - 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, - 327, 358, 358, 389, 389, 420, 420, 451, 451, 482, 482, 513, 513, - 544, 544, 544, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, - 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, 328, - 359, 359, 390, 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, - 545, 576, 576, 576, 19, 19, 19, 50, 50, 81, 81, 112, 112, - 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298, 298, 329, - 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 484, 515, 515, - 546, 546, 577, 577, 608, 608, 608, 20, 20, 20, 51, 51, 82, - 82, 113, 113, 144, 144, 175, 175, 206, 
206, 237, 237, 268, 268, - 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, - 485, 516, 516, 547, 547, 578, 578, 609, 609, 640, 640, 640, 21, - 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, - 207, 238, 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, - 424, 424, 455, 455, 486, 486, 517, 517, 548, 548, 579, 579, 610, - 610, 641, 641, 672, 672, 672, 22, 22, 22, 53, 53, 84, 84, - 115, 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, - 301, 332, 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, - 518, 518, 549, 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, - 704, 704, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, - 178, 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, - 364, 395, 395, 426, 426, 457, 457, 488, 488, 519, 519, 550, 550, - 581, 581, 612, 612, 643, 643, 674, 674, 705, 705, 736, 736, 736, - 24, 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, - 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365, 396, - 396, 427, 427, 458, 458, 489, 489, 520, 520, 551, 551, 582, 582, - 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768, 768, 768, - 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, - 211, 211, 242, 242, 273, 273, 304, 304, 335, 335, 366, 366, 397, - 397, 428, 428, 459, 459, 490, 490, 521, 521, 552, 552, 583, 583, - 614, 614, 645, 645, 676, 676, 707, 707, 738, 738, 769, 769, 800, - 800, 800, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, - 181, 181, 212, 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, - 367, 398, 398, 429, 429, 460, 460, 491, 491, 522, 522, 553, 553, - 584, 584, 615, 615, 646, 646, 677, 677, 708, 708, 739, 739, 770, - 770, 801, 801, 832, 832, 832, 27, 27, 27, 58, 58, 89, 89, - 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275, 306, - 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 492, - 523, 523, 554, 554, 585, 585, 616, 616, 647, 647, 678, 678, 709, - 709, 740, 740, 771, 771, 802, 802, 833, 833, 864, 864, 864, 28, - 28, 28, 59, 
59, 90, 90, 121, 121, 152, 152, 183, 183, 214, - 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, - 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617, - 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, - 834, 834, 865, 865, 896, 896, 896, 29, 29, 29, 60, 60, 91, - 91, 122, 122, 153, 153, 184, 184, 215, 215, 246, 246, 277, 277, - 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463, 463, 494, - 494, 525, 525, 556, 556, 587, 587, 618, 618, 649, 649, 680, 680, - 711, 711, 742, 742, 773, 773, 804, 804, 835, 835, 866, 866, 897, - 897, 928, 928, 928, 30, 30, 30, 61, 61, 92, 92, 123, 123, - 154, 154, 185, 185, 216, 216, 247, 247, 278, 278, 309, 309, 340, - 340, 371, 371, 402, 402, 433, 433, 464, 464, 495, 495, 526, 526, - 557, 557, 588, 588, 619, 619, 650, 650, 681, 681, 712, 712, 743, - 743, 774, 774, 805, 805, 836, 836, 867, 867, 898, 898, 929, 929, - 960, 960, 960, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, - 186, 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, - 403, 403, 434, 434, 465, 465, 496, 496, 527, 527, 558, 558, 589, - 589, 620, 620, 651, 651, 682, 682, 713, 713, 744, 744, 775, 775, - 806, 806, 837, 837, 868, 868, 899, 899, 930, 930, 961, 961, 992, - 992, 992, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218, - 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, - 435, 466, 466, 497, 497, 528, 528, 559, 559, 590, 590, 621, 621, - 652, 652, 683, 683, 714, 714, 745, 745, 776, 776, 807, 807, 838, - 838, 869, 869, 900, 900, 931, 931, 962, 962, 993, 993, 1024, 1024, - 1024, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, - 281, 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, - 498, 498, 529, 529, 560, 560, 591, 591, 622, 622, 653, 653, 684, - 684, 715, 715, 746, 746, 777, 777, 808, 808, 839, 839, 870, 870, - 901, 901, 932, 932, 963, 963, 994, 994, 1025, 1025, 1056, 1056, 1056, - 127, 158, 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, - 344, 344, 375, 375, 
406, 406, 437, 437, 468, 468, 499, 499, 530, - 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716, 716, - 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933, - 933, 964, 964, 995, 995, 1026, 1026, 1057, 1057, 1088, 1088, 1088, 159, - 190, 190, 221, 221, 252, 252, 283, 283, 314, 314, 345, 345, 376, - 376, 407, 407, 438, 438, 469, 469, 500, 500, 531, 531, 562, 562, - 593, 593, 624, 624, 655, 655, 686, 686, 717, 717, 748, 748, 779, - 779, 810, 810, 841, 841, 872, 872, 903, 903, 934, 934, 965, 965, - 996, 996, 1027, 1027, 1058, 1058, 1089, 1089, 1120, 1120, 1120, 191, 222, - 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, - 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594, 594, 625, - 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811, 811, - 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 997, 1028, - 1028, 1059, 1059, 1090, 1090, 1121, 1121, 1152, 1152, 1152, 223, 254, 254, - 285, 285, 316, 316, 347, 347, 378, 378, 409, 409, 440, 440, 471, - 471, 502, 502, 533, 533, 564, 564, 595, 595, 626, 626, 657, 657, - 688, 688, 719, 719, 750, 750, 781, 781, 812, 812, 843, 843, 874, - 874, 905, 905, 936, 936, 967, 967, 998, 998, 1029, 1029, 1060, 1060, - 1091, 1091, 1122, 1122, 1153, 1153, 1184, 1184, 1184, 255, 286, 286, 317, - 317, 348, 348, 379, 379, 410, 410, 441, 441, 472, 472, 503, 503, - 534, 534, 565, 565, 596, 596, 627, 627, 658, 658, 689, 689, 720, - 720, 751, 751, 782, 782, 813, 813, 844, 844, 875, 875, 906, 906, - 937, 937, 968, 968, 999, 999, 1030, 1030, 1061, 1061, 1092, 1092, 1123, - 1123, 1154, 1154, 1185, 1185, 1216, 1216, 1216, 287, 318, 318, 349, 349, - 380, 380, 411, 411, 442, 442, 473, 473, 504, 504, 535, 535, 566, - 566, 597, 597, 628, 628, 659, 659, 690, 690, 721, 721, 752, 752, - 783, 783, 814, 814, 845, 845, 876, 876, 907, 907, 938, 938, 969, - 969, 1000, 1000, 1031, 1031, 1062, 1062, 1093, 1093, 1124, 1124, 1155, 1155, - 1186, 1186, 1217, 1217, 1248, 1248, 1248, 319, 350, 350, 381, 381, 412, - 
412, 443, 443, 474, 474, 505, 505, 536, 536, 567, 567, 598, 598, - 629, 629, 660, 660, 691, 691, 722, 722, 753, 753, 784, 784, 815, - 815, 846, 846, 877, 877, 908, 908, 939, 939, 970, 970, 1001, 1001, - 1032, 1032, 1063, 1063, 1094, 1094, 1125, 1125, 1156, 1156, 1187, 1187, 1218, - 1218, 1249, 1249, 1280, 1280, 1280, 351, 382, 382, 413, 413, 444, 444, - 475, 475, 506, 506, 537, 537, 568, 568, 599, 599, 630, 630, 661, - 661, 692, 692, 723, 723, 754, 754, 785, 785, 816, 816, 847, 847, - 878, 878, 909, 909, 940, 940, 971, 971, 1002, 1002, 1033, 1033, 1064, - 1064, 1095, 1095, 1126, 1126, 1157, 1157, 1188, 1188, 1219, 1219, 1250, 1250, - 1281, 1281, 1312, 1312, 1312, 383, 414, 414, 445, 445, 476, 476, 507, - 507, 538, 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, - 724, 724, 755, 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, - 910, 941, 941, 972, 972, 1003, 1003, 1034, 1034, 1065, 1065, 1096, 1096, - 1127, 1127, 1158, 1158, 1189, 1189, 1220, 1220, 1251, 1251, 1282, 1282, 1313, - 1313, 1344, 1344, 1344, 415, 446, 446, 477, 477, 508, 508, 539, 539, - 570, 570, 601, 601, 632, 632, 663, 663, 694, 694, 725, 725, 756, - 756, 787, 787, 818, 818, 849, 849, 880, 880, 911, 911, 942, 942, - 973, 973, 1004, 1004, 1035, 1035, 1066, 1066, 1097, 1097, 1128, 1128, 1159, - 1159, 1190, 1190, 1221, 1221, 1252, 1252, 1283, 1283, 1314, 1314, 1345, 1345, - 1376, 1376, 1376, 447, 478, 478, 509, 509, 540, 540, 571, 571, 602, - 602, 633, 633, 664, 664, 695, 695, 726, 726, 757, 757, 788, 788, - 819, 819, 850, 850, 881, 881, 912, 912, 943, 943, 974, 974, 1005, - 1005, 1036, 1036, 1067, 1067, 1098, 1098, 1129, 1129, 1160, 1160, 1191, 1191, - 1222, 1222, 1253, 1253, 1284, 1284, 1315, 1315, 1346, 1346, 1377, 1377, 1408, - 1408, 1408, 479, 510, 510, 541, 541, 572, 572, 603, 603, 634, 634, - 665, 665, 696, 696, 727, 727, 758, 758, 789, 789, 820, 820, 851, - 851, 882, 882, 913, 913, 944, 944, 975, 975, 1006, 1006, 1037, 1037, - 1068, 1068, 1099, 1099, 1130, 1130, 1161, 1161, 1192, 1192, 
1223, 1223, 1254, - 1254, 1285, 1285, 1316, 1316, 1347, 1347, 1378, 1378, 1409, 1409, 1440, 1440, - 1440, 511, 542, 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, - 697, 728, 728, 759, 759, 790, 790, 821, 821, 852, 852, 883, 883, - 914, 914, 945, 945, 976, 976, 1007, 1007, 1038, 1038, 1069, 1069, 1100, - 1100, 1131, 1131, 1162, 1162, 1193, 1193, 1224, 1224, 1255, 1255, 1286, 1286, - 1317, 1317, 1348, 1348, 1379, 1379, 1410, 1410, 1441, 1441, 1472, 1472, 1472, - 543, 574, 574, 605, 605, 636, 636, 667, 667, 698, 698, 729, 729, - 760, 760, 791, 791, 822, 822, 853, 853, 884, 884, 915, 915, 946, - 946, 977, 977, 1008, 1008, 1039, 1039, 1070, 1070, 1101, 1101, 1132, 1132, - 1163, 1163, 1194, 1194, 1225, 1225, 1256, 1256, 1287, 1287, 1318, 1318, 1349, - 1349, 1380, 1380, 1411, 1411, 1442, 1442, 1473, 1473, 1504, 1504, 1504, 575, - 606, 606, 637, 637, 668, 668, 699, 699, 730, 730, 761, 761, 792, - 792, 823, 823, 854, 854, 885, 885, 916, 916, 947, 947, 978, 978, - 1009, 1009, 1040, 1040, 1071, 1071, 1102, 1102, 1133, 1133, 1164, 1164, 1195, - 1195, 1226, 1226, 1257, 1257, 1288, 1288, 1319, 1319, 1350, 1350, 1381, 1381, - 1412, 1412, 1443, 1443, 1474, 1474, 1505, 1505, 1536, 1536, 1536, 607, 638, - 638, 669, 669, 700, 700, 731, 731, 762, 762, 793, 793, 824, 824, - 855, 855, 886, 886, 917, 917, 948, 948, 979, 979, 1010, 1010, 1041, - 1041, 1072, 1072, 1103, 1103, 1134, 1134, 1165, 1165, 1196, 1196, 1227, 1227, - 1258, 1258, 1289, 1289, 1320, 1320, 1351, 1351, 1382, 1382, 1413, 1413, 1444, - 1444, 1475, 1475, 1506, 1506, 1537, 1537, 1568, 1568, 1568, 639, 670, 670, - 701, 701, 732, 732, 763, 763, 794, 794, 825, 825, 856, 856, 887, - 887, 918, 918, 949, 949, 980, 980, 1011, 1011, 1042, 1042, 1073, 1073, - 1104, 1104, 1135, 1135, 1166, 1166, 1197, 1197, 1228, 1228, 1259, 1259, 1290, - 1290, 1321, 1321, 1352, 1352, 1383, 1383, 1414, 1414, 1445, 1445, 1476, 1476, - 1507, 1507, 1538, 1538, 1569, 1569, 1600, 1600, 1600, 671, 702, 702, 733, - 733, 764, 764, 795, 795, 826, 826, 
857, 857, 888, 888, 919, 919, - 950, 950, 981, 981, 1012, 1012, 1043, 1043, 1074, 1074, 1105, 1105, 1136, - 1136, 1167, 1167, 1198, 1198, 1229, 1229, 1260, 1260, 1291, 1291, 1322, 1322, - 1353, 1353, 1384, 1384, 1415, 1415, 1446, 1446, 1477, 1477, 1508, 1508, 1539, - 1539, 1570, 1570, 1601, 1601, 1632, 1632, 1632, 703, 734, 734, 765, 765, - 796, 796, 827, 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, - 982, 1013, 1013, 1044, 1044, 1075, 1075, 1106, 1106, 1137, 1137, 1168, 1168, - 1199, 1199, 1230, 1230, 1261, 1261, 1292, 1292, 1323, 1323, 1354, 1354, 1385, - 1385, 1416, 1416, 1447, 1447, 1478, 1478, 1509, 1509, 1540, 1540, 1571, 1571, - 1602, 1602, 1633, 1633, 1664, 1664, 1664, 735, 766, 766, 797, 797, 828, - 828, 859, 859, 890, 890, 921, 921, 952, 952, 983, 983, 1014, 1014, - 1045, 1045, 1076, 1076, 1107, 1107, 1138, 1138, 1169, 1169, 1200, 1200, 1231, - 1231, 1262, 1262, 1293, 1293, 1324, 1324, 1355, 1355, 1386, 1386, 1417, 1417, - 1448, 1448, 1479, 1479, 1510, 1510, 1541, 1541, 1572, 1572, 1603, 1603, 1634, - 1634, 1665, 1665, 1696, 1696, 1696, 767, 798, 798, 829, 829, 860, 860, - 891, 891, 922, 922, 953, 953, 984, 984, 1015, 1015, 1046, 1046, 1077, - 1077, 1108, 1108, 1139, 1139, 1170, 1170, 1201, 1201, 1232, 1232, 1263, 1263, - 1294, 1294, 1325, 1325, 1356, 1356, 1387, 1387, 1418, 1418, 1449, 1449, 1480, - 1480, 1511, 1511, 1542, 1542, 1573, 1573, 1604, 1604, 1635, 1635, 1666, 1666, - 1697, 1697, 1728, 1728, 1728, 799, 830, 830, 861, 861, 892, 892, 923, - 923, 954, 954, 985, 985, 1016, 1016, 1047, 1047, 1078, 1078, 1109, 1109, - 1140, 1140, 1171, 1171, 1202, 1202, 1233, 1233, 1264, 1264, 1295, 1295, 1326, - 1326, 1357, 1357, 1388, 1388, 1419, 1419, 1450, 1450, 1481, 1481, 1512, 1512, - 1543, 1543, 1574, 1574, 1605, 1605, 1636, 1636, 1667, 1667, 1698, 1698, 1729, - 1729, 1760, 1760, 1760, 831, 862, 862, 893, 893, 924, 924, 955, 955, - 986, 986, 1017, 1017, 1048, 1048, 1079, 1079, 1110, 1110, 1141, 1141, 1172, - 1172, 1203, 1203, 1234, 1234, 1265, 1265, 
1296, 1296, 1327, 1327, 1358, 1358, - 1389, 1389, 1420, 1420, 1451, 1451, 1482, 1482, 1513, 1513, 1544, 1544, 1575, - 1575, 1606, 1606, 1637, 1637, 1668, 1668, 1699, 1699, 1730, 1730, 1761, 1761, - 1792, 1792, 1792, 863, 894, 894, 925, 925, 956, 956, 987, 987, 1018, - 1018, 1049, 1049, 1080, 1080, 1111, 1111, 1142, 1142, 1173, 1173, 1204, 1204, - 1235, 1235, 1266, 1266, 1297, 1297, 1328, 1328, 1359, 1359, 1390, 1390, 1421, - 1421, 1452, 1452, 1483, 1483, 1514, 1514, 1545, 1545, 1576, 1576, 1607, 1607, - 1638, 1638, 1669, 1669, 1700, 1700, 1731, 1731, 1762, 1762, 1793, 1793, 1824, - 1824, 1824, 895, 926, 926, 957, 957, 988, 988, 1019, 1019, 1050, 1050, - 1081, 1081, 1112, 1112, 1143, 1143, 1174, 1174, 1205, 1205, 1236, 1236, 1267, - 1267, 1298, 1298, 1329, 1329, 1360, 1360, 1391, 1391, 1422, 1422, 1453, 1453, - 1484, 1484, 1515, 1515, 1546, 1546, 1577, 1577, 1608, 1608, 1639, 1639, 1670, - 1670, 1701, 1701, 1732, 1732, 1763, 1763, 1794, 1794, 1825, 1825, 1856, 1856, - 1856, 927, 958, 958, 989, 989, 1020, 1020, 1051, 1051, 1082, 1082, 1113, - 1113, 1144, 1144, 1175, 1175, 1206, 1206, 1237, 1237, 1268, 1268, 1299, 1299, - 1330, 1330, 1361, 1361, 1392, 1392, 1423, 1423, 1454, 1454, 1485, 1485, 1516, - 1516, 1547, 1547, 1578, 1578, 1609, 1609, 1640, 1640, 1671, 1671, 1702, 1702, - 1733, 1733, 1764, 1764, 1795, 1795, 1826, 1826, 1857, 1857, 1888, 1888, 1888, - 959, 990, 990, 1021, 1021, 1052, 1052, 1083, 1083, 1114, 1114, 1145, 1145, - 1176, 1176, 1207, 1207, 1238, 1238, 1269, 1269, 1300, 1300, 1331, 1331, 1362, - 1362, 1393, 1393, 1424, 1424, 1455, 1455, 1486, 1486, 1517, 1517, 1548, 1548, - 1579, 1579, 1610, 1610, 1641, 1641, 1672, 1672, 1703, 1703, 1734, 1734, 1765, - 1765, 1796, 1796, 1827, 1827, 1858, 1858, 1889, 1889, 1920, 1920, 1920, 991, - 1022, 1022, 1053, 1053, 1084, 1084, 1115, 1115, 1146, 1146, 1177, 1177, 1208, - 1208, 1239, 1239, 1270, 1270, 1301, 1301, 1332, 1332, 1363, 1363, 1394, 1394, - 1425, 1425, 1456, 1456, 1487, 1487, 1518, 1518, 1549, 1549, 1580, 
1580, 1611, - 1611, 1642, 1642, 1673, 1673, 1704, 1704, 1735, 1735, 1766, 1766, 1797, 1797, - 1828, 1828, 1859, 1859, 1890, 1890, 1921, 1921, 1952, 1952, 1952, 1023, 1054, - 1054, 1085, 1085, 1116, 1116, 1147, 1147, 1178, 1178, 1209, 1209, 1240, 1240, - 1271, 1271, 1302, 1302, 1333, 1333, 1364, 1364, 1395, 1395, 1426, 1426, 1457, - 1457, 1488, 1488, 1519, 1519, 1550, 1550, 1581, 1581, 1612, 1612, 1643, 1643, - 1674, 1674, 1705, 1705, 1736, 1736, 1767, 1767, 1798, 1798, 1829, 1829, 1860, - 1860, 1891, 1891, 1922, 1922, 1953, 1953, 1984, 1984, 1984, 1055, 1086, 1086, - 1117, 1117, 1148, 1148, 1179, 1179, 1210, 1210, 1241, 1241, 1272, 1272, 1303, - 1303, 1334, 1334, 1365, 1365, 1396, 1396, 1427, 1427, 1458, 1458, 1489, 1489, - 1520, 1520, 1551, 1551, 1582, 1582, 1613, 1613, 1644, 1644, 1675, 1675, 1706, - 1706, 1737, 1737, 1768, 1768, 1799, 1799, 1830, 1830, 1861, 1861, 1892, 1892, - 1923, 1923, 1954, 1954, 1985, 1985, 2016, 1087, 1118, 1118, 1149, 1149, 1180, - 1180, 1211, 1211, 1242, 1242, 1273, 1273, 1304, 1304, 1335, 1335, 1366, 1366, - 1397, 1397, 1428, 1428, 1459, 1459, 1490, 1490, 1521, 1521, 1552, 1552, 1583, - 1583, 1614, 1614, 1645, 1645, 1676, 1676, 1707, 1707, 1738, 1738, 1769, 1769, - 1800, 1800, 1831, 1831, 1862, 1862, 1893, 1893, 1924, 1924, 1955, 1955, 1986, - 1986, 2017, 1119, 1150, 1150, 1181, 1181, 1212, 1212, 1243, 1243, 1274, 1274, - 1305, 1305, 1336, 1336, 1367, 1367, 1398, 1398, 1429, 1429, 1460, 1460, 1491, - 1491, 1522, 1522, 1553, 1553, 1584, 1584, 1615, 1615, 1646, 1646, 1677, 1677, - 1708, 1708, 1739, 1739, 1770, 1770, 1801, 1801, 1832, 1832, 1863, 1863, 1894, - 1894, 1925, 1925, 1956, 1956, 1987, 1987, 2018, 1151, 1182, 1182, 1213, 1213, - 1244, 1244, 1275, 1275, 1306, 1306, 1337, 1337, 1368, 1368, 1399, 1399, 1430, - 1430, 1461, 1461, 1492, 1492, 1523, 1523, 1554, 1554, 1585, 1585, 1616, 1616, - 1647, 1647, 1678, 1678, 1709, 1709, 1740, 1740, 1771, 1771, 1802, 1802, 1833, - 1833, 1864, 1864, 1895, 1895, 1926, 1926, 1957, 1957, 1988, 1988, 
2019, 1183, - 1214, 1214, 1245, 1245, 1276, 1276, 1307, 1307, 1338, 1338, 1369, 1369, 1400, - 1400, 1431, 1431, 1462, 1462, 1493, 1493, 1524, 1524, 1555, 1555, 1586, 1586, - 1617, 1617, 1648, 1648, 1679, 1679, 1710, 1710, 1741, 1741, 1772, 1772, 1803, - 1803, 1834, 1834, 1865, 1865, 1896, 1896, 1927, 1927, 1958, 1958, 1989, 1989, - 2020, 1215, 1246, 1246, 1277, 1277, 1308, 1308, 1339, 1339, 1370, 1370, 1401, - 1401, 1432, 1432, 1463, 1463, 1494, 1494, 1525, 1525, 1556, 1556, 1587, 1587, - 1618, 1618, 1649, 1649, 1680, 1680, 1711, 1711, 1742, 1742, 1773, 1773, 1804, - 1804, 1835, 1835, 1866, 1866, 1897, 1897, 1928, 1928, 1959, 1959, 1990, 1990, - 2021, 1247, 1278, 1278, 1309, 1309, 1340, 1340, 1371, 1371, 1402, 1402, 1433, - 1433, 1464, 1464, 1495, 1495, 1526, 1526, 1557, 1557, 1588, 1588, 1619, 1619, - 1650, 1650, 1681, 1681, 1712, 1712, 1743, 1743, 1774, 1774, 1805, 1805, 1836, - 1836, 1867, 1867, 1898, 1898, 1929, 1929, 1960, 1960, 1991, 1991, 2022, 1279, - 1310, 1310, 1341, 1341, 1372, 1372, 1403, 1403, 1434, 1434, 1465, 1465, 1496, - 1496, 1527, 1527, 1558, 1558, 1589, 1589, 1620, 1620, 1651, 1651, 1682, 1682, - 1713, 1713, 1744, 1744, 1775, 1775, 1806, 1806, 1837, 1837, 1868, 1868, 1899, - 1899, 1930, 1930, 1961, 1961, 1992, 1992, 2023, 1311, 1342, 1342, 1373, 1373, - 1404, 1404, 1435, 1435, 1466, 1466, 1497, 1497, 1528, 1528, 1559, 1559, 1590, - 1590, 1621, 1621, 1652, 1652, 1683, 1683, 1714, 1714, 1745, 1745, 1776, 1776, - 1807, 1807, 1838, 1838, 1869, 1869, 1900, 1900, 1931, 1931, 1962, 1962, 1993, - 1993, 2024, 1343, 1374, 1374, 1405, 1405, 1436, 1436, 1467, 1467, 1498, 1498, - 1529, 1529, 1560, 1560, 1591, 1591, 1622, 1622, 1653, 1653, 1684, 1684, 1715, - 1715, 1746, 1746, 1777, 1777, 1808, 1808, 1839, 1839, 1870, 1870, 1901, 1901, - 1932, 1932, 1963, 1963, 1994, 1994, 2025, 1375, 1406, 1406, 1437, 1437, 1468, - 1468, 1499, 1499, 1530, 1530, 1561, 1561, 1592, 1592, 1623, 1623, 1654, 1654, - 1685, 1685, 1716, 1716, 1747, 1747, 1778, 1778, 1809, 1809, 1840, 
1840, 1871, - 1871, 1902, 1902, 1933, 1933, 1964, 1964, 1995, 1995, 2026, 1407, 1438, 1438, - 1469, 1469, 1500, 1500, 1531, 1531, 1562, 1562, 1593, 1593, 1624, 1624, 1655, - 1655, 1686, 1686, 1717, 1717, 1748, 1748, 1779, 1779, 1810, 1810, 1841, 1841, - 1872, 1872, 1903, 1903, 1934, 1934, 1965, 1965, 1996, 1996, 2027, 1439, 1470, - 1470, 1501, 1501, 1532, 1532, 1563, 1563, 1594, 1594, 1625, 1625, 1656, 1656, - 1687, 1687, 1718, 1718, 1749, 1749, 1780, 1780, 1811, 1811, 1842, 1842, 1873, - 1873, 1904, 1904, 1935, 1935, 1966, 1966, 1997, 1997, 2028, 1471, 1502, 1502, - 1533, 1533, 1564, 1564, 1595, 1595, 1626, 1626, 1657, 1657, 1688, 1688, 1719, - 1719, 1750, 1750, 1781, 1781, 1812, 1812, 1843, 1843, 1874, 1874, 1905, 1905, - 1936, 1936, 1967, 1967, 1998, 1998, 2029, 1503, 1534, 1534, 1565, 1565, 1596, - 1596, 1627, 1627, 1658, 1658, 1689, 1689, 1720, 1720, 1751, 1751, 1782, 1782, - 1813, 1813, 1844, 1844, 1875, 1875, 1906, 1906, 1937, 1937, 1968, 1968, 1999, - 1999, 2030, 1535, 1566, 1566, 1597, 1597, 1628, 1628, 1659, 1659, 1690, 1690, - 1721, 1721, 1752, 1752, 1783, 1783, 1814, 1814, 1845, 1845, 1876, 1876, 1907, - 1907, 1938, 1938, 1969, 1969, 2000, 2000, 2031, 1567, 1598, 1598, 1629, 1629, - 1660, 1660, 1691, 1691, 1722, 1722, 1753, 1753, 1784, 1784, 1815, 1815, 1846, - 1846, 1877, 1877, 1908, 1908, 1939, 1939, 1970, 1970, 2001, 2001, 2032, 1599, - 1630, 1630, 1661, 1661, 1692, 1692, 1723, 1723, 1754, 1754, 1785, 1785, 1816, - 1816, 1847, 1847, 1878, 1878, 1909, 1909, 1940, 1940, 1971, 1971, 2002, 2002, - 2033, 1631, 1662, 1662, 1693, 1693, 1724, 1724, 1755, 1755, 1786, 1786, 1817, - 1817, 1848, 1848, 1879, 1879, 1910, 1910, 1941, 1941, 1972, 1972, 2003, 2003, - 2034, 1663, 1694, 1694, 1725, 1725, 1756, 1756, 1787, 1787, 1818, 1818, 1849, - 1849, 1880, 1880, 1911, 1911, 1942, 1942, 1973, 1973, 2004, 2004, 2035, 1695, - 1726, 1726, 1757, 1757, 1788, 1788, 1819, 1819, 1850, 1850, 1881, 1881, 1912, - 1912, 1943, 1943, 1974, 1974, 2005, 2005, 2036, 1727, 1758, 1758, 
1789, 1789, - 1820, 1820, 1851, 1851, 1882, 1882, 1913, 1913, 1944, 1944, 1975, 1975, 2006, - 2006, 2037, 1759, 1790, 1790, 1821, 1821, 1852, 1852, 1883, 1883, 1914, 1914, - 1945, 1945, 1976, 1976, 2007, 2007, 2038, 1791, 1822, 1822, 1853, 1853, 1884, - 1884, 1915, 1915, 1946, 1946, 1977, 1977, 2008, 2008, 2039, 1823, 1854, 1854, - 1885, 1885, 1916, 1916, 1947, 1947, 1978, 1978, 2009, 2009, 2040, 1855, 1886, - 1886, 1917, 1917, 1948, 1948, 1979, 1979, 2010, 2010, 2041, 1887, 1918, 1918, - 1949, 1949, 1980, 1980, 2011, 2011, 2042, 1919, 1950, 1950, 1981, 1981, 2012, - 2012, 2043, 1951, 1982, 1982, 2013, 2013, 2044, 1983, 2014, 2014, 2045, 2015, - 2046, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_64x32_neighbors[2049 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 64, 64, 64, 2, - 2, 2, 65, 65, 128, 128, 128, 3, 3, 3, 66, 66, 129, - 129, 192, 192, 192, 4, 4, 4, 67, 67, 130, 130, 193, 193, - 256, 256, 256, 5, 5, 5, 68, 68, 131, 131, 194, 194, 257, - 257, 320, 320, 320, 6, 6, 6, 69, 69, 132, 132, 195, 195, - 258, 258, 321, 321, 384, 384, 384, 7, 7, 7, 70, 70, 133, - 133, 196, 196, 259, 259, 322, 322, 385, 385, 448, 448, 448, 8, - 8, 8, 71, 71, 134, 134, 197, 197, 260, 260, 323, 323, 386, - 386, 449, 449, 512, 512, 512, 9, 9, 9, 72, 72, 135, 135, - 198, 198, 261, 261, 324, 324, 387, 387, 450, 450, 513, 513, 576, - 576, 576, 10, 10, 10, 73, 73, 136, 136, 199, 199, 262, 262, - 325, 325, 388, 388, 451, 451, 514, 514, 577, 577, 640, 640, 640, - 11, 11, 11, 74, 74, 137, 137, 200, 200, 263, 263, 326, 326, - 389, 389, 452, 452, 515, 515, 578, 578, 641, 641, 704, 704, 704, - 12, 12, 12, 75, 75, 138, 138, 201, 201, 264, 264, 327, 327, - 390, 390, 453, 453, 516, 516, 579, 579, 642, 642, 705, 705, 768, - 768, 768, 13, 13, 13, 76, 76, 139, 139, 202, 202, 265, 265, - 328, 328, 391, 391, 454, 454, 517, 517, 580, 580, 643, 643, 706, - 706, 769, 769, 832, 832, 832, 14, 14, 14, 77, 77, 140, 140, - 203, 203, 266, 266, 329, 329, 392, 392, 455, 455, 518, 518, 
581, - 581, 644, 644, 707, 707, 770, 770, 833, 833, 896, 896, 896, 15, - 15, 15, 78, 78, 141, 141, 204, 204, 267, 267, 330, 330, 393, - 393, 456, 456, 519, 519, 582, 582, 645, 645, 708, 708, 771, 771, - 834, 834, 897, 897, 960, 960, 960, 16, 16, 16, 79, 79, 142, - 142, 205, 205, 268, 268, 331, 331, 394, 394, 457, 457, 520, 520, - 583, 583, 646, 646, 709, 709, 772, 772, 835, 835, 898, 898, 961, - 961, 1024, 1024, 1024, 17, 17, 17, 80, 80, 143, 143, 206, 206, - 269, 269, 332, 332, 395, 395, 458, 458, 521, 521, 584, 584, 647, - 647, 710, 710, 773, 773, 836, 836, 899, 899, 962, 962, 1025, 1025, - 1088, 1088, 1088, 18, 18, 18, 81, 81, 144, 144, 207, 207, 270, - 270, 333, 333, 396, 396, 459, 459, 522, 522, 585, 585, 648, 648, - 711, 711, 774, 774, 837, 837, 900, 900, 963, 963, 1026, 1026, 1089, - 1089, 1152, 1152, 1152, 19, 19, 19, 82, 82, 145, 145, 208, 208, - 271, 271, 334, 334, 397, 397, 460, 460, 523, 523, 586, 586, 649, - 649, 712, 712, 775, 775, 838, 838, 901, 901, 964, 964, 1027, 1027, - 1090, 1090, 1153, 1153, 1216, 1216, 1216, 20, 20, 20, 83, 83, 146, - 146, 209, 209, 272, 272, 335, 335, 398, 398, 461, 461, 524, 524, - 587, 587, 650, 650, 713, 713, 776, 776, 839, 839, 902, 902, 965, - 965, 1028, 1028, 1091, 1091, 1154, 1154, 1217, 1217, 1280, 1280, 1280, 21, - 21, 21, 84, 84, 147, 147, 210, 210, 273, 273, 336, 336, 399, - 399, 462, 462, 525, 525, 588, 588, 651, 651, 714, 714, 777, 777, - 840, 840, 903, 903, 966, 966, 1029, 1029, 1092, 1092, 1155, 1155, 1218, - 1218, 1281, 1281, 1344, 1344, 1344, 22, 22, 22, 85, 85, 148, 148, - 211, 211, 274, 274, 337, 337, 400, 400, 463, 463, 526, 526, 589, - 589, 652, 652, 715, 715, 778, 778, 841, 841, 904, 904, 967, 967, - 1030, 1030, 1093, 1093, 1156, 1156, 1219, 1219, 1282, 1282, 1345, 1345, 1408, - 1408, 1408, 23, 23, 23, 86, 86, 149, 149, 212, 212, 275, 275, - 338, 338, 401, 401, 464, 464, 527, 527, 590, 590, 653, 653, 716, - 716, 779, 779, 842, 842, 905, 905, 968, 968, 1031, 1031, 1094, 1094, - 1157, 1157, 1220, 1220, 
1283, 1283, 1346, 1346, 1409, 1409, 1472, 1472, 1472, - 24, 24, 24, 87, 87, 150, 150, 213, 213, 276, 276, 339, 339, - 402, 402, 465, 465, 528, 528, 591, 591, 654, 654, 717, 717, 780, - 780, 843, 843, 906, 906, 969, 969, 1032, 1032, 1095, 1095, 1158, 1158, - 1221, 1221, 1284, 1284, 1347, 1347, 1410, 1410, 1473, 1473, 1536, 1536, 1536, - 25, 25, 25, 88, 88, 151, 151, 214, 214, 277, 277, 340, 340, - 403, 403, 466, 466, 529, 529, 592, 592, 655, 655, 718, 718, 781, - 781, 844, 844, 907, 907, 970, 970, 1033, 1033, 1096, 1096, 1159, 1159, - 1222, 1222, 1285, 1285, 1348, 1348, 1411, 1411, 1474, 1474, 1537, 1537, 1600, - 1600, 1600, 26, 26, 26, 89, 89, 152, 152, 215, 215, 278, 278, - 341, 341, 404, 404, 467, 467, 530, 530, 593, 593, 656, 656, 719, - 719, 782, 782, 845, 845, 908, 908, 971, 971, 1034, 1034, 1097, 1097, - 1160, 1160, 1223, 1223, 1286, 1286, 1349, 1349, 1412, 1412, 1475, 1475, 1538, - 1538, 1601, 1601, 1664, 1664, 1664, 27, 27, 27, 90, 90, 153, 153, - 216, 216, 279, 279, 342, 342, 405, 405, 468, 468, 531, 531, 594, - 594, 657, 657, 720, 720, 783, 783, 846, 846, 909, 909, 972, 972, - 1035, 1035, 1098, 1098, 1161, 1161, 1224, 1224, 1287, 1287, 1350, 1350, 1413, - 1413, 1476, 1476, 1539, 1539, 1602, 1602, 1665, 1665, 1728, 1728, 1728, 28, - 28, 28, 91, 91, 154, 154, 217, 217, 280, 280, 343, 343, 406, - 406, 469, 469, 532, 532, 595, 595, 658, 658, 721, 721, 784, 784, - 847, 847, 910, 910, 973, 973, 1036, 1036, 1099, 1099, 1162, 1162, 1225, - 1225, 1288, 1288, 1351, 1351, 1414, 1414, 1477, 1477, 1540, 1540, 1603, 1603, - 1666, 1666, 1729, 1729, 1792, 1792, 1792, 29, 29, 29, 92, 92, 155, - 155, 218, 218, 281, 281, 344, 344, 407, 407, 470, 470, 533, 533, - 596, 596, 659, 659, 722, 722, 785, 785, 848, 848, 911, 911, 974, - 974, 1037, 1037, 1100, 1100, 1163, 1163, 1226, 1226, 1289, 1289, 1352, 1352, - 1415, 1415, 1478, 1478, 1541, 1541, 1604, 1604, 1667, 1667, 1730, 1730, 1793, - 1793, 1856, 1856, 1856, 30, 30, 30, 93, 93, 156, 156, 219, 219, - 282, 282, 345, 345, 408, 
408, 471, 471, 534, 534, 597, 597, 660, - 660, 723, 723, 786, 786, 849, 849, 912, 912, 975, 975, 1038, 1038, - 1101, 1101, 1164, 1164, 1227, 1227, 1290, 1290, 1353, 1353, 1416, 1416, 1479, - 1479, 1542, 1542, 1605, 1605, 1668, 1668, 1731, 1731, 1794, 1794, 1857, 1857, - 1920, 1920, 1920, 31, 31, 31, 94, 94, 157, 157, 220, 220, 283, - 283, 346, 346, 409, 409, 472, 472, 535, 535, 598, 598, 661, 661, - 724, 724, 787, 787, 850, 850, 913, 913, 976, 976, 1039, 1039, 1102, - 1102, 1165, 1165, 1228, 1228, 1291, 1291, 1354, 1354, 1417, 1417, 1480, 1480, - 1543, 1543, 1606, 1606, 1669, 1669, 1732, 1732, 1795, 1795, 1858, 1858, 1921, - 1921, 1984, 32, 32, 32, 95, 95, 158, 158, 221, 221, 284, 284, - 347, 347, 410, 410, 473, 473, 536, 536, 599, 599, 662, 662, 725, - 725, 788, 788, 851, 851, 914, 914, 977, 977, 1040, 1040, 1103, 1103, - 1166, 1166, 1229, 1229, 1292, 1292, 1355, 1355, 1418, 1418, 1481, 1481, 1544, - 1544, 1607, 1607, 1670, 1670, 1733, 1733, 1796, 1796, 1859, 1859, 1922, 1922, - 1985, 33, 33, 33, 96, 96, 159, 159, 222, 222, 285, 285, 348, - 348, 411, 411, 474, 474, 537, 537, 600, 600, 663, 663, 726, 726, - 789, 789, 852, 852, 915, 915, 978, 978, 1041, 1041, 1104, 1104, 1167, - 1167, 1230, 1230, 1293, 1293, 1356, 1356, 1419, 1419, 1482, 1482, 1545, 1545, - 1608, 1608, 1671, 1671, 1734, 1734, 1797, 1797, 1860, 1860, 1923, 1923, 1986, - 34, 34, 34, 97, 97, 160, 160, 223, 223, 286, 286, 349, 349, - 412, 412, 475, 475, 538, 538, 601, 601, 664, 664, 727, 727, 790, - 790, 853, 853, 916, 916, 979, 979, 1042, 1042, 1105, 1105, 1168, 1168, - 1231, 1231, 1294, 1294, 1357, 1357, 1420, 1420, 1483, 1483, 1546, 1546, 1609, - 1609, 1672, 1672, 1735, 1735, 1798, 1798, 1861, 1861, 1924, 1924, 1987, 35, - 35, 35, 98, 98, 161, 161, 224, 224, 287, 287, 350, 350, 413, - 413, 476, 476, 539, 539, 602, 602, 665, 665, 728, 728, 791, 791, - 854, 854, 917, 917, 980, 980, 1043, 1043, 1106, 1106, 1169, 1169, 1232, - 1232, 1295, 1295, 1358, 1358, 1421, 1421, 1484, 1484, 1547, 1547, 1610, 1610, 
- 1673, 1673, 1736, 1736, 1799, 1799, 1862, 1862, 1925, 1925, 1988, 36, 36, - 36, 99, 99, 162, 162, 225, 225, 288, 288, 351, 351, 414, 414, - 477, 477, 540, 540, 603, 603, 666, 666, 729, 729, 792, 792, 855, - 855, 918, 918, 981, 981, 1044, 1044, 1107, 1107, 1170, 1170, 1233, 1233, - 1296, 1296, 1359, 1359, 1422, 1422, 1485, 1485, 1548, 1548, 1611, 1611, 1674, - 1674, 1737, 1737, 1800, 1800, 1863, 1863, 1926, 1926, 1989, 37, 37, 37, - 100, 100, 163, 163, 226, 226, 289, 289, 352, 352, 415, 415, 478, - 478, 541, 541, 604, 604, 667, 667, 730, 730, 793, 793, 856, 856, - 919, 919, 982, 982, 1045, 1045, 1108, 1108, 1171, 1171, 1234, 1234, 1297, - 1297, 1360, 1360, 1423, 1423, 1486, 1486, 1549, 1549, 1612, 1612, 1675, 1675, - 1738, 1738, 1801, 1801, 1864, 1864, 1927, 1927, 1990, 38, 38, 38, 101, - 101, 164, 164, 227, 227, 290, 290, 353, 353, 416, 416, 479, 479, - 542, 542, 605, 605, 668, 668, 731, 731, 794, 794, 857, 857, 920, - 920, 983, 983, 1046, 1046, 1109, 1109, 1172, 1172, 1235, 1235, 1298, 1298, - 1361, 1361, 1424, 1424, 1487, 1487, 1550, 1550, 1613, 1613, 1676, 1676, 1739, - 1739, 1802, 1802, 1865, 1865, 1928, 1928, 1991, 39, 39, 39, 102, 102, - 165, 165, 228, 228, 291, 291, 354, 354, 417, 417, 480, 480, 543, - 543, 606, 606, 669, 669, 732, 732, 795, 795, 858, 858, 921, 921, - 984, 984, 1047, 1047, 1110, 1110, 1173, 1173, 1236, 1236, 1299, 1299, 1362, - 1362, 1425, 1425, 1488, 1488, 1551, 1551, 1614, 1614, 1677, 1677, 1740, 1740, - 1803, 1803, 1866, 1866, 1929, 1929, 1992, 40, 40, 40, 103, 103, 166, - 166, 229, 229, 292, 292, 355, 355, 418, 418, 481, 481, 544, 544, - 607, 607, 670, 670, 733, 733, 796, 796, 859, 859, 922, 922, 985, - 985, 1048, 1048, 1111, 1111, 1174, 1174, 1237, 1237, 1300, 1300, 1363, 1363, - 1426, 1426, 1489, 1489, 1552, 1552, 1615, 1615, 1678, 1678, 1741, 1741, 1804, - 1804, 1867, 1867, 1930, 1930, 1993, 41, 41, 41, 104, 104, 167, 167, - 230, 230, 293, 293, 356, 356, 419, 419, 482, 482, 545, 545, 608, - 608, 671, 671, 734, 734, 797, 797, 860, 
860, 923, 923, 986, 986, - 1049, 1049, 1112, 1112, 1175, 1175, 1238, 1238, 1301, 1301, 1364, 1364, 1427, - 1427, 1490, 1490, 1553, 1553, 1616, 1616, 1679, 1679, 1742, 1742, 1805, 1805, - 1868, 1868, 1931, 1931, 1994, 42, 42, 42, 105, 105, 168, 168, 231, - 231, 294, 294, 357, 357, 420, 420, 483, 483, 546, 546, 609, 609, - 672, 672, 735, 735, 798, 798, 861, 861, 924, 924, 987, 987, 1050, - 1050, 1113, 1113, 1176, 1176, 1239, 1239, 1302, 1302, 1365, 1365, 1428, 1428, - 1491, 1491, 1554, 1554, 1617, 1617, 1680, 1680, 1743, 1743, 1806, 1806, 1869, - 1869, 1932, 1932, 1995, 43, 43, 43, 106, 106, 169, 169, 232, 232, - 295, 295, 358, 358, 421, 421, 484, 484, 547, 547, 610, 610, 673, - 673, 736, 736, 799, 799, 862, 862, 925, 925, 988, 988, 1051, 1051, - 1114, 1114, 1177, 1177, 1240, 1240, 1303, 1303, 1366, 1366, 1429, 1429, 1492, - 1492, 1555, 1555, 1618, 1618, 1681, 1681, 1744, 1744, 1807, 1807, 1870, 1870, - 1933, 1933, 1996, 44, 44, 44, 107, 107, 170, 170, 233, 233, 296, - 296, 359, 359, 422, 422, 485, 485, 548, 548, 611, 611, 674, 674, - 737, 737, 800, 800, 863, 863, 926, 926, 989, 989, 1052, 1052, 1115, - 1115, 1178, 1178, 1241, 1241, 1304, 1304, 1367, 1367, 1430, 1430, 1493, 1493, - 1556, 1556, 1619, 1619, 1682, 1682, 1745, 1745, 1808, 1808, 1871, 1871, 1934, - 1934, 1997, 45, 45, 45, 108, 108, 171, 171, 234, 234, 297, 297, - 360, 360, 423, 423, 486, 486, 549, 549, 612, 612, 675, 675, 738, - 738, 801, 801, 864, 864, 927, 927, 990, 990, 1053, 1053, 1116, 1116, - 1179, 1179, 1242, 1242, 1305, 1305, 1368, 1368, 1431, 1431, 1494, 1494, 1557, - 1557, 1620, 1620, 1683, 1683, 1746, 1746, 1809, 1809, 1872, 1872, 1935, 1935, - 1998, 46, 46, 46, 109, 109, 172, 172, 235, 235, 298, 298, 361, - 361, 424, 424, 487, 487, 550, 550, 613, 613, 676, 676, 739, 739, - 802, 802, 865, 865, 928, 928, 991, 991, 1054, 1054, 1117, 1117, 1180, - 1180, 1243, 1243, 1306, 1306, 1369, 1369, 1432, 1432, 1495, 1495, 1558, 1558, - 1621, 1621, 1684, 1684, 1747, 1747, 1810, 1810, 1873, 1873, 1936, 1936, 
1999, - 47, 47, 47, 110, 110, 173, 173, 236, 236, 299, 299, 362, 362, - 425, 425, 488, 488, 551, 551, 614, 614, 677, 677, 740, 740, 803, - 803, 866, 866, 929, 929, 992, 992, 1055, 1055, 1118, 1118, 1181, 1181, - 1244, 1244, 1307, 1307, 1370, 1370, 1433, 1433, 1496, 1496, 1559, 1559, 1622, - 1622, 1685, 1685, 1748, 1748, 1811, 1811, 1874, 1874, 1937, 1937, 2000, 48, - 48, 48, 111, 111, 174, 174, 237, 237, 300, 300, 363, 363, 426, - 426, 489, 489, 552, 552, 615, 615, 678, 678, 741, 741, 804, 804, - 867, 867, 930, 930, 993, 993, 1056, 1056, 1119, 1119, 1182, 1182, 1245, - 1245, 1308, 1308, 1371, 1371, 1434, 1434, 1497, 1497, 1560, 1560, 1623, 1623, - 1686, 1686, 1749, 1749, 1812, 1812, 1875, 1875, 1938, 1938, 2001, 49, 49, - 49, 112, 112, 175, 175, 238, 238, 301, 301, 364, 364, 427, 427, - 490, 490, 553, 553, 616, 616, 679, 679, 742, 742, 805, 805, 868, - 868, 931, 931, 994, 994, 1057, 1057, 1120, 1120, 1183, 1183, 1246, 1246, - 1309, 1309, 1372, 1372, 1435, 1435, 1498, 1498, 1561, 1561, 1624, 1624, 1687, - 1687, 1750, 1750, 1813, 1813, 1876, 1876, 1939, 1939, 2002, 50, 50, 50, - 113, 113, 176, 176, 239, 239, 302, 302, 365, 365, 428, 428, 491, - 491, 554, 554, 617, 617, 680, 680, 743, 743, 806, 806, 869, 869, - 932, 932, 995, 995, 1058, 1058, 1121, 1121, 1184, 1184, 1247, 1247, 1310, - 1310, 1373, 1373, 1436, 1436, 1499, 1499, 1562, 1562, 1625, 1625, 1688, 1688, - 1751, 1751, 1814, 1814, 1877, 1877, 1940, 1940, 2003, 51, 51, 51, 114, - 114, 177, 177, 240, 240, 303, 303, 366, 366, 429, 429, 492, 492, - 555, 555, 618, 618, 681, 681, 744, 744, 807, 807, 870, 870, 933, - 933, 996, 996, 1059, 1059, 1122, 1122, 1185, 1185, 1248, 1248, 1311, 1311, - 1374, 1374, 1437, 1437, 1500, 1500, 1563, 1563, 1626, 1626, 1689, 1689, 1752, - 1752, 1815, 1815, 1878, 1878, 1941, 1941, 2004, 52, 52, 52, 115, 115, - 178, 178, 241, 241, 304, 304, 367, 367, 430, 430, 493, 493, 556, - 556, 619, 619, 682, 682, 745, 745, 808, 808, 871, 871, 934, 934, - 997, 997, 1060, 1060, 1123, 1123, 1186, 1186, 
1249, 1249, 1312, 1312, 1375, - 1375, 1438, 1438, 1501, 1501, 1564, 1564, 1627, 1627, 1690, 1690, 1753, 1753, - 1816, 1816, 1879, 1879, 1942, 1942, 2005, 53, 53, 53, 116, 116, 179, - 179, 242, 242, 305, 305, 368, 368, 431, 431, 494, 494, 557, 557, - 620, 620, 683, 683, 746, 746, 809, 809, 872, 872, 935, 935, 998, - 998, 1061, 1061, 1124, 1124, 1187, 1187, 1250, 1250, 1313, 1313, 1376, 1376, - 1439, 1439, 1502, 1502, 1565, 1565, 1628, 1628, 1691, 1691, 1754, 1754, 1817, - 1817, 1880, 1880, 1943, 1943, 2006, 54, 54, 54, 117, 117, 180, 180, - 243, 243, 306, 306, 369, 369, 432, 432, 495, 495, 558, 558, 621, - 621, 684, 684, 747, 747, 810, 810, 873, 873, 936, 936, 999, 999, - 1062, 1062, 1125, 1125, 1188, 1188, 1251, 1251, 1314, 1314, 1377, 1377, 1440, - 1440, 1503, 1503, 1566, 1566, 1629, 1629, 1692, 1692, 1755, 1755, 1818, 1818, - 1881, 1881, 1944, 1944, 2007, 55, 55, 55, 118, 118, 181, 181, 244, - 244, 307, 307, 370, 370, 433, 433, 496, 496, 559, 559, 622, 622, - 685, 685, 748, 748, 811, 811, 874, 874, 937, 937, 1000, 1000, 1063, - 1063, 1126, 1126, 1189, 1189, 1252, 1252, 1315, 1315, 1378, 1378, 1441, 1441, - 1504, 1504, 1567, 1567, 1630, 1630, 1693, 1693, 1756, 1756, 1819, 1819, 1882, - 1882, 1945, 1945, 2008, 56, 56, 56, 119, 119, 182, 182, 245, 245, - 308, 308, 371, 371, 434, 434, 497, 497, 560, 560, 623, 623, 686, - 686, 749, 749, 812, 812, 875, 875, 938, 938, 1001, 1001, 1064, 1064, - 1127, 1127, 1190, 1190, 1253, 1253, 1316, 1316, 1379, 1379, 1442, 1442, 1505, - 1505, 1568, 1568, 1631, 1631, 1694, 1694, 1757, 1757, 1820, 1820, 1883, 1883, - 1946, 1946, 2009, 57, 57, 57, 120, 120, 183, 183, 246, 246, 309, - 309, 372, 372, 435, 435, 498, 498, 561, 561, 624, 624, 687, 687, - 750, 750, 813, 813, 876, 876, 939, 939, 1002, 1002, 1065, 1065, 1128, - 1128, 1191, 1191, 1254, 1254, 1317, 1317, 1380, 1380, 1443, 1443, 1506, 1506, - 1569, 1569, 1632, 1632, 1695, 1695, 1758, 1758, 1821, 1821, 1884, 1884, 1947, - 1947, 2010, 58, 58, 58, 121, 121, 184, 184, 247, 247, 310, 
310, - 373, 373, 436, 436, 499, 499, 562, 562, 625, 625, 688, 688, 751, - 751, 814, 814, 877, 877, 940, 940, 1003, 1003, 1066, 1066, 1129, 1129, - 1192, 1192, 1255, 1255, 1318, 1318, 1381, 1381, 1444, 1444, 1507, 1507, 1570, - 1570, 1633, 1633, 1696, 1696, 1759, 1759, 1822, 1822, 1885, 1885, 1948, 1948, - 2011, 59, 59, 59, 122, 122, 185, 185, 248, 248, 311, 311, 374, - 374, 437, 437, 500, 500, 563, 563, 626, 626, 689, 689, 752, 752, - 815, 815, 878, 878, 941, 941, 1004, 1004, 1067, 1067, 1130, 1130, 1193, - 1193, 1256, 1256, 1319, 1319, 1382, 1382, 1445, 1445, 1508, 1508, 1571, 1571, - 1634, 1634, 1697, 1697, 1760, 1760, 1823, 1823, 1886, 1886, 1949, 1949, 2012, - 60, 60, 60, 123, 123, 186, 186, 249, 249, 312, 312, 375, 375, - 438, 438, 501, 501, 564, 564, 627, 627, 690, 690, 753, 753, 816, - 816, 879, 879, 942, 942, 1005, 1005, 1068, 1068, 1131, 1131, 1194, 1194, - 1257, 1257, 1320, 1320, 1383, 1383, 1446, 1446, 1509, 1509, 1572, 1572, 1635, - 1635, 1698, 1698, 1761, 1761, 1824, 1824, 1887, 1887, 1950, 1950, 2013, 61, - 61, 61, 124, 124, 187, 187, 250, 250, 313, 313, 376, 376, 439, - 439, 502, 502, 565, 565, 628, 628, 691, 691, 754, 754, 817, 817, - 880, 880, 943, 943, 1006, 1006, 1069, 1069, 1132, 1132, 1195, 1195, 1258, - 1258, 1321, 1321, 1384, 1384, 1447, 1447, 1510, 1510, 1573, 1573, 1636, 1636, - 1699, 1699, 1762, 1762, 1825, 1825, 1888, 1888, 1951, 1951, 2014, 62, 62, - 62, 125, 125, 188, 188, 251, 251, 314, 314, 377, 377, 440, 440, - 503, 503, 566, 566, 629, 629, 692, 692, 755, 755, 818, 818, 881, - 881, 944, 944, 1007, 1007, 1070, 1070, 1133, 1133, 1196, 1196, 1259, 1259, - 1322, 1322, 1385, 1385, 1448, 1448, 1511, 1511, 1574, 1574, 1637, 1637, 1700, - 1700, 1763, 1763, 1826, 1826, 1889, 1889, 1952, 1952, 2015, 63, 126, 126, - 189, 189, 252, 252, 315, 315, 378, 378, 441, 441, 504, 504, 567, - 567, 630, 630, 693, 693, 756, 756, 819, 819, 882, 882, 945, 945, - 1008, 1008, 1071, 1071, 1134, 1134, 1197, 1197, 1260, 1260, 1323, 1323, 1386, - 1386, 1449, 1449, 
1512, 1512, 1575, 1575, 1638, 1638, 1701, 1701, 1764, 1764, - 1827, 1827, 1890, 1890, 1953, 1953, 2016, 127, 190, 190, 253, 253, 316, - 316, 379, 379, 442, 442, 505, 505, 568, 568, 631, 631, 694, 694, - 757, 757, 820, 820, 883, 883, 946, 946, 1009, 1009, 1072, 1072, 1135, - 1135, 1198, 1198, 1261, 1261, 1324, 1324, 1387, 1387, 1450, 1450, 1513, 1513, - 1576, 1576, 1639, 1639, 1702, 1702, 1765, 1765, 1828, 1828, 1891, 1891, 1954, - 1954, 2017, 191, 254, 254, 317, 317, 380, 380, 443, 443, 506, 506, - 569, 569, 632, 632, 695, 695, 758, 758, 821, 821, 884, 884, 947, - 947, 1010, 1010, 1073, 1073, 1136, 1136, 1199, 1199, 1262, 1262, 1325, 1325, - 1388, 1388, 1451, 1451, 1514, 1514, 1577, 1577, 1640, 1640, 1703, 1703, 1766, - 1766, 1829, 1829, 1892, 1892, 1955, 1955, 2018, 255, 318, 318, 381, 381, - 444, 444, 507, 507, 570, 570, 633, 633, 696, 696, 759, 759, 822, - 822, 885, 885, 948, 948, 1011, 1011, 1074, 1074, 1137, 1137, 1200, 1200, - 1263, 1263, 1326, 1326, 1389, 1389, 1452, 1452, 1515, 1515, 1578, 1578, 1641, - 1641, 1704, 1704, 1767, 1767, 1830, 1830, 1893, 1893, 1956, 1956, 2019, 319, - 382, 382, 445, 445, 508, 508, 571, 571, 634, 634, 697, 697, 760, - 760, 823, 823, 886, 886, 949, 949, 1012, 1012, 1075, 1075, 1138, 1138, - 1201, 1201, 1264, 1264, 1327, 1327, 1390, 1390, 1453, 1453, 1516, 1516, 1579, - 1579, 1642, 1642, 1705, 1705, 1768, 1768, 1831, 1831, 1894, 1894, 1957, 1957, - 2020, 383, 446, 446, 509, 509, 572, 572, 635, 635, 698, 698, 761, - 761, 824, 824, 887, 887, 950, 950, 1013, 1013, 1076, 1076, 1139, 1139, - 1202, 1202, 1265, 1265, 1328, 1328, 1391, 1391, 1454, 1454, 1517, 1517, 1580, - 1580, 1643, 1643, 1706, 1706, 1769, 1769, 1832, 1832, 1895, 1895, 1958, 1958, - 2021, 447, 510, 510, 573, 573, 636, 636, 699, 699, 762, 762, 825, - 825, 888, 888, 951, 951, 1014, 1014, 1077, 1077, 1140, 1140, 1203, 1203, - 1266, 1266, 1329, 1329, 1392, 1392, 1455, 1455, 1518, 1518, 1581, 1581, 1644, - 1644, 1707, 1707, 1770, 1770, 1833, 1833, 1896, 1896, 1959, 1959, 
2022, 511, - 574, 574, 637, 637, 700, 700, 763, 763, 826, 826, 889, 889, 952, - 952, 1015, 1015, 1078, 1078, 1141, 1141, 1204, 1204, 1267, 1267, 1330, 1330, - 1393, 1393, 1456, 1456, 1519, 1519, 1582, 1582, 1645, 1645, 1708, 1708, 1771, - 1771, 1834, 1834, 1897, 1897, 1960, 1960, 2023, 575, 638, 638, 701, 701, - 764, 764, 827, 827, 890, 890, 953, 953, 1016, 1016, 1079, 1079, 1142, - 1142, 1205, 1205, 1268, 1268, 1331, 1331, 1394, 1394, 1457, 1457, 1520, 1520, - 1583, 1583, 1646, 1646, 1709, 1709, 1772, 1772, 1835, 1835, 1898, 1898, 1961, - 1961, 2024, 639, 702, 702, 765, 765, 828, 828, 891, 891, 954, 954, - 1017, 1017, 1080, 1080, 1143, 1143, 1206, 1206, 1269, 1269, 1332, 1332, 1395, - 1395, 1458, 1458, 1521, 1521, 1584, 1584, 1647, 1647, 1710, 1710, 1773, 1773, - 1836, 1836, 1899, 1899, 1962, 1962, 2025, 703, 766, 766, 829, 829, 892, - 892, 955, 955, 1018, 1018, 1081, 1081, 1144, 1144, 1207, 1207, 1270, 1270, - 1333, 1333, 1396, 1396, 1459, 1459, 1522, 1522, 1585, 1585, 1648, 1648, 1711, - 1711, 1774, 1774, 1837, 1837, 1900, 1900, 1963, 1963, 2026, 767, 830, 830, - 893, 893, 956, 956, 1019, 1019, 1082, 1082, 1145, 1145, 1208, 1208, 1271, - 1271, 1334, 1334, 1397, 1397, 1460, 1460, 1523, 1523, 1586, 1586, 1649, 1649, - 1712, 1712, 1775, 1775, 1838, 1838, 1901, 1901, 1964, 1964, 2027, 831, 894, - 894, 957, 957, 1020, 1020, 1083, 1083, 1146, 1146, 1209, 1209, 1272, 1272, - 1335, 1335, 1398, 1398, 1461, 1461, 1524, 1524, 1587, 1587, 1650, 1650, 1713, - 1713, 1776, 1776, 1839, 1839, 1902, 1902, 1965, 1965, 2028, 895, 958, 958, - 1021, 1021, 1084, 1084, 1147, 1147, 1210, 1210, 1273, 1273, 1336, 1336, 1399, - 1399, 1462, 1462, 1525, 1525, 1588, 1588, 1651, 1651, 1714, 1714, 1777, 1777, - 1840, 1840, 1903, 1903, 1966, 1966, 2029, 959, 1022, 1022, 1085, 1085, 1148, - 1148, 1211, 1211, 1274, 1274, 1337, 1337, 1400, 1400, 1463, 1463, 1526, 1526, - 1589, 1589, 1652, 1652, 1715, 1715, 1778, 1778, 1841, 1841, 1904, 1904, 1967, - 1967, 2030, 1023, 1086, 1086, 1149, 1149, 1212, 
1212, 1275, 1275, 1338, 1338, - 1401, 1401, 1464, 1464, 1527, 1527, 1590, 1590, 1653, 1653, 1716, 1716, 1779, - 1779, 1842, 1842, 1905, 1905, 1968, 1968, 2031, 1087, 1150, 1150, 1213, 1213, - 1276, 1276, 1339, 1339, 1402, 1402, 1465, 1465, 1528, 1528, 1591, 1591, 1654, - 1654, 1717, 1717, 1780, 1780, 1843, 1843, 1906, 1906, 1969, 1969, 2032, 1151, - 1214, 1214, 1277, 1277, 1340, 1340, 1403, 1403, 1466, 1466, 1529, 1529, 1592, - 1592, 1655, 1655, 1718, 1718, 1781, 1781, 1844, 1844, 1907, 1907, 1970, 1970, - 2033, 1215, 1278, 1278, 1341, 1341, 1404, 1404, 1467, 1467, 1530, 1530, 1593, - 1593, 1656, 1656, 1719, 1719, 1782, 1782, 1845, 1845, 1908, 1908, 1971, 1971, - 2034, 1279, 1342, 1342, 1405, 1405, 1468, 1468, 1531, 1531, 1594, 1594, 1657, - 1657, 1720, 1720, 1783, 1783, 1846, 1846, 1909, 1909, 1972, 1972, 2035, 1343, - 1406, 1406, 1469, 1469, 1532, 1532, 1595, 1595, 1658, 1658, 1721, 1721, 1784, - 1784, 1847, 1847, 1910, 1910, 1973, 1973, 2036, 1407, 1470, 1470, 1533, 1533, - 1596, 1596, 1659, 1659, 1722, 1722, 1785, 1785, 1848, 1848, 1911, 1911, 1974, - 1974, 2037, 1471, 1534, 1534, 1597, 1597, 1660, 1660, 1723, 1723, 1786, 1786, - 1849, 1849, 1912, 1912, 1975, 1975, 2038, 1535, 1598, 1598, 1661, 1661, 1724, - 1724, 1787, 1787, 1850, 1850, 1913, 1913, 1976, 1976, 2039, 1599, 1662, 1662, - 1725, 1725, 1788, 1788, 1851, 1851, 1914, 1914, 1977, 1977, 2040, 1663, 1726, - 1726, 1789, 1789, 1852, 1852, 1915, 1915, 1978, 1978, 2041, 1727, 1790, 1790, - 1853, 1853, 1916, 1916, 1979, 1979, 2042, 1791, 1854, 1854, 1917, 1917, 1980, - 1980, 2043, 1855, 1918, 1918, 1981, 1981, 2044, 1919, 1982, 1982, 2045, 1983, - 2046, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_64x64_neighbors[4097 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 64, 1, 1, 64, 64, 2, - 65, 65, 128, 66, 129, 2, 2, 128, 128, 3, 66, 129, 192, - 67, 130, 130, 193, 3, 3, 192, 192, 4, 67, 193, 256, 131, - 194, 68, 131, 194, 257, 4, 4, 132, 195, 195, 258, 256, 256, - 5, 68, 257, 320, 69, 132, 
258, 321, 196, 259, 133, 196, 259, - 322, 5, 5, 320, 320, 6, 69, 321, 384, 70, 133, 322, 385, - 197, 260, 260, 323, 134, 197, 323, 386, 6, 6, 384, 384, 7, - 70, 261, 324, 385, 448, 198, 261, 324, 387, 71, 134, 386, 449, - 135, 198, 387, 450, 262, 325, 325, 388, 7, 7, 448, 448, 8, - 71, 199, 262, 388, 451, 449, 512, 72, 135, 450, 513, 326, 389, - 136, 199, 451, 514, 263, 326, 389, 452, 200, 263, 452, 515, 8, - 8, 512, 512, 9, 72, 513, 576, 73, 136, 327, 390, 390, 453, - 514, 577, 264, 327, 453, 516, 137, 200, 515, 578, 201, 264, 516, - 579, 391, 454, 9, 9, 328, 391, 454, 517, 576, 576, 10, 73, - 577, 640, 74, 137, 578, 641, 265, 328, 517, 580, 138, 201, 579, - 642, 392, 455, 455, 518, 202, 265, 580, 643, 329, 392, 518, 581, - 10, 10, 640, 640, 11, 74, 641, 704, 75, 138, 266, 329, 581, - 644, 642, 705, 456, 519, 139, 202, 393, 456, 519, 582, 643, 706, - 330, 393, 582, 645, 203, 266, 644, 707, 11, 11, 704, 704, 12, - 75, 457, 520, 520, 583, 705, 768, 267, 330, 645, 708, 76, 139, - 706, 769, 394, 457, 583, 646, 140, 203, 707, 770, 331, 394, 646, - 709, 204, 267, 708, 771, 521, 584, 458, 521, 584, 647, 12, 12, - 268, 331, 709, 772, 768, 768, 13, 76, 395, 458, 647, 710, 769, - 832, 77, 140, 770, 833, 141, 204, 771, 834, 332, 395, 710, 773, - 522, 585, 585, 648, 205, 268, 459, 522, 648, 711, 772, 835, 396, - 459, 711, 774, 269, 332, 773, 836, 13, 13, 832, 832, 14, 77, - 833, 896, 78, 141, 586, 649, 834, 897, 523, 586, 649, 712, 142, - 205, 333, 396, 774, 837, 835, 898, 460, 523, 712, 775, 206, 269, - 836, 899, 397, 460, 775, 838, 270, 333, 587, 650, 650, 713, 837, - 900, 14, 14, 524, 587, 713, 776, 896, 896, 15, 78, 897, 960, - 79, 142, 898, 961, 334, 397, 838, 901, 461, 524, 776, 839, 143, - 206, 899, 962, 207, 270, 900, 963, 651, 714, 588, 651, 714, 777, - 398, 461, 839, 902, 271, 334, 525, 588, 777, 840, 901, 964, 15, - 15, 960, 960, 16, 79, 961, 1024, 80, 143, 462, 525, 840, 903, - 962, 1025, 335, 398, 902, 965, 144, 207, 652, 715, 715, 778, 963, - 1026, 589, 652, 
778, 841, 208, 271, 964, 1027, 399, 462, 903, 966, - 526, 589, 841, 904, 272, 335, 965, 1028, 716, 779, 16, 16, 463, - 526, 904, 967, 1024, 1024, 17, 80, 653, 716, 779, 842, 1025, 1088, - 336, 399, 966, 1029, 81, 144, 1026, 1089, 590, 653, 842, 905, 145, - 208, 1027, 1090, 209, 272, 400, 463, 967, 1030, 1028, 1091, 527, 590, - 905, 968, 717, 780, 780, 843, 273, 336, 1029, 1092, 654, 717, 843, - 906, 464, 527, 968, 1031, 17, 17, 1088, 1088, 18, 81, 337, 400, - 591, 654, 906, 969, 1030, 1093, 1089, 1152, 82, 145, 1090, 1153, 146, - 209, 1091, 1154, 528, 591, 969, 1032, 401, 464, 781, 844, 1031, 1094, - 210, 273, 718, 781, 844, 907, 1092, 1155, 655, 718, 907, 970, 274, - 337, 1093, 1156, 465, 528, 1032, 1095, 592, 655, 970, 1033, 338, 401, - 1094, 1157, 18, 18, 1152, 1152, 19, 82, 1153, 1216, 83, 146, 782, - 845, 845, 908, 1154, 1217, 719, 782, 908, 971, 147, 210, 529, 592, - 1033, 1096, 1155, 1218, 402, 465, 1095, 1158, 211, 274, 656, 719, 971, - 1034, 1156, 1219, 275, 338, 1157, 1220, 466, 529, 1096, 1159, 593, 656, - 1034, 1097, 846, 909, 783, 846, 909, 972, 339, 402, 1158, 1221, 19, - 19, 720, 783, 972, 1035, 1216, 1216, 20, 83, 1217, 1280, 84, 147, - 1218, 1281, 530, 593, 1097, 1160, 148, 211, 1219, 1282, 403, 466, 657, - 720, 1035, 1098, 1159, 1222, 212, 275, 1220, 1283, 847, 910, 910, 973, - 594, 657, 1098, 1161, 276, 339, 467, 530, 784, 847, 973, 1036, 1160, - 1223, 1221, 1284, 721, 784, 1036, 1099, 340, 403, 1222, 1285, 20, 20, - 1280, 1280, 21, 84, 531, 594, 1161, 1224, 1281, 1344, 85, 148, 658, - 721, 1099, 1162, 1282, 1345, 404, 467, 1223, 1286, 149, 212, 911, 974, - 1283, 1346, 848, 911, 974, 1037, 213, 276, 1284, 1347, 785, 848, 1037, - 1100, 595, 658, 1162, 1225, 468, 531, 1224, 1287, 277, 340, 1285, 1348, - 722, 785, 1100, 1163, 341, 404, 1286, 1349, 532, 595, 912, 975, 975, - 1038, 1225, 1288, 659, 722, 1163, 1226, 21, 21, 1344, 1344, 22, 85, - 849, 912, 1038, 1101, 1345, 1408, 86, 149, 1346, 1409, 405, 468, 1287, - 1350, 150, 213, 786, 849, 1101, 
1164, 1347, 1410, 214, 277, 596, 659, - 1226, 1289, 1348, 1411, 469, 532, 723, 786, 1164, 1227, 1288, 1351, 278, - 341, 1349, 1412, 976, 1039, 913, 976, 1039, 1102, 342, 405, 850, 913, - 1102, 1165, 1350, 1413, 660, 723, 1227, 1290, 533, 596, 1289, 1352, 22, - 22, 1408, 1408, 23, 86, 787, 850, 1165, 1228, 1409, 1472, 87, 150, - 406, 469, 1351, 1414, 1410, 1473, 151, 214, 1411, 1474, 597, 660, 1290, - 1353, 724, 787, 1228, 1291, 215, 278, 977, 1040, 1040, 1103, 1412, 1475, - 470, 533, 1352, 1415, 914, 977, 1103, 1166, 279, 342, 1413, 1476, 851, - 914, 1166, 1229, 661, 724, 1291, 1354, 343, 406, 534, 597, 1353, 1416, - 1414, 1477, 788, 851, 1229, 1292, 23, 23, 1472, 1472, 24, 87, 1473, - 1536, 407, 470, 1041, 1104, 1415, 1478, 88, 151, 978, 1041, 1104, 1167, - 1474, 1537, 598, 661, 1354, 1417, 152, 215, 725, 788, 1292, 1355, 1475, - 1538, 915, 978, 1167, 1230, 216, 279, 1476, 1539, 471, 534, 1416, 1479, - 852, 915, 1230, 1293, 280, 343, 1477, 1540, 662, 725, 1355, 1418, 535, - 598, 789, 852, 1293, 1356, 1417, 1480, 344, 407, 1478, 1541, 1042, 1105, - 1105, 1168, 979, 1042, 1168, 1231, 24, 24, 408, 471, 916, 979, 1231, - 1294, 1479, 1542, 1536, 1536, 25, 88, 1537, 1600, 726, 789, 1356, 1419, - 89, 152, 599, 662, 1418, 1481, 1538, 1601, 153, 216, 1539, 1602, 853, - 916, 1294, 1357, 472, 535, 1480, 1543, 217, 280, 1540, 1603, 1106, 1169, - 281, 344, 663, 726, 1043, 1106, 1169, 1232, 1419, 1482, 1541, 1604, 790, - 853, 1357, 1420, 980, 1043, 1232, 1295, 536, 599, 1481, 1544, 345, 408, - 1542, 1605, 917, 980, 1295, 1358, 727, 790, 1420, 1483, 409, 472, 1543, - 1606, 25, 25, 600, 663, 1482, 1545, 1600, 1600, 26, 89, 1601, 1664, - 90, 153, 854, 917, 1358, 1421, 1602, 1665, 154, 217, 1107, 1170, 1170, - 1233, 1603, 1666, 473, 536, 1044, 1107, 1233, 1296, 1544, 1607, 218, 281, - 1604, 1667, 664, 727, 981, 1044, 1296, 1359, 1483, 1546, 791, 854, 1421, - 1484, 282, 345, 1605, 1668, 537, 600, 1545, 1608, 918, 981, 1359, 1422, - 346, 409, 1606, 1669, 728, 791, 1484, 1547, 1171, 
1234, 1108, 1171, 1234, - 1297, 410, 473, 601, 664, 855, 918, 1422, 1485, 1546, 1609, 1607, 1670, - 26, 26, 1664, 1664, 27, 90, 1045, 1108, 1297, 1360, 1665, 1728, 91, - 154, 1666, 1729, 155, 218, 1667, 1730, 474, 537, 982, 1045, 1360, 1423, - 1608, 1671, 219, 282, 792, 855, 1485, 1548, 1668, 1731, 665, 728, 1547, - 1610, 283, 346, 919, 982, 1423, 1486, 1669, 1732, 538, 601, 1609, 1672, - 1172, 1235, 1235, 1298, 347, 410, 1109, 1172, 1298, 1361, 1670, 1733, 729, - 792, 1548, 1611, 856, 919, 1486, 1549, 1046, 1109, 1361, 1424, 602, 665, - 1610, 1673, 411, 474, 1671, 1734, 27, 27, 1728, 1728, 28, 91, 983, - 1046, 1424, 1487, 1729, 1792, 92, 155, 1730, 1793, 156, 219, 475, 538, - 1672, 1735, 1731, 1794, 793, 856, 1549, 1612, 666, 729, 1611, 1674, 220, - 283, 1236, 1299, 1732, 1795, 920, 983, 1487, 1550, 1173, 1236, 1299, 1362, - 1110, 1173, 1362, 1425, 284, 347, 1733, 1796, 539, 602, 1673, 1736, 1047, - 1110, 1425, 1488, 348, 411, 730, 793, 1612, 1675, 1734, 1797, 857, 920, - 1550, 1613, 603, 666, 1674, 1737, 984, 1047, 1488, 1551, 412, 475, 1735, - 1798, 28, 28, 1237, 1300, 1300, 1363, 1792, 1792, 29, 92, 1793, 1856, - 93, 156, 794, 857, 1174, 1237, 1363, 1426, 1613, 1676, 1794, 1857, 476, - 539, 1736, 1799, 157, 220, 667, 730, 921, 984, 1551, 1614, 1675, 1738, - 1795, 1858, 1111, 1174, 1426, 1489, 221, 284, 1796, 1859, 540, 603, 1048, - 1111, 1489, 1552, 1737, 1800, 285, 348, 1797, 1860, 858, 921, 1614, 1677, - 731, 794, 1676, 1739, 349, 412, 1798, 1861, 985, 1048, 1552, 1615, 1301, - 1364, 604, 667, 1238, 1301, 1364, 1427, 1738, 1801, 413, 476, 1175, 1238, - 1427, 1490, 1799, 1862, 795, 858, 1677, 1740, 29, 29, 1112, 1175, 1490, - 1553, 1856, 1856, 30, 93, 922, 985, 1615, 1678, 1857, 1920, 94, 157, - 1858, 1921, 477, 540, 668, 731, 1739, 1802, 1800, 1863, 158, 221, 1859, - 1922, 1049, 1112, 1553, 1616, 222, 285, 1860, 1923, 541, 604, 1801, 1864, - 286, 349, 859, 922, 1302, 1365, 1365, 1428, 1678, 1741, 1861, 1924, 732, - 795, 1740, 1803, 1239, 1302, 1428, 1491, 
986, 1049, 1616, 1679, 350, 413, - 1862, 1925, 1176, 1239, 1491, 1554, 605, 668, 1802, 1865, 414, 477, 1113, - 1176, 1554, 1617, 1863, 1926, 796, 859, 1741, 1804, 923, 986, 1679, 1742, - 30, 30, 1920, 1920, 31, 94, 669, 732, 1803, 1866, 1921, 1984, 478, - 541, 1864, 1927, 95, 158, 1050, 1113, 1617, 1680, 1922, 1985, 1366, 1429, - 159, 222, 1303, 1366, 1429, 1492, 1923, 1986, 1240, 1303, 1492, 1555, 223, - 286, 1924, 1987, 860, 923, 1742, 1805, 542, 605, 1865, 1928, 733, 796, - 987, 1050, 1680, 1743, 1804, 1867, 287, 350, 1177, 1240, 1555, 1618, 1925, - 1988, 351, 414, 1926, 1989, 606, 669, 1114, 1177, 1618, 1681, 1866, 1929, - 924, 987, 1743, 1806, 415, 478, 797, 860, 1805, 1868, 1927, 1990, 1367, - 1430, 1430, 1493, 1304, 1367, 1493, 1556, 1051, 1114, 1681, 1744, 670, 733, - 1867, 1930, 31, 31, 1984, 1984, 32, 95, 479, 542, 1241, 1304, 1556, - 1619, 1928, 1991, 1985, 2048, 96, 159, 1986, 2049, 160, 223, 1987, 2050, - 861, 924, 1178, 1241, 1619, 1682, 1806, 1869, 224, 287, 988, 1051, 1744, - 1807, 1988, 2051, 543, 606, 1929, 1992, 734, 797, 1868, 1931, 288, 351, - 1989, 2052, 1115, 1178, 1682, 1745, 1431, 1494, 352, 415, 1368, 1431, 1494, - 1557, 1990, 2053, 607, 670, 1930, 1993, 925, 988, 1305, 1368, 1557, 1620, - 1807, 1870, 798, 861, 1869, 1932, 416, 479, 1052, 1115, 1745, 1808, 1991, - 2054, 1242, 1305, 1620, 1683, 671, 734, 1931, 1994, 480, 543, 1992, 2055, - 32, 32, 2048, 2048, 33, 96, 1179, 1242, 1683, 1746, 2049, 2112, 97, - 160, 2050, 2113, 862, 925, 1870, 1933, 989, 1052, 1808, 1871, 161, 224, - 2051, 2114, 225, 288, 544, 607, 735, 798, 1432, 1495, 1495, 1558, 1932, - 1995, 1993, 2056, 2052, 2115, 1116, 1179, 1746, 1809, 1369, 1432, 1558, 1621, - 289, 352, 2053, 2116, 1306, 1369, 1621, 1684, 608, 671, 1994, 2057, 353, - 416, 926, 989, 1871, 1934, 2054, 2117, 1243, 1306, 1684, 1747, 799, 862, - 1053, 1116, 1809, 1872, 1933, 1996, 417, 480, 2055, 2118, 672, 735, 1180, - 1243, 1747, 1810, 1995, 2058, 1496, 1559, 481, 544, 2056, 2119, 1433, 1496, - 1559, 
1622, 33, 33, 990, 1053, 1872, 1935, 2112, 2112, 34, 97, 863, - 926, 1934, 1997, 2113, 2176, 98, 161, 1370, 1433, 1622, 1685, 2114, 2177, - 162, 225, 1117, 1180, 1810, 1873, 2115, 2178, 736, 799, 1996, 2059, 545, - 608, 1307, 1370, 1685, 1748, 2057, 2120, 226, 289, 2116, 2179, 290, 353, - 2117, 2180, 1244, 1307, 1748, 1811, 927, 990, 1935, 1998, 609, 672, 1054, - 1117, 1873, 1936, 2058, 2121, 354, 417, 2118, 2181, 800, 863, 1997, 2060, - 1497, 1560, 1560, 1623, 1181, 1244, 1811, 1874, 418, 481, 1434, 1497, 1623, - 1686, 2119, 2182, 673, 736, 2059, 2122, 1371, 1434, 1686, 1749, 991, 1054, - 1936, 1999, 482, 545, 864, 927, 1998, 2061, 2120, 2183, 1118, 1181, 1874, - 1937, 34, 34, 1308, 1371, 1749, 1812, 2176, 2176, 35, 98, 2177, 2240, - 99, 162, 2178, 2241, 737, 800, 2060, 2123, 163, 226, 2179, 2242, 546, - 609, 2121, 2184, 227, 290, 1245, 1308, 1812, 1875, 2180, 2243, 928, 991, - 1999, 2062, 291, 354, 1055, 1118, 1561, 1624, 1937, 2000, 2181, 2244, 1498, - 1561, 1624, 1687, 610, 673, 2122, 2185, 801, 864, 1435, 1498, 1687, 1750, - 2061, 2124, 355, 418, 1182, 1245, 1875, 1938, 2182, 2245, 1372, 1435, 1750, - 1813, 419, 482, 2183, 2246, 674, 737, 2123, 2186, 992, 1055, 2000, 2063, - 1309, 1372, 1813, 1876, 865, 928, 1119, 1182, 1938, 2001, 2062, 2125, 483, - 546, 2184, 2247, 35, 35, 2240, 2240, 36, 99, 2241, 2304, 100, 163, - 738, 801, 1246, 1309, 1876, 1939, 2124, 2187, 2242, 2305, 1562, 1625, 1625, - 1688, 164, 227, 1499, 1562, 1688, 1751, 2243, 2306, 547, 610, 2185, 2248, - 228, 291, 2244, 2307, 1056, 1119, 1436, 1499, 1751, 1814, 2001, 2064, 929, - 992, 2063, 2126, 292, 355, 2245, 2308, 1183, 1246, 1939, 2002, 611, 674, - 802, 865, 1373, 1436, 1814, 1877, 2125, 2188, 2186, 2249, 356, 419, 2246, - 2309, 1310, 1373, 1877, 1940, 420, 483, 993, 1056, 2064, 2127, 2247, 2310, - 675, 738, 2187, 2250, 1120, 1183, 2002, 2065, 866, 929, 1626, 1689, 2126, - 2189, 1563, 1626, 1689, 1752, 484, 547, 1500, 1563, 1752, 1815, 2248, 2311, - 1247, 1310, 1940, 2003, 36, 36, 739, 802, 
2188, 2251, 2304, 2304, 37, - 100, 1437, 1500, 1815, 1878, 2305, 2368, 101, 164, 2306, 2369, 548, 611, - 2249, 2312, 165, 228, 1057, 1120, 2065, 2128, 2307, 2370, 930, 993, 2127, - 2190, 1374, 1437, 1878, 1941, 229, 292, 1184, 1247, 2003, 2066, 2308, 2371, - 293, 356, 803, 866, 2189, 2252, 2309, 2372, 612, 675, 2250, 2313, 1311, - 1374, 1941, 2004, 357, 420, 1627, 1690, 1690, 1753, 2310, 2373, 1564, 1627, - 1753, 1816, 994, 1057, 2128, 2191, 1121, 1184, 2066, 2129, 676, 739, 1501, - 1564, 1816, 1879, 2251, 2314, 421, 484, 2311, 2374, 867, 930, 2190, 2253, - 1248, 1311, 2004, 2067, 1438, 1501, 1879, 1942, 485, 548, 2312, 2375, 740, - 803, 2252, 2315, 37, 37, 2368, 2368, 38, 101, 1058, 1121, 1375, 1438, - 1942, 2005, 2129, 2192, 2369, 2432, 102, 165, 2370, 2433, 549, 612, 931, - 994, 1185, 1248, 2067, 2130, 2191, 2254, 2313, 2376, 166, 229, 2371, 2434, - 1691, 1754, 230, 293, 1628, 1691, 1754, 1817, 2372, 2435, 804, 867, 1312, - 1375, 2005, 2068, 2253, 2316, 1565, 1628, 1817, 1880, 294, 357, 613, 676, - 2314, 2377, 2373, 2436, 1502, 1565, 1880, 1943, 358, 421, 1122, 1185, 2130, - 2193, 2374, 2437, 995, 1058, 2192, 2255, 1249, 1312, 2068, 2131, 677, 740, - 1439, 1502, 1943, 2006, 2315, 2378, 868, 931, 2254, 2317, 422, 485, 2375, - 2438, 486, 549, 1376, 1439, 2006, 2069, 2376, 2439, 741, 804, 1692, 1755, - 1755, 1818, 2316, 2379, 1059, 1122, 2193, 2256, 1186, 1249, 1629, 1692, 1818, - 1881, 2131, 2194, 38, 38, 932, 995, 2255, 2318, 2432, 2432, 39, 102, - 2433, 2496, 103, 166, 550, 613, 1566, 1629, 1881, 1944, 2377, 2440, 2434, - 2497, 167, 230, 1313, 1376, 2069, 2132, 2435, 2498, 231, 294, 1503, 1566, - 1944, 2007, 2436, 2499, 805, 868, 2317, 2380, 614, 677, 2378, 2441, 295, - 358, 2437, 2500, 1123, 1186, 2194, 2257, 996, 1059, 2256, 2319, 1440, 1503, - 2007, 2070, 1250, 1313, 2132, 2195, 359, 422, 2438, 2501, 678, 741, 869, - 932, 2318, 2381, 2379, 2442, 1756, 1819, 423, 486, 1693, 1756, 1819, 1882, - 2439, 2502, 1377, 1440, 2070, 2133, 1630, 1693, 1882, 1945, 487, 
550, 1060, - 1123, 2257, 2320, 2440, 2503, 1187, 1250, 1567, 1630, 1945, 2008, 2195, 2258, - 742, 805, 2380, 2443, 933, 996, 2319, 2382, 1314, 1377, 2133, 2196, 39, - 39, 1504, 1567, 2008, 2071, 2496, 2496, 40, 103, 2497, 2560, 551, 614, - 2441, 2504, 104, 167, 2498, 2561, 168, 231, 2499, 2562, 806, 869, 2381, - 2444, 232, 295, 2500, 2563, 1441, 1504, 2071, 2134, 1124, 1187, 2258, 2321, - 615, 678, 2442, 2505, 296, 359, 997, 1060, 1251, 1314, 1757, 1820, 1820, - 1883, 2196, 2259, 2320, 2383, 2501, 2564, 1694, 1757, 1883, 1946, 360, 423, - 2502, 2565, 1631, 1694, 1946, 2009, 870, 933, 1378, 1441, 2134, 2197, 2382, - 2445, 679, 742, 2443, 2506, 424, 487, 1568, 1631, 2009, 2072, 2503, 2566, - 1188, 1251, 2259, 2322, 1061, 1124, 2321, 2384, 488, 551, 2504, 2567, 743, - 806, 1505, 1568, 2072, 2135, 2444, 2507, 1315, 1378, 2197, 2260, 934, 997, - 2383, 2446, 40, 40, 552, 615, 2505, 2568, 2560, 2560, 41, 104, 1821, - 1884, 2561, 2624, 1758, 1821, 1884, 1947, 105, 168, 1442, 1505, 2135, 2198, - 2562, 2625, 169, 232, 807, 870, 1695, 1758, 1947, 2010, 2445, 2508, 2563, - 2626, 1125, 1188, 2322, 2385, 1252, 1315, 2260, 2323, 233, 296, 2564, 2627, - 616, 679, 998, 1061, 1632, 1695, 2010, 2073, 2384, 2447, 2506, 2569, 297, - 360, 2565, 2628, 1379, 1442, 2198, 2261, 1569, 1632, 2073, 2136, 361, 424, - 871, 934, 2446, 2509, 2566, 2629, 680, 743, 2507, 2570, 425, 488, 1189, - 1252, 2323, 2386, 2567, 2630, 1506, 1569, 2136, 2199, 1062, 1125, 2385, 2448, - 1316, 1379, 2261, 2324, 1822, 1885, 1885, 1948, 744, 807, 2508, 2571, 489, - 552, 1759, 1822, 1948, 2011, 2568, 2631, 935, 998, 2447, 2510, 1696, 1759, - 2011, 2074, 1443, 1506, 2199, 2262, 553, 616, 2569, 2632, 41, 41, 2624, - 2624, 42, 105, 1633, 1696, 2074, 2137, 2625, 2688, 106, 169, 1126, 1189, - 2386, 2449, 2626, 2689, 808, 871, 1253, 1316, 2324, 2387, 2509, 2572, 170, - 233, 2627, 2690, 999, 1062, 2448, 2511, 234, 297, 1380, 1443, 2262, 2325, - 2628, 2691, 617, 680, 1570, 1633, 2137, 2200, 2570, 2633, 298, 361, 2629, - 
2692, 872, 935, 2510, 2573, 362, 425, 1886, 1949, 2630, 2693, 1507, 1570, - 2200, 2263, 681, 744, 1823, 1886, 1949, 2012, 2571, 2634, 1190, 1253, 2387, - 2450, 1760, 1823, 2012, 2075, 1063, 1126, 1317, 1380, 2325, 2388, 2449, 2512, - 426, 489, 2631, 2694, 1697, 1760, 2075, 2138, 745, 808, 936, 999, 1444, - 1507, 2263, 2326, 2511, 2574, 2572, 2635, 490, 553, 2632, 2695, 1634, 1697, - 2138, 2201, 1254, 1317, 2388, 2451, 554, 617, 1127, 1190, 2450, 2513, 2633, - 2696, 42, 42, 2688, 2688, 43, 106, 809, 872, 1571, 1634, 2201, 2264, - 2573, 2636, 2689, 2752, 107, 170, 1381, 1444, 2326, 2389, 2690, 2753, 1000, - 1063, 2512, 2575, 171, 234, 2691, 2754, 1887, 1950, 1950, 2013, 618, 681, - 2634, 2697, 235, 298, 1824, 1887, 2013, 2076, 2692, 2755, 1508, 1571, 2264, - 2327, 1761, 1824, 2076, 2139, 299, 362, 2693, 2756, 873, 936, 2574, 2637, - 1191, 1254, 2451, 2514, 363, 426, 682, 745, 1318, 1381, 1698, 1761, 2139, - 2202, 2389, 2452, 2635, 2698, 2694, 2757, 1064, 1127, 2513, 2576, 427, 490, - 1445, 1508, 2327, 2390, 2695, 2758, 1635, 1698, 2202, 2265, 937, 1000, 2575, - 2638, 746, 809, 2636, 2699, 491, 554, 2696, 2759, 1255, 1318, 1572, 1635, - 2265, 2328, 2452, 2515, 1951, 2014, 1128, 1191, 1888, 1951, 2014, 2077, 2514, - 2577, 1382, 1445, 2390, 2453, 555, 618, 1825, 1888, 2077, 2140, 2697, 2760, - 810, 873, 2637, 2700, 43, 43, 2752, 2752, 44, 107, 1001, 1064, 2576, - 2639, 2753, 2816, 108, 171, 1762, 1825, 2140, 2203, 2754, 2817, 172, 235, - 1509, 1572, 2328, 2391, 2755, 2818, 619, 682, 2698, 2761, 236, 299, 2756, - 2819, 1699, 1762, 2203, 2266, 874, 937, 2638, 2701, 300, 363, 1192, 1255, - 2515, 2578, 2757, 2820, 1319, 1382, 2453, 2516, 683, 746, 1065, 1128, 2577, - 2640, 2699, 2762, 364, 427, 1636, 1699, 2266, 2329, 2758, 2821, 1446, 1509, - 2391, 2454, 428, 491, 1952, 2015, 2015, 2078, 2759, 2822, 938, 1001, 1889, - 1952, 2078, 2141, 2639, 2702, 747, 810, 2700, 2763, 1573, 1636, 2329, 2392, - 1826, 1889, 2141, 2204, 492, 555, 1256, 1319, 2516, 2579, 2760, 2823, 1129, - 
1192, 1383, 1446, 2454, 2517, 2578, 2641, 1763, 1826, 2204, 2267, 556, 619, - 2761, 2824, 811, 874, 2701, 2764, 1002, 1065, 1510, 1573, 2392, 2455, 2640, - 2703, 44, 44, 1700, 1763, 2267, 2330, 2816, 2816, 45, 108, 2817, 2880, - 109, 172, 2818, 2881, 173, 236, 2819, 2882, 620, 683, 2762, 2825, 237, - 300, 1320, 1383, 2517, 2580, 2820, 2883, 1193, 1256, 2579, 2642, 875, 938, - 1637, 1700, 2330, 2393, 2702, 2765, 2016, 2079, 301, 364, 1447, 1510, 1953, - 2016, 2079, 2142, 2455, 2518, 2821, 2884, 1066, 1129, 2641, 2704, 1890, 1953, - 2142, 2205, 684, 747, 2763, 2826, 365, 428, 2822, 2885, 1827, 1890, 2205, - 2268, 1574, 1637, 2393, 2456, 429, 492, 939, 1002, 2703, 2766, 2823, 2886, - 748, 811, 1764, 1827, 2268, 2331, 2764, 2827, 1257, 1320, 2580, 2643, 1384, - 1447, 2518, 2581, 1130, 1193, 2642, 2705, 493, 556, 2824, 2887, 1511, 1574, - 2456, 2519, 1701, 1764, 2331, 2394, 812, 875, 1003, 1066, 2704, 2767, 2765, - 2828, 557, 620, 2825, 2888, 2017, 2080, 2080, 2143, 45, 45, 2880, 2880, - 46, 109, 1954, 2017, 2143, 2206, 2881, 2944, 110, 173, 1638, 1701, 2394, - 2457, 2882, 2945, 1321, 1384, 2581, 2644, 174, 237, 621, 684, 1194, 1257, - 1891, 1954, 2206, 2269, 2643, 2706, 2826, 2889, 2883, 2946, 1448, 1511, 2519, - 2582, 238, 301, 876, 939, 2766, 2829, 2884, 2947, 1828, 1891, 2269, 2332, - 1067, 1130, 2705, 2768, 302, 365, 2885, 2948, 685, 748, 1575, 1638, 2457, - 2520, 2827, 2890, 366, 429, 2886, 2949, 1765, 1828, 2332, 2395, 940, 1003, - 2767, 2830, 1258, 1321, 2644, 2707, 430, 493, 1385, 1448, 2582, 2645, 2887, - 2950, 749, 812, 2828, 2891, 1131, 1194, 1702, 1765, 2395, 2458, 2706, 2769, - 1512, 1575, 2520, 2583, 2081, 2144, 494, 557, 2018, 2081, 2144, 2207, 2888, - 2951, 1955, 2018, 2207, 2270, 1004, 1067, 2768, 2831, 813, 876, 2829, 2892, - 1892, 1955, 2270, 2333, 558, 621, 1639, 1702, 2458, 2521, 2889, 2952, 1322, - 1385, 2645, 2708, 46, 46, 2944, 2944, 47, 110, 1195, 1258, 1449, 1512, - 1829, 1892, 2333, 2396, 2583, 2646, 2707, 2770, 2945, 3008, 111, 174, 2946, - 
3009, 622, 685, 2890, 2953, 175, 238, 2947, 3010, 877, 940, 2830, 2893, - 239, 302, 1068, 1131, 1576, 1639, 2521, 2584, 2769, 2832, 2948, 3011, 1766, - 1829, 2396, 2459, 303, 366, 2949, 3012, 686, 749, 2891, 2954, 367, 430, - 2082, 2145, 2145, 2208, 2950, 3013, 1386, 1449, 2646, 2709, 1259, 1322, 2019, - 2082, 2208, 2271, 2708, 2771, 941, 1004, 1703, 1766, 2459, 2522, 2831, 2894, - 1513, 1576, 1956, 2019, 2271, 2334, 2584, 2647, 431, 494, 2951, 3014, 750, - 813, 1132, 1195, 2770, 2833, 2892, 2955, 1893, 1956, 2334, 2397, 495, 558, - 2952, 3015, 1640, 1703, 2522, 2585, 1005, 1068, 2832, 2895, 814, 877, 1830, - 1893, 2397, 2460, 2893, 2956, 559, 622, 1323, 1386, 2709, 2772, 2953, 3016, - 1450, 1513, 2647, 2710, 1196, 1259, 2771, 2834, 47, 47, 3008, 3008, 48, - 111, 1767, 1830, 2460, 2523, 3009, 3072, 1577, 1640, 2585, 2648, 112, 175, - 3010, 3073, 623, 686, 2954, 3017, 878, 941, 2146, 2209, 2894, 2957, 176, - 239, 3011, 3074, 1069, 1132, 2083, 2146, 2209, 2272, 2833, 2896, 240, 303, - 2020, 2083, 2272, 2335, 3012, 3075, 304, 367, 1704, 1767, 2523, 2586, 3013, - 3076, 687, 750, 1957, 2020, 2335, 2398, 2955, 3018, 1387, 1450, 2710, 2773, - 1260, 1323, 2772, 2835, 368, 431, 1514, 1577, 2648, 2711, 3014, 3077, 942, - 1005, 2895, 2958, 1894, 1957, 2398, 2461, 1133, 1196, 2834, 2897, 432, 495, - 751, 814, 2956, 3019, 3015, 3078, 1641, 1704, 2586, 2649, 1831, 1894, 2461, - 2524, 496, 559, 3016, 3079, 1006, 1069, 2896, 2959, 1324, 1387, 2773, 2836, - 815, 878, 1451, 1514, 2711, 2774, 2957, 3020, 2147, 2210, 2210, 2273, 1768, - 1831, 2524, 2587, 560, 623, 2084, 2147, 2273, 2336, 3017, 3080, 1197, 1260, - 2835, 2898, 1578, 1641, 2649, 2712, 2021, 2084, 2336, 2399, 48, 48, 3072, - 3072, 49, 112, 3073, 3136, 624, 687, 3018, 3081, 113, 176, 879, 942, - 1070, 1133, 1958, 2021, 2399, 2462, 2897, 2960, 2958, 3021, 3074, 3137, 177, - 240, 1705, 1768, 2587, 2650, 3075, 3138, 241, 304, 3076, 3139, 1388, 1451, - 2774, 2837, 1895, 1958, 2462, 2525, 688, 751, 1261, 1324, 1515, 1578, 2712, 
- 2775, 2836, 2899, 3019, 3082, 305, 368, 3077, 3140, 943, 1006, 2959, 3022, - 369, 432, 3078, 3141, 1134, 1197, 1642, 1705, 2650, 2713, 2898, 2961, 1832, - 1895, 2525, 2588, 752, 815, 3020, 3083, 433, 496, 2211, 2274, 3079, 3142, - 2148, 2211, 2274, 2337, 2085, 2148, 2337, 2400, 497, 560, 1007, 1070, 1452, - 1515, 1769, 1832, 2588, 2651, 2775, 2838, 2960, 3023, 3080, 3143, 1325, 1388, - 2837, 2900, 2022, 2085, 2400, 2463, 816, 879, 3021, 3084, 1579, 1642, 2713, - 2776, 1198, 1261, 2899, 2962, 561, 624, 1959, 2022, 2463, 2526, 3081, 3144, - 1706, 1769, 2651, 2714, 1071, 1134, 2961, 3024, 49, 49, 880, 943, 1896, - 1959, 2526, 2589, 3022, 3085, 3136, 3136, 50, 113, 625, 688, 3082, 3145, - 3137, 3200, 114, 177, 3138, 3201, 178, 241, 1389, 1452, 2838, 2901, 3139, - 3202, 1516, 1579, 2776, 2839, 242, 305, 1262, 1325, 2900, 2963, 3140, 3203, - 2212, 2275, 2275, 2338, 689, 752, 1833, 1896, 2589, 2652, 3083, 3146, 306, - 369, 1643, 1706, 2149, 2212, 2338, 2401, 2714, 2777, 3141, 3204, 944, 1007, - 3023, 3086, 1135, 1198, 2086, 2149, 2401, 2464, 2962, 3025, 370, 433, 3142, - 3205, 753, 816, 2023, 2086, 2464, 2527, 3084, 3147, 1770, 1833, 2652, 2715, - 434, 497, 3143, 3206, 1453, 1516, 2839, 2902, 1326, 1389, 2901, 2964, 1008, - 1071, 3024, 3087, 1580, 1643, 1960, 2023, 2527, 2590, 2777, 2840, 498, 561, - 3144, 3207, 817, 880, 1199, 1262, 2963, 3026, 3085, 3148, 1707, 1770, 2715, - 2778, 562, 625, 1897, 1960, 2590, 2653, 3145, 3208, 2276, 2339, 1072, 1135, - 3025, 3088, 2213, 2276, 2339, 2402, 881, 944, 3086, 3149, 626, 689, 1390, - 1453, 2150, 2213, 2402, 2465, 2902, 2965, 3146, 3209, 50, 50, 1517, 1580, - 2840, 2903, 3200, 3200, 51, 114, 3201, 3264, 115, 178, 1834, 1897, 2653, - 2716, 3202, 3265, 1263, 1326, 2964, 3027, 179, 242, 2087, 2150, 2465, 2528, - 3203, 3266, 1644, 1707, 2778, 2841, 243, 306, 3204, 3267, 690, 753, 3147, - 3210, 2024, 2087, 2528, 2591, 307, 370, 945, 1008, 3087, 3150, 3205, 3268, - 1136, 1199, 3026, 3089, 1771, 1834, 2716, 2779, 371, 434, 3206, 
3269, 1961, - 2024, 2591, 2654, 754, 817, 3148, 3211, 1454, 1517, 2903, 2966, 435, 498, - 1327, 1390, 1581, 1644, 2841, 2904, 2965, 3028, 3207, 3270, 1009, 1072, 3088, - 3151, 1898, 1961, 2654, 2717, 499, 562, 1200, 1263, 1708, 1771, 2277, 2340, - 2340, 2403, 2779, 2842, 3027, 3090, 3208, 3271, 818, 881, 2214, 2277, 2403, - 2466, 3149, 3212, 2151, 2214, 2466, 2529, 563, 626, 3209, 3272, 2088, 2151, - 2529, 2592, 1073, 1136, 1835, 1898, 2717, 2780, 3089, 3152, 1518, 1581, 2904, - 2967, 1391, 1454, 2966, 3029, 882, 945, 3150, 3213, 627, 690, 1645, 1708, - 2842, 2905, 3210, 3273, 51, 51, 1264, 1327, 3028, 3091, 3264, 3264, 52, - 115, 2025, 2088, 2592, 2655, 3265, 3328, 116, 179, 3266, 3329, 180, 243, - 3267, 3330, 244, 307, 1772, 1835, 2780, 2843, 3268, 3331, 691, 754, 3211, - 3274, 946, 1009, 1137, 1200, 1962, 2025, 2655, 2718, 3090, 3153, 3151, 3214, - 308, 371, 3269, 3332, 1455, 1518, 2341, 2404, 2967, 3030, 372, 435, 2278, - 2341, 2404, 2467, 3270, 3333, 1582, 1645, 2905, 2968, 755, 818, 1328, 1391, - 3029, 3092, 3212, 3275, 2215, 2278, 2467, 2530, 1899, 1962, 2718, 2781, 436, - 499, 3271, 3334, 1709, 1772, 2843, 2906, 1010, 1073, 2152, 2215, 2530, 2593, - 3152, 3215, 1201, 1264, 3091, 3154, 500, 563, 3272, 3335, 819, 882, 2089, - 2152, 2593, 2656, 3213, 3276, 1836, 1899, 2781, 2844, 564, 627, 1519, 1582, - 2968, 3031, 3273, 3336, 1392, 1455, 2026, 2089, 2656, 2719, 3030, 3093, 1074, - 1137, 3153, 3216, 1646, 1709, 2906, 2969, 883, 946, 3214, 3277, 1265, 1328, - 3092, 3155, 628, 691, 3274, 3337, 52, 52, 1773, 1836, 2844, 2907, 3328, - 3328, 53, 116, 1963, 2026, 2719, 2782, 3329, 3392, 117, 180, 2342, 2405, - 2405, 2468, 3330, 3393, 2279, 2342, 2468, 2531, 181, 244, 3331, 3394, 1138, - 1201, 3154, 3217, 245, 308, 692, 755, 2216, 2279, 2531, 2594, 3275, 3338, - 3332, 3395, 947, 1010, 3215, 3278, 1456, 1519, 3031, 3094, 309, 372, 1583, - 1646, 2969, 3032, 3333, 3396, 1900, 1963, 2782, 2845, 2153, 2216, 2594, 2657, - 1329, 1392, 3093, 3156, 373, 436, 1710, 1773, 2907, 
2970, 3334, 3397, 756, - 819, 3276, 3339, 2090, 2153, 2657, 2720, 1011, 1074, 3216, 3279, 437, 500, - 3335, 3398, 1202, 1265, 3155, 3218, 1837, 1900, 2845, 2908, 501, 564, 820, - 883, 2027, 2090, 2720, 2783, 3277, 3340, 3336, 3399, 1520, 1583, 3032, 3095, - 1393, 1456, 1647, 1710, 2970, 3033, 3094, 3157, 2406, 2469, 565, 628, 1075, - 1138, 2343, 2406, 2469, 2532, 3217, 3280, 3337, 3400, 2280, 2343, 2532, 2595, - 1964, 2027, 2783, 2846, 884, 947, 1266, 1329, 1774, 1837, 2908, 2971, 3156, - 3219, 3278, 3341, 2217, 2280, 2595, 2658, 629, 692, 3338, 3401, 53, 53, - 3392, 3392, 54, 117, 3393, 3456, 118, 181, 2154, 2217, 2658, 2721, 3394, - 3457, 182, 245, 1139, 1202, 1901, 1964, 2846, 2909, 3218, 3281, 3395, 3458, - 948, 1011, 1584, 1647, 3033, 3096, 3279, 3342, 693, 756, 1457, 1520, 3095, - 3158, 3339, 3402, 246, 309, 3396, 3459, 1711, 1774, 2091, 2154, 2721, 2784, - 2971, 3034, 310, 373, 1330, 1393, 3157, 3220, 3397, 3460, 374, 437, 3398, - 3461, 757, 820, 3340, 3403, 1838, 1901, 2909, 2972, 1012, 1075, 2028, 2091, - 2784, 2847, 3280, 3343, 1203, 1266, 3219, 3282, 438, 501, 2407, 2470, 2470, - 2533, 3399, 3462, 2344, 2407, 2533, 2596, 1521, 1584, 2281, 2344, 2596, 2659, - 3096, 3159, 821, 884, 3341, 3404, 502, 565, 1648, 1711, 3034, 3097, 3400, - 3463, 1394, 1457, 3158, 3221, 1965, 2028, 2847, 2910, 2218, 2281, 2659, 2722, - 1076, 1139, 1775, 1838, 2972, 3035, 3281, 3344, 566, 629, 3401, 3464, 1267, - 1330, 3220, 3283, 885, 948, 2155, 2218, 2722, 2785, 3342, 3405, 630, 693, - 1902, 1965, 2910, 2973, 3402, 3465, 54, 54, 2092, 2155, 2785, 2848, 3456, - 3456, 55, 118, 1585, 1648, 3097, 3160, 3457, 3520, 1140, 1203, 3282, 3345, - 119, 182, 1458, 1521, 3159, 3222, 3458, 3521, 1712, 1775, 3035, 3098, 183, - 246, 949, 1012, 3343, 3406, 3459, 3522, 694, 757, 3403, 3466, 247, 310, - 3460, 3523, 1331, 1394, 2471, 2534, 3221, 3284, 2408, 2471, 2534, 2597, 2029, - 2092, 2848, 2911, 311, 374, 1839, 1902, 2345, 2408, 2597, 2660, 2973, 3036, - 3461, 3524, 758, 821, 2282, 2345, 2660, 
2723, 3404, 3467, 375, 438, 3462, - 3525, 1013, 1076, 1204, 1267, 3283, 3346, 3344, 3407, 439, 502, 2219, 2282, - 2723, 2786, 3463, 3526, 1522, 1585, 3160, 3223, 1649, 1712, 1966, 2029, 2911, - 2974, 3098, 3161, 822, 885, 1395, 1458, 3222, 3285, 3405, 3468, 1776, 1839, - 3036, 3099, 503, 566, 3464, 3527, 2156, 2219, 2786, 2849, 1077, 1140, 3345, - 3408, 1268, 1331, 3284, 3347, 567, 630, 3465, 3528, 1903, 1966, 2974, 3037, - 886, 949, 3406, 3469, 2093, 2156, 2849, 2912, 2472, 2535, 2535, 2598, 631, - 694, 1586, 1649, 2409, 2472, 2598, 2661, 3161, 3224, 3466, 3529, 1459, 1522, - 1713, 1776, 3099, 3162, 3223, 3286, 1141, 1204, 2346, 2409, 2661, 2724, 3346, - 3409, 55, 55, 3520, 3520, 56, 119, 3521, 3584, 120, 183, 2030, 2093, - 2912, 2975, 3522, 3585, 950, 1013, 3407, 3470, 184, 247, 1332, 1395, 1840, - 1903, 2283, 2346, 2724, 2787, 3037, 3100, 3285, 3348, 3523, 3586, 695, 758, - 3467, 3530, 248, 311, 3524, 3587, 312, 375, 2220, 2283, 2787, 2850, 3525, - 3588, 759, 822, 3468, 3531, 1205, 1268, 1967, 2030, 2975, 3038, 3347, 3410, - 376, 439, 1014, 1077, 3408, 3471, 3526, 3589, 1650, 1713, 3162, 3225, 1523, - 1586, 3224, 3287, 2157, 2220, 2850, 2913, 440, 503, 1777, 1840, 3100, 3163, - 3527, 3590, 1396, 1459, 3286, 3349, 823, 886, 3469, 3532, 504, 567, 2536, - 2599, 3528, 3591, 2473, 2536, 2599, 2662, 1904, 1967, 3038, 3101, 1078, 1141, - 2094, 2157, 2913, 2976, 3409, 3472, 2410, 2473, 2662, 2725, 1269, 1332, 3348, - 3411, 568, 631, 3529, 3592, 2347, 2410, 2725, 2788, 887, 950, 3470, 3533, - 1587, 1650, 3225, 3288, 1714, 1777, 3163, 3226, 2284, 2347, 2788, 2851, 1460, - 1523, 2031, 2094, 2976, 3039, 3287, 3350, 632, 695, 3530, 3593, 1142, 1205, - 3410, 3473, 1841, 1904, 3101, 3164, 56, 56, 3584, 3584, 57, 120, 951, - 1014, 1333, 1396, 2221, 2284, 2851, 2914, 3349, 3412, 3471, 3534, 3585, 3648, - 121, 184, 3586, 3649, 696, 759, 3531, 3594, 185, 248, 3587, 3650, 249, - 312, 1968, 2031, 3039, 3102, 3588, 3651, 2158, 2221, 2914, 2977, 313, 376, - 3589, 3652, 1206, 1269, 
1651, 1714, 3226, 3289, 3411, 3474, 760, 823, 1524, - 1587, 3288, 3351, 3532, 3595, 1015, 1078, 2537, 2600, 2600, 2663, 3472, 3535, - 1778, 1841, 3164, 3227, 377, 440, 2474, 2537, 2663, 2726, 3590, 3653, 1397, - 1460, 2411, 2474, 2726, 2789, 3350, 3413, 441, 504, 2095, 2158, 2977, 3040, - 3591, 3654, 1905, 1968, 3102, 3165, 824, 887, 2348, 2411, 2789, 2852, 3533, - 3596, 505, 568, 3592, 3655, 1079, 1142, 3473, 3536, 1270, 1333, 3412, 3475, - 2285, 2348, 2852, 2915, 2032, 2095, 3040, 3103, 1588, 1651, 3289, 3352, 569, - 632, 1715, 1778, 3227, 3290, 3593, 3656, 888, 951, 3534, 3597, 1461, 1524, - 3351, 3414, 1842, 1905, 2222, 2285, 2915, 2978, 3165, 3228, 633, 696, 1143, - 1206, 3474, 3537, 3594, 3657, 1334, 1397, 3413, 3476, 952, 1015, 3535, 3598, - 1969, 2032, 2601, 2664, 3103, 3166, 57, 57, 2538, 2601, 2664, 2727, 3648, - 3648, 58, 121, 2159, 2222, 2978, 3041, 3649, 3712, 122, 185, 3650, 3713, - 697, 760, 2475, 2538, 2727, 2790, 3595, 3658, 186, 249, 3651, 3714, 250, - 313, 1652, 1715, 2412, 2475, 2790, 2853, 3290, 3353, 3652, 3715, 1525, 1588, - 1779, 1842, 3228, 3291, 3352, 3415, 1207, 1270, 3475, 3538, 314, 377, 3653, - 3716, 1016, 1079, 3536, 3599, 761, 824, 2096, 2159, 3041, 3104, 3596, 3659, - 2349, 2412, 2853, 2916, 378, 441, 1398, 1461, 1906, 1969, 3166, 3229, 3414, - 3477, 3654, 3717, 2286, 2349, 2916, 2979, 442, 505, 3655, 3718, 825, 888, - 3597, 3660, 1080, 1143, 1271, 1334, 2033, 2096, 3104, 3167, 3476, 3539, 3537, - 3600, 506, 569, 3656, 3719, 1716, 1779, 3291, 3354, 1589, 1652, 2223, 2286, - 2979, 3042, 3353, 3416, 1843, 1906, 3229, 3292, 570, 633, 889, 952, 1462, - 1525, 2602, 2665, 2665, 2728, 3415, 3478, 3598, 3661, 3657, 3720, 2539, 2602, - 2728, 2791, 2476, 2539, 2791, 2854, 1144, 1207, 2160, 2223, 3042, 3105, 3538, - 3601, 1970, 2033, 3167, 3230, 634, 697, 3658, 3721, 1335, 1398, 3477, 3540, - 2413, 2476, 2854, 2917, 953, 1016, 3599, 3662, 58, 58, 3712, 3712, 59, - 122, 3713, 3776, 123, 186, 698, 761, 1653, 1716, 2350, 2413, 2917, 2980, - 3354, 
3417, 3659, 3722, 3714, 3777, 1780, 1843, 3292, 3355, 187, 250, 2097, - 2160, 3105, 3168, 3715, 3778, 1526, 1589, 3416, 3479, 251, 314, 1208, 1271, - 3539, 3602, 3716, 3779, 1907, 1970, 3230, 3293, 1017, 1080, 2287, 2350, 2980, - 3043, 3600, 3663, 315, 378, 3717, 3780, 762, 825, 3660, 3723, 1399, 1462, - 3478, 3541, 379, 442, 3718, 3781, 2034, 2097, 3168, 3231, 2666, 2729, 2224, - 2287, 3043, 3106, 443, 506, 2603, 2666, 2729, 2792, 3719, 3782, 826, 889, - 3661, 3724, 1272, 1335, 2540, 2603, 2792, 2855, 3540, 3603, 1081, 1144, 1717, - 1780, 3355, 3418, 3601, 3664, 1590, 1653, 3417, 3480, 507, 570, 1844, 1907, - 3293, 3356, 3720, 3783, 2477, 2540, 2855, 2918, 1463, 1526, 3479, 3542, 2161, - 2224, 3106, 3169, 890, 953, 2414, 2477, 2918, 2981, 3662, 3725, 571, 634, - 1971, 2034, 3231, 3294, 3721, 3784, 1145, 1208, 3602, 3665, 1336, 1399, 3541, - 3604, 2351, 2414, 2981, 3044, 635, 698, 3722, 3785, 954, 1017, 2098, 2161, - 3169, 3232, 3663, 3726, 1654, 1717, 3418, 3481, 1781, 1844, 3356, 3419, 59, - 59, 2288, 2351, 3044, 3107, 3776, 3776, 60, 123, 1527, 1590, 3480, 3543, - 3777, 3840, 699, 762, 3723, 3786, 124, 187, 1908, 1971, 3294, 3357, 3778, - 3841, 188, 251, 3779, 3842, 1209, 1272, 3603, 3666, 2667, 2730, 2730, 2793, - 252, 315, 3780, 3843, 2604, 2667, 2793, 2856, 1018, 1081, 1400, 1463, 3542, - 3605, 3664, 3727, 316, 379, 763, 826, 2035, 2098, 2541, 2604, 2856, 2919, - 3232, 3295, 3724, 3787, 3781, 3844, 2225, 2288, 3107, 3170, 380, 443, 3782, - 3845, 2478, 2541, 2919, 2982, 1718, 1781, 3419, 3482, 444, 507, 1273, 1336, - 3604, 3667, 3783, 3846, 827, 890, 1591, 1654, 1845, 1908, 3357, 3420, 3481, - 3544, 3725, 3788, 1082, 1145, 2415, 2478, 2982, 3045, 3665, 3728, 2162, 2225, - 3170, 3233, 508, 571, 3784, 3847, 1464, 1527, 1972, 2035, 3295, 3358, 3543, - 3606, 2352, 2415, 3045, 3108, 891, 954, 3726, 3789, 572, 635, 3785, 3848, - 1146, 1209, 3666, 3729, 1337, 1400, 2099, 2162, 3233, 3296, 3605, 3668, 2289, - 2352, 3108, 3171, 2731, 2794, 636, 699, 1782, 1845, 2668, 
2731, 2794, 2857, - 3420, 3483, 3786, 3849, 1655, 1718, 3482, 3545, 955, 1018, 2605, 2668, 2857, - 2920, 3727, 3790, 1909, 1972, 3358, 3421, 1528, 1591, 3544, 3607, 2542, 2605, - 2920, 2983, 60, 60, 700, 763, 3787, 3850, 3840, 3840, 61, 124, 3841, - 3904, 125, 188, 1210, 1273, 2226, 2289, 3171, 3234, 3667, 3730, 3842, 3905, - 2036, 2099, 3296, 3359, 189, 252, 2479, 2542, 2983, 3046, 3843, 3906, 1401, - 1464, 3606, 3669, 253, 316, 1019, 1082, 3728, 3791, 3844, 3907, 764, 827, - 3788, 3851, 317, 380, 3845, 3908, 2416, 2479, 3046, 3109, 1719, 1782, 3483, - 3546, 381, 444, 1846, 1909, 2163, 2226, 3234, 3297, 3421, 3484, 3846, 3909, - 1592, 1655, 3545, 3608, 1274, 1337, 3668, 3731, 828, 891, 3789, 3852, 445, - 508, 1083, 1146, 1973, 2036, 2353, 2416, 3109, 3172, 3359, 3422, 3729, 3792, - 3847, 3910, 1465, 1528, 3607, 3670, 509, 572, 2732, 2795, 2795, 2858, 3848, - 3911, 2669, 2732, 2858, 2921, 2100, 2163, 3297, 3360, 892, 955, 2290, 2353, - 3172, 3235, 3790, 3853, 2606, 2669, 2921, 2984, 573, 636, 3849, 3912, 1147, - 1210, 1338, 1401, 3669, 3732, 3730, 3793, 1783, 1846, 2543, 2606, 2984, 3047, - 3484, 3547, 1656, 1719, 3546, 3609, 1910, 1973, 3422, 3485, 637, 700, 3850, - 3913, 956, 1019, 1529, 1592, 2480, 2543, 3047, 3110, 3608, 3671, 3791, 3854, - 2227, 2290, 3235, 3298, 2037, 2100, 3360, 3423, 701, 764, 1211, 1274, 3731, - 3794, 3851, 3914, 61, 61, 3904, 3904, 62, 125, 2417, 2480, 3110, 3173, - 3905, 3968, 126, 189, 1402, 1465, 3670, 3733, 3906, 3969, 190, 253, 3907, - 3970, 1020, 1083, 3792, 3855, 254, 317, 2164, 2227, 3298, 3361, 3908, 3971, - 765, 828, 1720, 1783, 3547, 3610, 3852, 3915, 1847, 1910, 3485, 3548, 318, - 381, 2354, 2417, 3173, 3236, 3909, 3972, 2796, 2859, 1593, 1656, 2733, 2796, - 2859, 2922, 3609, 3672, 1974, 2037, 3423, 3486, 382, 445, 2670, 2733, 2922, - 2985, 3910, 3973, 1275, 1338, 3732, 3795, 1084, 1147, 3793, 3856, 829, 892, - 2607, 2670, 2985, 3048, 3853, 3916, 446, 509, 1466, 1529, 3671, 3734, 3911, - 3974, 2291, 2354, 3236, 3299, 2101, 
2164, 3361, 3424, 2544, 2607, 3048, 3111, - 510, 573, 3912, 3975, 893, 956, 3854, 3917, 1784, 1847, 3548, 3611, 1339, - 1402, 2481, 2544, 3111, 3174, 3733, 3796, 1148, 1211, 3794, 3857, 574, 637, - 1657, 1720, 1911, 1974, 3486, 3549, 3610, 3673, 3913, 3976, 2228, 2291, 3299, - 3362, 1530, 1593, 2038, 2101, 3424, 3487, 3672, 3735, 638, 701, 2418, 2481, - 3174, 3237, 3914, 3977, 957, 1020, 3855, 3918, 1212, 1275, 2797, 2860, 2860, - 2923, 3795, 3858, 702, 765, 1403, 1466, 2165, 2228, 2734, 2797, 2923, 2986, - 3362, 3425, 3734, 3797, 3915, 3978, 62, 62, 3968, 3968, 63, 126, 2355, - 2418, 3237, 3300, 3969, 4032, 127, 190, 2671, 2734, 2986, 3049, 3970, 4033, - 1021, 1084, 1848, 1911, 3549, 3612, 3856, 3919, 191, 254, 1721, 1784, 3611, - 3674, 3971, 4034, 255, 318, 2608, 2671, 3049, 3112, 3972, 4035, 1975, 2038, - 3487, 3550, 766, 829, 3916, 3979, 1594, 1657, 3673, 3736, 319, 382, 3973, - 4036, 1276, 1339, 2292, 2355, 3300, 3363, 3796, 3859, 2545, 2608, 3112, 3175, - 383, 446, 2102, 2165, 3425, 3488, 3974, 4037, 1085, 1148, 1467, 1530, 3735, - 3798, 3857, 3920, 830, 893, 3917, 3980, 447, 510, 3975, 4038, 2482, 2545, - 3175, 3238, 511, 574, 1785, 1848, 3612, 3675, 3976, 4039, 2229, 2292, 3363, - 3426, 1912, 1975, 3550, 3613, 894, 957, 1658, 1721, 3674, 3737, 3918, 3981, - 1340, 1403, 3797, 3860, 1149, 1212, 2419, 2482, 3238, 3301, 3858, 3921, 2039, - 2102, 3488, 3551, 575, 638, 2861, 2924, 3977, 4040, 2798, 2861, 2924, 2987, - 1531, 1594, 3736, 3799, 2735, 2798, 2987, 3050, 2672, 2735, 3050, 3113, 639, - 702, 958, 1021, 3919, 3982, 3978, 4041, 2166, 2229, 3426, 3489, 2356, 2419, - 3301, 3364, 1213, 1276, 2609, 2672, 3113, 3176, 3859, 3922, 1404, 1467, 3798, - 3861, 703, 766, 1849, 1912, 3613, 3676, 3979, 4042, 1722, 1785, 3675, 3738, - 1976, 2039, 3551, 3614, 1022, 1085, 2546, 2609, 3176, 3239, 3920, 3983, 2293, - 2356, 3364, 3427, 1595, 1658, 3737, 3800, 767, 830, 3980, 4043, 2103, 2166, - 3489, 3552, 1277, 1340, 3860, 3923, 2483, 2546, 3239, 3302, 1468, 1531, 3799, - 
3862, 1086, 1149, 3921, 3984, 831, 894, 3981, 4044, 2230, 2293, 2862, 2925, - 2925, 2988, 3427, 3490, 2799, 2862, 2988, 3051, 1786, 1849, 2420, 2483, 3302, - 3365, 3676, 3739, 1913, 1976, 3614, 3677, 2736, 2799, 3051, 3114, 1659, 1722, - 3738, 3801, 2040, 2103, 3552, 3615, 1341, 1404, 3861, 3924, 895, 958, 2673, - 2736, 3114, 3177, 3982, 4045, 1150, 1213, 3922, 3985, 1532, 1595, 3800, 3863, - 2357, 2420, 3365, 3428, 2167, 2230, 2610, 2673, 3177, 3240, 3490, 3553, 959, - 1022, 3983, 4046, 2547, 2610, 3240, 3303, 1214, 1277, 1405, 1468, 1850, 1913, - 3677, 3740, 3862, 3925, 3923, 3986, 1723, 1786, 1977, 2040, 3615, 3678, 3739, - 3802, 2294, 2357, 3428, 3491, 1023, 1086, 1596, 1659, 2104, 2167, 2484, 2547, - 3303, 3366, 3553, 3616, 3801, 3864, 3984, 4047, 2926, 2989, 2863, 2926, 2989, - 3052, 2800, 2863, 3052, 3115, 1278, 1341, 3924, 3987, 1469, 1532, 2231, 2294, - 2737, 2800, 3115, 3178, 3491, 3554, 3863, 3926, 2421, 2484, 3366, 3429, 1087, - 1150, 3985, 4048, 1914, 1977, 2674, 2737, 3178, 3241, 3678, 3741, 1787, 1850, - 3740, 3803, 2041, 2104, 3616, 3679, 1660, 1723, 3802, 3865, 2611, 2674, 3241, - 3304, 1342, 1405, 2358, 2421, 3429, 3492, 3925, 3988, 2168, 2231, 3554, 3617, - 1151, 1214, 3986, 4049, 1533, 1596, 3864, 3927, 2548, 2611, 3304, 3367, 2295, - 2358, 3492, 3555, 1851, 1914, 3741, 3804, 1978, 2041, 2927, 2990, 2990, 3053, - 3679, 3742, 1406, 1469, 3926, 3989, 1724, 1787, 2864, 2927, 3053, 3116, 3803, - 3866, 1215, 1278, 2485, 2548, 3367, 3430, 3987, 4050, 2801, 2864, 3116, 3179, - 2105, 2168, 3617, 3680, 1597, 1660, 3865, 3928, 2738, 2801, 3179, 3242, 2422, - 2485, 3430, 3493, 2232, 2295, 3555, 3618, 2675, 2738, 3242, 3305, 1279, 1342, - 3988, 4051, 1470, 1533, 3927, 3990, 1915, 1978, 3742, 3805, 1788, 1851, 3804, - 3867, 2612, 2675, 3305, 3368, 2042, 2105, 3680, 3743, 2359, 2422, 3493, 3556, - 1661, 1724, 3866, 3929, 2169, 2232, 3618, 3681, 2549, 2612, 3368, 3431, 1343, - 1406, 3989, 4052, 2991, 3054, 1534, 1597, 2928, 2991, 3054, 3117, 3928, 3991, - 
2865, 2928, 3117, 3180, 2296, 2359, 3556, 3619, 2802, 2865, 3180, 3243, 2486, - 2549, 3431, 3494, 1852, 1915, 3805, 3868, 1979, 2042, 3743, 3806, 1725, 1788, - 2739, 2802, 3243, 3306, 3867, 3930, 1407, 1470, 2106, 2169, 3681, 3744, 3990, - 4053, 2676, 2739, 3306, 3369, 1598, 1661, 2423, 2486, 3494, 3557, 3929, 3992, - 2233, 2296, 3619, 3682, 2613, 2676, 3369, 3432, 1471, 1534, 3991, 4054, 1916, - 1979, 3806, 3869, 1789, 1852, 2043, 2106, 2360, 2423, 3557, 3620, 3744, 3807, - 3868, 3931, 2992, 3055, 3055, 3118, 2550, 2613, 3432, 3495, 2929, 2992, 3118, - 3181, 1662, 1725, 2170, 2233, 3682, 3745, 3930, 3993, 2866, 2929, 3181, 3244, - 2803, 2866, 3244, 3307, 1535, 1598, 2297, 2360, 3620, 3683, 3992, 4055, 2487, - 2550, 3495, 3558, 2740, 2803, 3307, 3370, 1980, 2043, 3807, 3870, 1853, 1916, - 3869, 3932, 2107, 2170, 3745, 3808, 1726, 1789, 2677, 2740, 3370, 3433, 3931, - 3994, 2424, 2487, 3558, 3621, 2234, 2297, 3683, 3746, 1599, 1662, 3993, 4056, - 2614, 2677, 3433, 3496, 3056, 3119, 2993, 3056, 3119, 3182, 2930, 2993, 3182, - 3245, 2361, 2424, 3621, 3684, 1917, 1980, 3870, 3933, 2044, 2107, 3808, 3871, - 2551, 2614, 3496, 3559, 2867, 2930, 3245, 3308, 1790, 1853, 3932, 3995, 2171, - 2234, 3746, 3809, 2804, 2867, 3308, 3371, 1663, 1726, 3994, 4057, 2488, 2551, - 3559, 3622, 2741, 2804, 3371, 3434, 2298, 2361, 3684, 3747, 2678, 2741, 3434, - 3497, 1981, 2044, 3871, 3934, 1854, 1917, 3933, 3996, 2108, 2171, 3809, 3872, - 2425, 2488, 3622, 3685, 1727, 1790, 3995, 4058, 3057, 3120, 3120, 3183, 2235, - 2298, 2615, 2678, 3497, 3560, 3747, 3810, 2994, 3057, 3183, 3246, 2931, 2994, - 3246, 3309, 2868, 2931, 3309, 3372, 2362, 2425, 3685, 3748, 2552, 2615, 3560, - 3623, 1918, 1981, 3934, 3997, 2045, 2108, 2805, 2868, 3372, 3435, 3872, 3935, - 1791, 1854, 3996, 4059, 2172, 2235, 3810, 3873, 2742, 2805, 3435, 3498, 2489, - 2552, 3623, 3686, 2299, 2362, 3748, 3811, 2679, 2742, 3498, 3561, 3121, 3184, - 3058, 3121, 3184, 3247, 1982, 2045, 3935, 3998, 2426, 2489, 3686, 3749, 1855, - 
1918, 2109, 2172, 2995, 3058, 3247, 3310, 3873, 3936, 3997, 4060, 2616, 2679, - 3561, 3624, 2932, 2995, 3310, 3373, 2236, 2299, 3811, 3874, 2869, 2932, 3373, - 3436, 2553, 2616, 3624, 3687, 2363, 2426, 3749, 3812, 2806, 2869, 3436, 3499, - 2046, 2109, 3936, 3999, 1919, 1982, 3998, 4061, 2743, 2806, 3499, 3562, 2173, - 2236, 3874, 3937, 2490, 2553, 3687, 3750, 2300, 2363, 3812, 3875, 2680, 2743, - 3562, 3625, 3122, 3185, 3185, 3248, 3059, 3122, 3248, 3311, 2996, 3059, 3311, - 3374, 2427, 2490, 2933, 2996, 3374, 3437, 3750, 3813, 1983, 2046, 2617, 2680, - 3625, 3688, 3999, 4062, 2110, 2173, 3937, 4000, 2870, 2933, 3437, 3500, 2237, - 2300, 3875, 3938, 2807, 2870, 3500, 3563, 2554, 2617, 3688, 3751, 2364, 2427, - 3813, 3876, 2744, 2807, 3563, 3626, 2047, 2110, 4000, 4063, 2174, 2237, 3186, - 3249, 3938, 4001, 2491, 2554, 3123, 3186, 3249, 3312, 3751, 3814, 3060, 3123, - 3312, 3375, 2681, 2744, 3626, 3689, 2301, 2364, 3876, 3939, 2997, 3060, 3375, - 3438, 2934, 2997, 3438, 3501, 2428, 2491, 3814, 3877, 2618, 2681, 3689, 3752, - 2871, 2934, 3501, 3564, 2111, 2174, 4001, 4064, 2238, 2301, 3939, 4002, 2808, - 2871, 3564, 3627, 2555, 2618, 3752, 3815, 2365, 2428, 3877, 3940, 2745, 2808, - 3627, 3690, 3187, 3250, 3250, 3313, 3124, 3187, 3313, 3376, 3061, 3124, 3376, - 3439, 2492, 2555, 3815, 3878, 2175, 2238, 2998, 3061, 3439, 3502, 4002, 4065, - 2682, 2745, 3690, 3753, 2302, 2365, 3940, 4003, 2935, 2998, 3502, 3565, 2872, - 2935, 3565, 3628, 2619, 2682, 3753, 3816, 2429, 2492, 3878, 3941, 2809, 2872, - 3628, 3691, 2239, 2302, 4003, 4066, 2556, 2619, 3816, 3879, 3251, 3314, 3188, - 3251, 3314, 3377, 3125, 3188, 3377, 3440, 2366, 2429, 2746, 2809, 3691, 3754, - 3941, 4004, 3062, 3125, 3440, 3503, 2999, 3062, 3503, 3566, 2493, 2556, 3879, - 3942, 2683, 2746, 3754, 3817, 2936, 2999, 3566, 3629, 2303, 2366, 4004, 4067, - 2873, 2936, 3629, 3692, 2620, 2683, 3817, 3880, 2430, 2493, 3942, 4005, 2810, - 2873, 3692, 3755, 3252, 3315, 3315, 3378, 3189, 3252, 3378, 3441, 3126, 3189, - 
3441, 3504, 2557, 2620, 3880, 3943, 3063, 3126, 3504, 3567, 2747, 2810, 3755, - 3818, 2367, 2430, 4005, 4068, 3000, 3063, 3567, 3630, 2684, 2747, 3818, 3881, - 2494, 2557, 2937, 3000, 3630, 3693, 3943, 4006, 2874, 2937, 3693, 3756, 2621, - 2684, 3881, 3944, 3316, 3379, 3253, 3316, 3379, 3442, 2431, 2494, 4006, 4069, - 3190, 3253, 3442, 3505, 2811, 2874, 3756, 3819, 3127, 3190, 3505, 3568, 3064, - 3127, 3568, 3631, 2558, 2621, 3944, 4007, 2748, 2811, 3819, 3882, 3001, 3064, - 3631, 3694, 2938, 3001, 3694, 3757, 2685, 2748, 3882, 3945, 2495, 2558, 4007, - 4070, 2875, 2938, 3757, 3820, 3317, 3380, 3380, 3443, 3254, 3317, 3443, 3506, - 2622, 2685, 3191, 3254, 3506, 3569, 3945, 4008, 2812, 2875, 3820, 3883, 3128, - 3191, 3569, 3632, 3065, 3128, 3632, 3695, 2559, 2622, 4008, 4071, 2749, 2812, - 3883, 3946, 3002, 3065, 3695, 3758, 2939, 3002, 3758, 3821, 2686, 2749, 3946, - 4009, 3381, 3444, 3318, 3381, 3444, 3507, 2876, 2939, 3821, 3884, 3255, 3318, - 3507, 3570, 3192, 3255, 3570, 3633, 2623, 2686, 3129, 3192, 3633, 3696, 4009, - 4072, 2813, 2876, 3884, 3947, 3066, 3129, 3696, 3759, 3003, 3066, 3759, 3822, - 2750, 2813, 3947, 4010, 2940, 3003, 3822, 3885, 3382, 3445, 3445, 3508, 3319, - 3382, 3508, 3571, 2687, 2750, 4010, 4073, 3256, 3319, 3571, 3634, 2877, 2940, - 3885, 3948, 3193, 3256, 3634, 3697, 3130, 3193, 3697, 3760, 2814, 2877, 3948, - 4011, 3067, 3130, 3760, 3823, 3004, 3067, 3823, 3886, 2751, 2814, 4011, 4074, - 3446, 3509, 3383, 3446, 3509, 3572, 2941, 3004, 3886, 3949, 3320, 3383, 3572, - 3635, 3257, 3320, 3635, 3698, 3194, 3257, 3698, 3761, 2878, 2941, 3949, 4012, - 3131, 3194, 3761, 3824, 3068, 3131, 3824, 3887, 2815, 2878, 4012, 4075, 3005, - 3068, 3887, 3950, 3447, 3510, 3510, 3573, 3384, 3447, 3573, 3636, 3321, 3384, - 3636, 3699, 2942, 3005, 3950, 4013, 3258, 3321, 3699, 3762, 3195, 3258, 3762, - 3825, 2879, 2942, 4013, 4076, 3132, 3195, 3825, 3888, 3069, 3132, 3888, 3951, - 3511, 3574, 3448, 3511, 3574, 3637, 3006, 3069, 3951, 4014, 3385, 3448, 3637, - 
3700, 3322, 3385, 3700, 3763, 3259, 3322, 3763, 3826, 2943, 3006, 4014, 4077, - 3196, 3259, 3826, 3889, 3133, 3196, 3889, 3952, 3070, 3133, 3952, 4015, 3512, - 3575, 3575, 3638, 3449, 3512, 3638, 3701, 3386, 3449, 3701, 3764, 3007, 3070, - 4015, 4078, 3323, 3386, 3764, 3827, 3260, 3323, 3827, 3890, 3197, 3260, 3890, - 3953, 3134, 3197, 3953, 4016, 3576, 3639, 3071, 3134, 4016, 4079, 3513, 3576, - 3639, 3702, 3450, 3513, 3702, 3765, 3387, 3450, 3765, 3828, 3324, 3387, 3828, - 3891, 3261, 3324, 3891, 3954, 3198, 3261, 3954, 4017, 3135, 3198, 4017, 4080, - 3577, 3640, 3640, 3703, 3514, 3577, 3703, 3766, 3451, 3514, 3766, 3829, 3388, - 3451, 3829, 3892, 3325, 3388, 3892, 3955, 3262, 3325, 3955, 4018, 3199, 3262, - 4018, 4081, 3641, 3704, 3578, 3641, 3704, 3767, 3515, 3578, 3767, 3830, 3452, - 3515, 3830, 3893, 3389, 3452, 3893, 3956, 3326, 3389, 3956, 4019, 3263, 3326, - 4019, 4082, 3642, 3705, 3705, 3768, 3579, 3642, 3768, 3831, 3516, 3579, 3831, - 3894, 3453, 3516, 3894, 3957, 3390, 3453, 3957, 4020, 3327, 3390, 4020, 4083, - 3706, 3769, 3643, 3706, 3769, 3832, 3580, 3643, 3832, 3895, 3517, 3580, 3895, - 3958, 3454, 3517, 3958, 4021, 3391, 3454, 4021, 4084, 3707, 3770, 3770, 3833, - 3644, 3707, 3833, 3896, 3581, 3644, 3896, 3959, 3518, 3581, 3959, 4022, 3455, - 3518, 4022, 4085, 3771, 3834, 3708, 3771, 3834, 3897, 3645, 3708, 3897, 3960, - 3582, 3645, 3960, 4023, 3519, 3582, 4023, 4086, 3772, 3835, 3835, 3898, 3709, - 3772, 3898, 3961, 3646, 3709, 3961, 4024, 3583, 3646, 4024, 4087, 3836, 3899, - 3773, 3836, 3899, 3962, 3710, 3773, 3962, 4025, 3647, 3710, 4025, 4088, 3837, - 3900, 3900, 3963, 3774, 3837, 3963, 4026, 3711, 3774, 4026, 4089, 3901, 3964, - 3838, 3901, 3964, 4027, 3775, 3838, 4027, 4090, 3902, 3965, 3965, 4028, 3839, - 3902, 4028, 4091, 3966, 4029, 3903, 3966, 4029, 4092, 3967, 4030, 4030, 4093, - 4031, 4094, 0, 0, -}; -#endif // CONFIG_TX64X64 - -#if CONFIG_CHROMA_2X2 -DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_2x2[4]) = { 0, 1, 2, - 3 
}; -#endif - DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { - 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, }; @@ -5228,19 +2387,10 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_4x4[16]) = { - 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_4x4[16]) = { - 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, -}; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { - 0, 1, 4, 9, 2, 3, 6, 11, 5, 7, 8, 13, 10, 12, 14, 17, - 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { @@ -5254,8 +2404,8 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = { }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { - 0, 1, 4, 9, 15, 19, 24, 28, 2, 3, 6, 11, 16, 21, 25, 29, - 5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31, + 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, + 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { @@ -5269,20 +2419,19 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { - 0, 1, 4, 9, 2, 3, 6, 11, 5, 7, 8, 13, 10, 12, 14, 17, - 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, + 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, + 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { - 0, 1, 4, 9, 15, 19, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, - 2, 3, 6, 11, 16, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 5, 7, 8, 13, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 10, 12, 14, 17, 20, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, + 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, + 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, + 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -5310,7 +2459,6 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, @@ -5330,30 +2478,30 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, + 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { - 0, 1, 3, 
6, 10, 15, 21, 28, 36, 44, 52, 60, 68, 76, 84, - 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, - 212, 220, 2, 4, 7, 11, 16, 22, 29, 37, 45, 53, 61, 69, 77, - 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, - 205, 213, 221, 228, 5, 8, 12, 17, 23, 30, 38, 46, 54, 62, 70, - 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, - 198, 206, 214, 222, 229, 235, 9, 13, 18, 24, 31, 39, 47, 55, 63, - 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, - 191, 199, 207, 215, 223, 230, 236, 241, 14, 19, 25, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, - 184, 192, 200, 208, 216, 224, 231, 237, 242, 246, 20, 26, 33, 41, 49, - 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, - 177, 185, 193, 201, 209, 217, 225, 232, 238, 243, 247, 250, 27, 34, 42, - 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, - 170, 178, 186, 194, 202, 210, 218, 226, 233, 239, 244, 248, 251, 253, 35, - 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, - 163, 171, 179, 187, 195, 203, 211, 219, 227, 234, 240, 245, 249, 252, 254, + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, + 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, + 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, + 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, + 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, + 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, + 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, + 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, + 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 
134, 142, 150, 158, 166, + 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, + 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, + 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, 255, }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, @@ -5435,9 +2583,7 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, }; -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, @@ -5451,27 +2597,12 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_8x8[64]) = { - 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, - 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, - 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, - 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_8x8[64]) = { - 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, - 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, - 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, - 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, -}; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { - 
0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, - 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, - 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, - 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { @@ -5486,14 +2617,14 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { - 0, 1, 3, 6, 10, 15, 21, 28, 36, 44, 52, 60, 68, 76, 84, 92, - 2, 4, 7, 11, 16, 22, 29, 37, 45, 53, 61, 69, 77, 85, 93, 100, - 5, 8, 12, 17, 23, 30, 38, 46, 54, 62, 70, 78, 86, 94, 101, 107, - 9, 13, 18, 24, 31, 39, 47, 55, 63, 71, 79, 87, 95, 102, 108, 113, - 14, 19, 25, 32, 40, 48, 56, 64, 72, 80, 88, 96, 103, 109, 114, 118, - 20, 26, 33, 41, 49, 57, 65, 73, 81, 89, 97, 104, 110, 115, 119, 122, - 27, 34, 42, 50, 58, 66, 74, 82, 90, 98, 105, 111, 116, 120, 123, 125, - 35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127, + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, + 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, + 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, + 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, + 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, + 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, + 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, + 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { @@ -5581,41 +2712,41 @@ DECLARE_ALIGNED(16, static const int16_t, 
av1_default_iscan_16x32[512]) = { }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { - 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, - 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, - 360, 376, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, - 106, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, - 345, 361, 377, 392, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, - 93, 107, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, - 330, 346, 362, 378, 393, 407, 9, 13, 18, 24, 31, 39, 48, 58, 69, - 81, 94, 108, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, - 315, 331, 347, 363, 379, 394, 408, 421, 14, 19, 25, 32, 40, 49, 59, - 70, 82, 95, 109, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, - 300, 316, 332, 348, 364, 380, 395, 409, 422, 434, 20, 26, 33, 41, 50, - 60, 71, 83, 96, 110, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, - 285, 301, 317, 333, 349, 365, 381, 396, 410, 423, 435, 446, 27, 34, 42, - 51, 61, 72, 84, 97, 111, 126, 142, 158, 174, 190, 206, 222, 238, 254, - 270, 286, 302, 318, 334, 350, 366, 382, 397, 411, 424, 436, 447, 457, 35, - 43, 52, 62, 73, 85, 98, 112, 127, 143, 159, 175, 191, 207, 223, 239, - 255, 271, 287, 303, 319, 335, 351, 367, 383, 398, 412, 425, 437, 448, 458, - 467, 44, 53, 63, 74, 86, 99, 113, 128, 144, 160, 176, 192, 208, 224, - 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 399, 413, 426, 438, 449, - 459, 468, 476, 54, 64, 75, 87, 100, 114, 129, 145, 161, 177, 193, 209, - 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 400, 414, 427, 439, - 450, 460, 469, 477, 484, 65, 76, 88, 101, 115, 130, 146, 162, 178, 194, - 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 401, 415, 428, - 440, 451, 461, 470, 478, 485, 491, 77, 89, 102, 116, 131, 147, 163, 179, - 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 402, 416, - 429, 441, 452, 462, 471, 479, 486, 492, 497, 90, 103, 117, 132, 148, 
164, - 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 403, - 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 104, 118, 133, 149, - 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, - 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, 119, 134, - 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, - 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, 509, + 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, - 375, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, - 510, 511, + 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, + 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, + 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, + 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, + 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, + 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, + 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, + 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, + 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, + 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, + 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, + 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, + 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, + 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, + 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, + 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, + 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, + 
222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, + 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, + 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, + 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, + 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, + 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, + 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, + 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, + 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, + 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, + 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, + 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510, + 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, + 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506, + 509, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { @@ -5767,7 +2898,6 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { 510, 511, }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, @@ -5807,70 +2937,28 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; -#endif // CONFIG_EXT_TX - -DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_16x16[256]) = { - 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, - 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, - 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, - 3, 
10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, - 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, - 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, - 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, - 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, - 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, - 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, - 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, - 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, - 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, - 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, - 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, - 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_16x16[256]) = { - 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, - 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, - 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, - 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, - 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, - 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, - 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, - 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, - 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, - 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, - 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, - 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129, - 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, - 145, 153, 160, 174, 178, 184, 197, 204, 216, 
231, 237, 244, 246, 141, 149, - 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, - 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, - 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, - 255, -}; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = { - 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, - 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, - 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, - 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, - 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, - 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, - 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, - 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, - 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, - 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94, - 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, - 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, - 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, - 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, - 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, - 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, - 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, - 255, + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, + 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, + 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, + 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, + 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, + 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, + 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 
97, 113, + 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, + 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, + 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, + 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109, + 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, + 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, + 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, + 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, + 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, + 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, + 255 }; -#if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, @@ -6029,1423 +3117,118 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = { 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, }; -#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { - 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, - 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, - 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45, - 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, - 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, - 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, - 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, - 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, - 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, - 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, - 19, 29, 37, 50, 65, 78, 95, 116, 134, 
157, 179, 201, 214, - 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, - 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, - 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, - 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, - 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, - 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, - 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, - 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, - 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, - 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, - 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, - 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, - 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, - 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, - 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, - 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, - 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, - 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, - 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, - 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, - 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, - 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, - 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, - 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, - 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, - 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, - 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, - 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, - 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283, - 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, - 701, 717, 739, 787, 
796, 810, 829, 867, 875, 887, 903, 153, 166, - 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, - 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, - 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, - 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737, - 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, - 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, - 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, - 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, - 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, - 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, - 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, - 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, - 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, - 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, - 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, - 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, - 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, - 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, - 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, - 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, - 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, - 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, - 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, - 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, - 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554, - 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, - 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, - 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790, - 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, - 991, 
1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, - 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, - 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, - 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, - 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, - 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, - 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, - 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_v2_iscan_32x32[1024]) = { - 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121, - 142, 166, 189, 512, 518, 527, 539, 551, 566, 584, 602, 621, 644, - 668, 695, 721, 748, 780, 811, 2, 3, 6, 11, 17, 26, 35, - 45, 58, 73, 90, 106, 123, 146, 168, 193, 513, 519, 528, 540, - 553, 567, 585, 603, 622, 647, 670, 696, 722, 751, 783, 812, 5, - 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150, - 170, 195, 514, 521, 530, 541, 554, 569, 587, 605, 625, 649, 671, - 699, 725, 752, 785, 815, 10, 12, 14, 19, 23, 31, 41, 52, - 65, 81, 96, 113, 133, 152, 175, 201, 515, 522, 531, 542, 556, - 572, 589, 607, 629, 651, 673, 700, 726, 757, 788, 819, 16, 18, - 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181, - 203, 516, 523, 534, 545, 559, 574, 591, 610, 632, 654, 679, 704, - 730, 762, 791, 824, 25, 27, 29, 32, 40, 46, 54, 67, 79, - 94, 109, 127, 143, 164, 185, 210, 517, 525, 535, 547, 561, 578, - 595, 615, 635, 656, 684, 707, 737, 766, 793, 830, 34, 36, 38, - 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219, - 520, 529, 538, 550, 565, 580, 598, 618, 639, 664, 687, 712, 741, - 769, 802, 833, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114, - 131, 147, 162, 183, 208, 227, 524, 533, 544, 557, 571, 588, 606, - 623, 645, 667, 692, 720, 747, 776, 806, 838, 57, 61, 63, 66, - 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 216, 233, 526, - 536, 548, 562, 577, 593, 613, 633, 653, 676, 701, 727, 756, 786, - 814, 847, 
72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156, - 173, 190, 211, 229, 246, 532, 543, 555, 568, 581, 601, 619, 637, - 663, 685, 709, 738, 763, 792, 826, 855, 89, 91, 93, 97, 101, - 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 255, 537, 549, - 560, 576, 592, 608, 628, 650, 669, 693, 719, 744, 773, 805, 834, - 862, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205, - 221, 236, 251, 267, 546, 558, 570, 583, 600, 617, 636, 657, 680, - 706, 729, 758, 787, 813, 846, 871, 122, 126, 130, 134, 138, 144, - 155, 163, 180, 191, 207, 222, 232, 248, 264, 278, 552, 564, 579, - 594, 609, 630, 648, 666, 688, 715, 742, 768, 797, 827, 856, 877, - 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 237, 249, - 262, 275, 289, 563, 575, 590, 604, 620, 638, 660, 683, 705, 728, - 753, 779, 809, 839, 866, 889, 167, 169, 172, 178, 182, 188, 198, - 209, 217, 230, 242, 252, 265, 276, 288, 301, 573, 586, 599, 616, - 634, 652, 672, 694, 716, 743, 767, 794, 825, 850, 874, 899, 192, - 194, 196, 202, 204, 213, 220, 228, 234, 247, 256, 268, 279, 290, - 302, 315, 582, 597, 614, 631, 646, 665, 686, 708, 732, 759, 784, - 810, 837, 863, 886, 908, 214, 215, 218, 223, 226, 231, 239, 244, - 253, 261, 271, 283, 292, 304, 317, 325, 596, 611, 626, 642, 661, - 681, 702, 723, 745, 770, 800, 828, 853, 875, 897, 919, 235, 238, - 240, 243, 245, 250, 257, 263, 270, 280, 287, 298, 307, 319, 329, - 340, 612, 624, 640, 658, 677, 697, 717, 739, 764, 789, 816, 844, - 867, 890, 909, 927, 254, 258, 259, 260, 266, 269, 272, 282, 286, - 296, 303, 312, 323, 333, 341, 355, 627, 641, 655, 674, 690, 713, - 735, 760, 781, 807, 835, 857, 880, 902, 921, 940, 273, 274, 277, - 281, 284, 285, 291, 299, 305, 310, 320, 327, 337, 346, 357, 369, - 643, 659, 675, 689, 710, 733, 754, 777, 803, 831, 851, 872, 892, - 913, 934, 950, 293, 294, 295, 297, 300, 306, 308, 314, 321, 326, - 335, 343, 352, 361, 372, 378, 662, 678, 691, 711, 731, 749, 774, - 798, 822, 848, 869, 887, 906, 925, 942, 961, 309, 311, 313, 316, - 318, 322, 
324, 332, 338, 344, 351, 358, 367, 375, 386, 394, 682, - 698, 714, 734, 750, 772, 795, 820, 842, 864, 884, 904, 923, 938, - 954, 967, 328, 330, 331, 334, 336, 339, 342, 348, 354, 359, 366, - 374, 382, 391, 400, 409, 703, 718, 736, 755, 775, 796, 818, 840, - 860, 882, 900, 917, 936, 952, 965, 977, 345, 347, 349, 350, 353, - 356, 360, 364, 371, 376, 383, 389, 395, 406, 412, 423, 724, 740, - 761, 778, 799, 821, 841, 859, 878, 895, 915, 932, 948, 963, 975, - 986, 362, 363, 365, 368, 370, 373, 377, 379, 387, 392, 397, 405, - 411, 420, 428, 439, 746, 765, 782, 804, 823, 843, 861, 879, 894, - 911, 930, 946, 959, 973, 984, 994, 380, 381, 384, 385, 388, 390, - 393, 396, 403, 408, 413, 422, 427, 436, 444, 452, 771, 790, 808, - 832, 849, 865, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001, - 398, 399, 401, 402, 404, 407, 410, 414, 419, 425, 429, 437, 442, - 449, 458, 465, 801, 817, 836, 852, 870, 885, 901, 916, 931, 945, - 956, 969, 980, 990, 999, 1007, 415, 416, 417, 418, 421, 424, 426, - 430, 434, 441, 445, 453, 459, 463, 473, 480, 829, 845, 858, 873, - 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 431, - 432, 433, 435, 438, 440, 443, 446, 451, 456, 461, 468, 475, 479, - 488, 494, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981, - 989, 996, 1003, 1010, 1016, 447, 448, 450, 454, 455, 457, 460, 462, - 469, 472, 477, 482, 490, 495, 499, 503, 876, 891, 903, 914, 926, - 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 464, 466, - 467, 470, 471, 474, 476, 478, 484, 489, 493, 497, 501, 504, 506, - 508, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006, - 1011, 1015, 1018, 1021, 481, 483, 485, 486, 487, 491, 492, 496, 498, - 500, 502, 505, 507, 509, 510, 511, 920, 929, 941, 951, 962, 968, - 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_h2_iscan_32x32[1024]) = { - 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121, - 142, 166, 189, 214, 233, 254, 273, 292, 309, 328, 345, 362, 
378, - 397, 415, 431, 447, 464, 481, 2, 3, 6, 11, 17, 26, 35, - 45, 58, 73, 90, 106, 123, 146, 168, 193, 215, 236, 255, 274, - 294, 310, 329, 346, 363, 381, 399, 416, 432, 448, 465, 482, 5, - 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150, - 170, 195, 216, 240, 259, 275, 295, 312, 331, 348, 365, 383, 400, - 417, 433, 449, 467, 485, 10, 12, 14, 19, 23, 31, 41, 52, - 65, 81, 96, 113, 133, 152, 175, 201, 221, 243, 260, 280, 297, - 315, 333, 350, 367, 385, 402, 418, 434, 452, 470, 486, 16, 18, - 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181, - 203, 226, 244, 264, 283, 300, 318, 335, 353, 370, 388, 404, 420, - 438, 455, 471, 487, 25, 27, 29, 32, 40, 46, 54, 67, 79, - 94, 109, 127, 143, 164, 185, 210, 231, 250, 269, 285, 304, 322, - 339, 356, 373, 389, 407, 423, 440, 457, 473, 491, 34, 36, 38, - 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219, - 239, 256, 272, 291, 308, 324, 341, 359, 377, 393, 410, 426, 442, - 460, 476, 492, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114, - 131, 147, 162, 183, 208, 227, 245, 262, 282, 298, 314, 332, 349, - 364, 379, 396, 412, 430, 446, 462, 478, 495, 57, 61, 63, 66, - 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 217, 234, 253, - 270, 286, 305, 321, 337, 354, 371, 387, 403, 419, 435, 451, 468, - 484, 498, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156, - 173, 190, 211, 229, 246, 261, 281, 296, 311, 325, 344, 360, 375, - 392, 408, 425, 441, 456, 472, 489, 500, 89, 91, 93, 97, 101, - 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 257, 271, 287, - 303, 320, 336, 351, 366, 384, 398, 413, 429, 445, 461, 477, 493, - 502, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205, - 222, 237, 251, 267, 284, 299, 313, 327, 343, 358, 374, 390, 405, - 422, 437, 453, 469, 483, 497, 505, 122, 126, 130, 134, 138, 144, - 155, 163, 180, 191, 207, 223, 232, 248, 265, 278, 293, 307, 323, - 338, 352, 368, 382, 395, 411, 427, 443, 459, 475, 490, 501, 507, - 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 238, 249, - 263, 276, 
289, 306, 319, 334, 347, 361, 376, 391, 406, 421, 436, - 450, 463, 479, 496, 504, 509, 167, 169, 172, 178, 182, 188, 198, - 209, 218, 230, 242, 252, 266, 277, 288, 301, 317, 330, 342, 357, - 372, 386, 401, 414, 428, 444, 458, 474, 488, 499, 506, 510, 192, - 194, 196, 202, 204, 213, 220, 228, 235, 247, 258, 268, 279, 290, - 302, 316, 326, 340, 355, 369, 380, 394, 409, 424, 439, 454, 466, - 480, 494, 503, 508, 511, 512, 513, 514, 515, 516, 517, 520, 523, - 526, 532, 537, 545, 551, 561, 573, 581, 596, 610, 625, 642, 661, - 680, 701, 722, 745, 770, 800, 827, 853, 875, 897, 919, 518, 519, - 521, 522, 524, 525, 528, 533, 536, 542, 549, 557, 564, 575, 585, - 597, 611, 623, 640, 656, 676, 696, 717, 739, 763, 789, 815, 844, - 867, 889, 909, 927, 527, 529, 530, 531, 534, 535, 538, 544, 548, - 555, 560, 569, 579, 589, 598, 614, 626, 641, 655, 673, 690, 712, - 735, 760, 780, 806, 834, 857, 880, 902, 921, 940, 539, 540, 541, - 543, 546, 547, 550, 558, 562, 567, 576, 583, 593, 603, 616, 631, - 643, 657, 674, 689, 710, 733, 752, 776, 803, 830, 850, 872, 892, - 913, 934, 950, 552, 553, 554, 556, 559, 563, 565, 571, 577, 582, - 591, 600, 609, 620, 634, 644, 662, 677, 691, 711, 730, 748, 773, - 798, 822, 847, 869, 887, 906, 925, 942, 961, 566, 568, 570, 572, - 574, 578, 580, 588, 594, 601, 608, 617, 629, 637, 652, 665, 681, - 697, 713, 734, 749, 772, 793, 819, 842, 863, 884, 904, 923, 938, - 954, 967, 584, 586, 587, 590, 592, 595, 599, 605, 613, 618, 628, - 636, 648, 660, 671, 686, 702, 718, 736, 753, 774, 794, 818, 840, - 860, 882, 900, 917, 936, 952, 965, 977, 602, 604, 606, 607, 612, - 615, 619, 624, 633, 638, 649, 658, 666, 683, 692, 707, 723, 740, - 761, 777, 799, 820, 841, 859, 877, 895, 915, 932, 948, 963, 975, - 986, 621, 622, 627, 630, 632, 635, 639, 645, 653, 663, 668, 682, - 688, 704, 716, 732, 746, 764, 781, 804, 823, 843, 861, 878, 894, - 911, 930, 946, 959, 973, 984, 994, 646, 647, 650, 651, 654, 659, - 664, 667, 678, 685, 693, 706, 715, 728, 743, 757, 771, 790, 807, - 
831, 848, 864, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001, - 669, 670, 672, 675, 679, 684, 687, 694, 703, 709, 719, 729, 741, - 754, 767, 783, 801, 816, 835, 851, 870, 885, 901, 916, 931, 945, - 956, 969, 980, 990, 999, 1007, 695, 698, 699, 700, 705, 708, 714, - 720, 726, 738, 744, 758, 768, 779, 795, 810, 828, 845, 858, 873, - 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 721, - 724, 725, 727, 731, 737, 742, 747, 756, 765, 775, 786, 797, 809, - 825, 837, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981, - 989, 996, 1003, 1010, 1016, 750, 751, 755, 759, 762, 766, 769, 778, - 787, 792, 805, 812, 829, 838, 852, 865, 876, 890, 903, 914, 926, - 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 782, 784, - 785, 788, 791, 796, 802, 808, 814, 826, 836, 846, 856, 866, 874, - 886, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006, - 1011, 1015, 1018, 1021, 811, 813, 817, 821, 824, 832, 833, 839, 849, - 855, 862, 871, 879, 891, 899, 908, 920, 929, 941, 951, 962, 968, - 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_qtr_iscan_32x32[1024]) = { - 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121, - 142, 166, 189, 256, 268, 286, 310, 334, 364, 400, 435, 471, 510, - 553, 598, 640, 683, 732, 780, 2, 3, 6, 11, 17, 26, 35, - 45, 58, 73, 90, 106, 123, 146, 168, 193, 258, 270, 288, 312, - 338, 366, 402, 437, 473, 516, 557, 600, 642, 687, 736, 782, 5, - 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150, - 170, 195, 260, 274, 292, 314, 340, 370, 406, 441, 478, 520, 559, - 604, 646, 689, 740, 788, 10, 12, 14, 19, 23, 31, 41, 52, - 65, 81, 96, 113, 133, 152, 175, 201, 262, 276, 294, 316, 344, - 376, 410, 445, 484, 524, 563, 606, 648, 697, 746, 793, 16, 18, - 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181, - 203, 264, 278, 300, 322, 350, 380, 414, 451, 490, 530, 571, 612, - 656, 705, 750, 799, 25, 27, 29, 32, 40, 46, 54, 67, 79, - 94, 109, 127, 143, 164, 185, 210, 266, 
282, 302, 326, 354, 388, - 422, 459, 496, 533, 579, 618, 665, 711, 754, 809, 34, 36, 38, - 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 216, - 272, 289, 308, 332, 362, 392, 427, 465, 504, 545, 585, 626, 671, - 717, 766, 813, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114, - 131, 147, 162, 183, 208, 222, 279, 298, 320, 346, 374, 408, 442, - 475, 511, 551, 592, 638, 681, 726, 772, 821, 57, 61, 63, 66, - 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 214, 227, 284, - 304, 328, 355, 386, 418, 455, 492, 528, 567, 608, 649, 695, 742, - 786, 833, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156, - 173, 190, 211, 224, 233, 296, 317, 342, 367, 394, 433, 466, 500, - 543, 581, 622, 667, 707, 752, 803, 843, 89, 91, 93, 97, 101, - 110, 118, 132, 141, 157, 171, 186, 206, 220, 231, 239, 306, 330, - 352, 384, 415, 447, 482, 521, 554, 593, 636, 677, 722, 770, 815, - 852, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205, - 218, 229, 237, 244, 323, 347, 371, 398, 431, 463, 498, 534, 573, - 616, 654, 698, 743, 783, 831, 864, 122, 126, 130, 134, 138, 144, - 155, 163, 180, 191, 207, 219, 226, 235, 242, 248, 335, 360, 390, - 419, 449, 485, 518, 549, 587, 630, 672, 715, 760, 805, 845, 872, - 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 221, 230, 236, - 241, 246, 251, 356, 382, 411, 438, 469, 501, 539, 577, 613, 652, - 690, 730, 776, 822, 858, 886, 167, 169, 172, 178, 182, 188, 198, - 209, 215, 225, 232, 238, 243, 247, 250, 253, 378, 403, 428, 461, - 494, 526, 560, 594, 632, 675, 713, 755, 801, 837, 868, 897, 192, - 194, 196, 202, 204, 213, 217, 223, 228, 234, 240, 245, 249, 252, - 254, 255, 395, 425, 457, 488, 512, 547, 583, 619, 659, 699, 737, - 778, 819, 854, 882, 907, 257, 259, 261, 263, 265, 267, 273, 280, - 285, 297, 307, 324, 336, 357, 379, 396, 424, 452, 479, 508, 541, - 574, 609, 643, 679, 719, 764, 806, 841, 870, 895, 919, 269, 271, - 275, 277, 281, 283, 290, 299, 305, 318, 331, 348, 361, 383, 404, - 426, 453, 476, 506, 535, 568, 601, 634, 669, 708, 748, 789, 829, 
- 860, 887, 909, 927, 287, 291, 293, 295, 301, 303, 309, 321, 329, - 343, 353, 372, 391, 412, 429, 458, 480, 507, 532, 564, 590, 627, - 663, 703, 733, 773, 816, 847, 876, 901, 921, 940, 311, 313, 315, - 319, 325, 327, 333, 349, 358, 368, 385, 399, 420, 439, 462, 489, - 509, 536, 565, 589, 624, 661, 691, 727, 768, 810, 838, 866, 890, - 913, 934, 950, 337, 339, 341, 345, 351, 359, 363, 375, 387, 397, - 416, 432, 450, 470, 495, 513, 542, 569, 591, 625, 657, 684, 723, - 762, 797, 834, 862, 884, 905, 925, 942, 961, 365, 369, 373, 377, - 381, 389, 393, 409, 421, 434, 448, 464, 486, 502, 527, 548, 575, - 602, 628, 662, 685, 721, 756, 794, 827, 855, 880, 903, 923, 938, - 954, 967, 401, 405, 407, 413, 417, 423, 430, 443, 456, 467, 483, - 499, 519, 540, 561, 584, 610, 635, 664, 692, 724, 757, 792, 825, - 850, 878, 899, 917, 936, 952, 965, 977, 436, 440, 444, 446, 454, - 460, 468, 477, 493, 503, 522, 537, 550, 578, 595, 620, 644, 670, - 704, 728, 763, 795, 826, 849, 873, 893, 915, 932, 948, 963, 975, - 986, 472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555, 576, - 588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 851, 874, 892, - 911, 930, 946, 959, 973, 984, 994, 515, 517, 523, 525, 531, 538, - 546, 552, 570, 582, 596, 617, 631, 653, 676, 700, 720, 749, 774, - 811, 835, 856, 879, 894, 912, 928, 944, 957, 971, 982, 992, 1001, - 556, 558, 562, 566, 572, 580, 586, 597, 611, 623, 637, 655, 673, - 693, 714, 738, 765, 790, 817, 839, 863, 881, 900, 916, 931, 945, - 956, 969, 980, 990, 999, 1007, 599, 603, 605, 607, 615, 621, 629, - 639, 650, 668, 678, 701, 716, 731, 758, 779, 807, 830, 848, 867, - 885, 904, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 641, - 645, 647, 651, 658, 666, 674, 682, 696, 710, 725, 744, 761, 777, - 802, 820, 842, 861, 877, 891, 906, 924, 937, 949, 960, 972, 981, - 989, 996, 1003, 1010, 1016, 686, 688, 694, 702, 706, 712, 718, 729, - 745, 753, 771, 784, 808, 823, 840, 857, 871, 888, 902, 914, 926, - 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 
1014, 1019, 735, 739, - 741, 747, 751, 759, 767, 775, 787, 804, 818, 832, 846, 859, 869, - 883, 896, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006, - 1011, 1015, 1018, 1021, 781, 785, 791, 796, 800, 812, 814, 824, 836, - 844, 853, 865, 875, 889, 898, 908, 920, 929, 941, 951, 962, 968, - 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023, -}; - -#if CONFIG_TX64X64 -DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x64[2048]) = { - 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, - 91, 105, 120, 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, - 351, 378, 406, 435, 465, 496, 2, 4, 7, 11, 16, 22, 29, - 37, 46, 56, 67, 79, 92, 106, 121, 137, 154, 172, 191, 211, - 232, 254, 277, 301, 326, 352, 379, 407, 436, 466, 497, 528, 5, - 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107, 122, - 138, 155, 173, 192, 212, 233, 255, 278, 302, 327, 353, 380, 408, - 437, 467, 498, 529, 560, 9, 13, 18, 24, 31, 39, 48, 58, - 69, 81, 94, 108, 123, 139, 156, 174, 193, 213, 234, 256, 279, - 303, 328, 354, 381, 409, 438, 468, 499, 530, 561, 592, 14, 19, - 25, 32, 40, 49, 59, 70, 82, 95, 109, 124, 140, 157, 175, - 194, 214, 235, 257, 280, 304, 329, 355, 382, 410, 439, 469, 500, - 531, 562, 593, 624, 20, 26, 33, 41, 50, 60, 71, 83, 96, - 110, 125, 141, 158, 176, 195, 215, 236, 258, 281, 305, 330, 356, - 383, 411, 440, 470, 501, 532, 563, 594, 625, 656, 27, 34, 42, - 51, 61, 72, 84, 97, 111, 126, 142, 159, 177, 196, 216, 237, - 259, 282, 306, 331, 357, 384, 412, 441, 471, 502, 533, 564, 595, - 626, 657, 688, 35, 43, 52, 62, 73, 85, 98, 112, 127, 143, - 160, 178, 197, 217, 238, 260, 283, 307, 332, 358, 385, 413, 442, - 472, 503, 534, 565, 596, 627, 658, 689, 720, 44, 53, 63, 74, - 86, 99, 113, 128, 144, 161, 179, 198, 218, 239, 261, 284, 308, - 333, 359, 386, 414, 443, 473, 504, 535, 566, 597, 628, 659, 690, - 721, 752, 54, 64, 75, 87, 100, 114, 129, 145, 162, 180, 199, - 219, 240, 262, 285, 309, 334, 360, 387, 415, 444, 474, 505, 536, - 567, 598, 629, 660, 691, 722, 753, 
784, 65, 76, 88, 101, 115, - 130, 146, 163, 181, 200, 220, 241, 263, 286, 310, 335, 361, 388, - 416, 445, 475, 506, 537, 568, 599, 630, 661, 692, 723, 754, 785, - 816, 77, 89, 102, 116, 131, 147, 164, 182, 201, 221, 242, 264, - 287, 311, 336, 362, 389, 417, 446, 476, 507, 538, 569, 600, 631, - 662, 693, 724, 755, 786, 817, 848, 90, 103, 117, 132, 148, 165, - 183, 202, 222, 243, 265, 288, 312, 337, 363, 390, 418, 447, 477, - 508, 539, 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, - 104, 118, 133, 149, 166, 184, 203, 223, 244, 266, 289, 313, 338, - 364, 391, 419, 448, 478, 509, 540, 571, 602, 633, 664, 695, 726, - 757, 788, 819, 850, 881, 912, 119, 134, 150, 167, 185, 204, 224, - 245, 267, 290, 314, 339, 365, 392, 420, 449, 479, 510, 541, 572, - 603, 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 135, - 151, 168, 186, 205, 225, 246, 268, 291, 315, 340, 366, 393, 421, - 450, 480, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, - 852, 883, 914, 945, 976, 152, 169, 187, 206, 226, 247, 269, 292, - 316, 341, 367, 394, 422, 451, 481, 512, 543, 574, 605, 636, 667, - 698, 729, 760, 791, 822, 853, 884, 915, 946, 977, 1008, 170, 188, - 207, 227, 248, 270, 293, 317, 342, 368, 395, 423, 452, 482, 513, - 544, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, - 947, 978, 1009, 1040, 189, 208, 228, 249, 271, 294, 318, 343, 369, - 396, 424, 453, 483, 514, 545, 576, 607, 638, 669, 700, 731, 762, - 793, 824, 855, 886, 917, 948, 979, 1010, 1041, 1072, 209, 229, 250, - 272, 295, 319, 344, 370, 397, 425, 454, 484, 515, 546, 577, 608, - 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, - 1042, 1073, 1104, 230, 251, 273, 296, 320, 345, 371, 398, 426, 455, - 485, 516, 547, 578, 609, 640, 671, 702, 733, 764, 795, 826, 857, - 888, 919, 950, 981, 1012, 1043, 1074, 1105, 1136, 252, 274, 297, 321, - 346, 372, 399, 427, 456, 486, 517, 548, 579, 610, 641, 672, 703, - 734, 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1044, 1075, 1106, - 1137, 1168, 
275, 298, 322, 347, 373, 400, 428, 457, 487, 518, 549, - 580, 611, 642, 673, 704, 735, 766, 797, 828, 859, 890, 921, 952, - 983, 1014, 1045, 1076, 1107, 1138, 1169, 1200, 299, 323, 348, 374, 401, - 429, 458, 488, 519, 550, 581, 612, 643, 674, 705, 736, 767, 798, - 829, 860, 891, 922, 953, 984, 1015, 1046, 1077, 1108, 1139, 1170, 1201, - 1232, 324, 349, 375, 402, 430, 459, 489, 520, 551, 582, 613, 644, - 675, 706, 737, 768, 799, 830, 861, 892, 923, 954, 985, 1016, 1047, - 1078, 1109, 1140, 1171, 1202, 1233, 1264, 350, 376, 403, 431, 460, 490, - 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, 831, 862, 893, - 924, 955, 986, 1017, 1048, 1079, 1110, 1141, 1172, 1203, 1234, 1265, 1296, - 377, 404, 432, 461, 491, 522, 553, 584, 615, 646, 677, 708, 739, - 770, 801, 832, 863, 894, 925, 956, 987, 1018, 1049, 1080, 1111, 1142, - 1173, 1204, 1235, 1266, 1297, 1328, 405, 433, 462, 492, 523, 554, 585, - 616, 647, 678, 709, 740, 771, 802, 833, 864, 895, 926, 957, 988, - 1019, 1050, 1081, 1112, 1143, 1174, 1205, 1236, 1267, 1298, 1329, 1360, 434, - 463, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, - 865, 896, 927, 958, 989, 1020, 1051, 1082, 1113, 1144, 1175, 1206, 1237, - 1268, 1299, 1330, 1361, 1392, 464, 494, 525, 556, 587, 618, 649, 680, - 711, 742, 773, 804, 835, 866, 897, 928, 959, 990, 1021, 1052, 1083, - 1114, 1145, 1176, 1207, 1238, 1269, 1300, 1331, 1362, 1393, 1424, 495, 526, - 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, - 960, 991, 1022, 1053, 1084, 1115, 1146, 1177, 1208, 1239, 1270, 1301, 1332, - 1363, 1394, 1425, 1456, 527, 558, 589, 620, 651, 682, 713, 744, 775, - 806, 837, 868, 899, 930, 961, 992, 1023, 1054, 1085, 1116, 1147, 1178, - 1209, 1240, 1271, 1302, 1333, 1364, 1395, 1426, 1457, 1488, 559, 590, 621, - 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 1024, - 1055, 1086, 1117, 1148, 1179, 1210, 1241, 1272, 1303, 1334, 1365, 1396, 1427, - 1458, 1489, 1520, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, - 
901, 932, 963, 994, 1025, 1056, 1087, 1118, 1149, 1180, 1211, 1242, 1273, - 1304, 1335, 1366, 1397, 1428, 1459, 1490, 1521, 1552, 623, 654, 685, 716, - 747, 778, 809, 840, 871, 902, 933, 964, 995, 1026, 1057, 1088, 1119, - 1150, 1181, 1212, 1243, 1274, 1305, 1336, 1367, 1398, 1429, 1460, 1491, 1522, - 1553, 1583, 655, 686, 717, 748, 779, 810, 841, 872, 903, 934, 965, - 996, 1027, 1058, 1089, 1120, 1151, 1182, 1213, 1244, 1275, 1306, 1337, 1368, - 1399, 1430, 1461, 1492, 1523, 1554, 1584, 1613, 687, 718, 749, 780, 811, - 842, 873, 904, 935, 966, 997, 1028, 1059, 1090, 1121, 1152, 1183, 1214, - 1245, 1276, 1307, 1338, 1369, 1400, 1431, 1462, 1493, 1524, 1555, 1585, 1614, - 1642, 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 1029, 1060, - 1091, 1122, 1153, 1184, 1215, 1246, 1277, 1308, 1339, 1370, 1401, 1432, 1463, - 1494, 1525, 1556, 1586, 1615, 1643, 1670, 751, 782, 813, 844, 875, 906, - 937, 968, 999, 1030, 1061, 1092, 1123, 1154, 1185, 1216, 1247, 1278, 1309, - 1340, 1371, 1402, 1433, 1464, 1495, 1526, 1557, 1587, 1616, 1644, 1671, 1697, - 783, 814, 845, 876, 907, 938, 969, 1000, 1031, 1062, 1093, 1124, 1155, - 1186, 1217, 1248, 1279, 1310, 1341, 1372, 1403, 1434, 1465, 1496, 1527, 1558, - 1588, 1617, 1645, 1672, 1698, 1723, 815, 846, 877, 908, 939, 970, 1001, - 1032, 1063, 1094, 1125, 1156, 1187, 1218, 1249, 1280, 1311, 1342, 1373, 1404, - 1435, 1466, 1497, 1528, 1559, 1589, 1618, 1646, 1673, 1699, 1724, 1748, 847, - 878, 909, 940, 971, 1002, 1033, 1064, 1095, 1126, 1157, 1188, 1219, 1250, - 1281, 1312, 1343, 1374, 1405, 1436, 1467, 1498, 1529, 1560, 1590, 1619, 1647, - 1674, 1700, 1725, 1749, 1772, 879, 910, 941, 972, 1003, 1034, 1065, 1096, - 1127, 1158, 1189, 1220, 1251, 1282, 1313, 1344, 1375, 1406, 1437, 1468, 1499, - 1530, 1561, 1591, 1620, 1648, 1675, 1701, 1726, 1750, 1773, 1795, 911, 942, - 973, 1004, 1035, 1066, 1097, 1128, 1159, 1190, 1221, 1252, 1283, 1314, 1345, - 1376, 1407, 1438, 1469, 1500, 1531, 1562, 1592, 1621, 1649, 1676, 1702, 1727, - 
1751, 1774, 1796, 1817, 943, 974, 1005, 1036, 1067, 1098, 1129, 1160, 1191, - 1222, 1253, 1284, 1315, 1346, 1377, 1408, 1439, 1470, 1501, 1532, 1563, 1593, - 1622, 1650, 1677, 1703, 1728, 1752, 1775, 1797, 1818, 1838, 975, 1006, 1037, - 1068, 1099, 1130, 1161, 1192, 1223, 1254, 1285, 1316, 1347, 1378, 1409, 1440, - 1471, 1502, 1533, 1564, 1594, 1623, 1651, 1678, 1704, 1729, 1753, 1776, 1798, - 1819, 1839, 1858, 1007, 1038, 1069, 1100, 1131, 1162, 1193, 1224, 1255, 1286, - 1317, 1348, 1379, 1410, 1441, 1472, 1503, 1534, 1565, 1595, 1624, 1652, 1679, - 1705, 1730, 1754, 1777, 1799, 1820, 1840, 1859, 1877, 1039, 1070, 1101, 1132, - 1163, 1194, 1225, 1256, 1287, 1318, 1349, 1380, 1411, 1442, 1473, 1504, 1535, - 1566, 1596, 1625, 1653, 1680, 1706, 1731, 1755, 1778, 1800, 1821, 1841, 1860, - 1878, 1895, 1071, 1102, 1133, 1164, 1195, 1226, 1257, 1288, 1319, 1350, 1381, - 1412, 1443, 1474, 1505, 1536, 1567, 1597, 1626, 1654, 1681, 1707, 1732, 1756, - 1779, 1801, 1822, 1842, 1861, 1879, 1896, 1912, 1103, 1134, 1165, 1196, 1227, - 1258, 1289, 1320, 1351, 1382, 1413, 1444, 1475, 1506, 1537, 1568, 1598, 1627, - 1655, 1682, 1708, 1733, 1757, 1780, 1802, 1823, 1843, 1862, 1880, 1897, 1913, - 1928, 1135, 1166, 1197, 1228, 1259, 1290, 1321, 1352, 1383, 1414, 1445, 1476, - 1507, 1538, 1569, 1599, 1628, 1656, 1683, 1709, 1734, 1758, 1781, 1803, 1824, - 1844, 1863, 1881, 1898, 1914, 1929, 1943, 1167, 1198, 1229, 1260, 1291, 1322, - 1353, 1384, 1415, 1446, 1477, 1508, 1539, 1570, 1600, 1629, 1657, 1684, 1710, - 1735, 1759, 1782, 1804, 1825, 1845, 1864, 1882, 1899, 1915, 1930, 1944, 1957, - 1199, 1230, 1261, 1292, 1323, 1354, 1385, 1416, 1447, 1478, 1509, 1540, 1571, - 1601, 1630, 1658, 1685, 1711, 1736, 1760, 1783, 1805, 1826, 1846, 1865, 1883, - 1900, 1916, 1931, 1945, 1958, 1970, 1231, 1262, 1293, 1324, 1355, 1386, 1417, - 1448, 1479, 1510, 1541, 1572, 1602, 1631, 1659, 1686, 1712, 1737, 1761, 1784, - 1806, 1827, 1847, 1866, 1884, 1901, 1917, 1932, 1946, 1959, 1971, 1982, 1263, - 
1294, 1325, 1356, 1387, 1418, 1449, 1480, 1511, 1542, 1573, 1603, 1632, 1660, - 1687, 1713, 1738, 1762, 1785, 1807, 1828, 1848, 1867, 1885, 1902, 1918, 1933, - 1947, 1960, 1972, 1983, 1993, 1295, 1326, 1357, 1388, 1419, 1450, 1481, 1512, - 1543, 1574, 1604, 1633, 1661, 1688, 1714, 1739, 1763, 1786, 1808, 1829, 1849, - 1868, 1886, 1903, 1919, 1934, 1948, 1961, 1973, 1984, 1994, 2003, 1327, 1358, - 1389, 1420, 1451, 1482, 1513, 1544, 1575, 1605, 1634, 1662, 1689, 1715, 1740, - 1764, 1787, 1809, 1830, 1850, 1869, 1887, 1904, 1920, 1935, 1949, 1962, 1974, - 1985, 1995, 2004, 2012, 1359, 1390, 1421, 1452, 1483, 1514, 1545, 1576, 1606, - 1635, 1663, 1690, 1716, 1741, 1765, 1788, 1810, 1831, 1851, 1870, 1888, 1905, - 1921, 1936, 1950, 1963, 1975, 1986, 1996, 2005, 2013, 2020, 1391, 1422, 1453, - 1484, 1515, 1546, 1577, 1607, 1636, 1664, 1691, 1717, 1742, 1766, 1789, 1811, - 1832, 1852, 1871, 1889, 1906, 1922, 1937, 1951, 1964, 1976, 1987, 1997, 2006, - 2014, 2021, 2027, 1423, 1454, 1485, 1516, 1547, 1578, 1608, 1637, 1665, 1692, - 1718, 1743, 1767, 1790, 1812, 1833, 1853, 1872, 1890, 1907, 1923, 1938, 1952, - 1965, 1977, 1988, 1998, 2007, 2015, 2022, 2028, 2033, 1455, 1486, 1517, 1548, - 1579, 1609, 1638, 1666, 1693, 1719, 1744, 1768, 1791, 1813, 1834, 1854, 1873, - 1891, 1908, 1924, 1939, 1953, 1966, 1978, 1989, 1999, 2008, 2016, 2023, 2029, - 2034, 2038, 1487, 1518, 1549, 1580, 1610, 1639, 1667, 1694, 1720, 1745, 1769, - 1792, 1814, 1835, 1855, 1874, 1892, 1909, 1925, 1940, 1954, 1967, 1979, 1990, - 2000, 2009, 2017, 2024, 2030, 2035, 2039, 2042, 1519, 1550, 1581, 1611, 1640, - 1668, 1695, 1721, 1746, 1770, 1793, 1815, 1836, 1856, 1875, 1893, 1910, 1926, - 1941, 1955, 1968, 1980, 1991, 2001, 2010, 2018, 2025, 2031, 2036, 2040, 2043, - 2045, 1551, 1582, 1612, 1641, 1669, 1696, 1722, 1747, 1771, 1794, 1816, 1837, - 1857, 1876, 1894, 1911, 1927, 1942, 1956, 1969, 1981, 1992, 2002, 2011, 2019, - 2026, 2032, 2037, 2041, 2044, 2046, 2047, -}; - -DECLARE_ALIGNED(16, static 
const int16_t, av1_default_iscan_64x32[2048]) = { - 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, - 91, 105, 120, 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, - 351, 378, 406, 435, 465, 496, 528, 560, 592, 624, 656, 688, 720, - 752, 784, 816, 848, 880, 912, 944, 976, 1008, 1040, 1072, 1104, 1136, - 1168, 1200, 1232, 1264, 1296, 1328, 1360, 1392, 1424, 1456, 1488, 1520, 2, - 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, - 121, 137, 154, 172, 191, 211, 232, 254, 277, 301, 326, 352, 379, - 407, 436, 466, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785, - 817, 849, 881, 913, 945, 977, 1009, 1041, 1073, 1105, 1137, 1169, 1201, - 1233, 1265, 1297, 1329, 1361, 1393, 1425, 1457, 1489, 1521, 1552, 5, 8, - 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107, 122, 138, - 155, 173, 192, 212, 233, 255, 278, 302, 327, 353, 380, 408, 437, - 467, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, - 882, 914, 946, 978, 1010, 1042, 1074, 1106, 1138, 1170, 1202, 1234, 1266, - 1298, 1330, 1362, 1394, 1426, 1458, 1490, 1522, 1553, 1583, 9, 13, 18, - 24, 31, 39, 48, 58, 69, 81, 94, 108, 123, 139, 156, 174, - 193, 213, 234, 256, 279, 303, 328, 354, 381, 409, 438, 468, 499, - 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 883, 915, - 947, 979, 1011, 1043, 1075, 1107, 1139, 1171, 1203, 1235, 1267, 1299, 1331, - 1363, 1395, 1427, 1459, 1491, 1523, 1554, 1584, 1613, 14, 19, 25, 32, - 40, 49, 59, 70, 82, 95, 109, 124, 140, 157, 175, 194, 214, - 235, 257, 280, 304, 329, 355, 382, 410, 439, 469, 500, 532, 564, - 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, - 1012, 1044, 1076, 1108, 1140, 1172, 1204, 1236, 1268, 1300, 1332, 1364, 1396, - 1428, 1460, 1492, 1524, 1555, 1585, 1614, 1642, 20, 26, 33, 41, 50, - 60, 71, 83, 96, 110, 125, 141, 158, 176, 195, 215, 236, 258, - 281, 305, 330, 356, 383, 411, 440, 470, 501, 533, 565, 597, 629, - 661, 693, 725, 757, 789, 821, 853, 885, 917, 949, 981, 1013, 1045, - 1077, 1109, 1141, 1173, 1205, 1237, 1269, 1301, 1333, 
1365, 1397, 1429, 1461, - 1493, 1525, 1556, 1586, 1615, 1643, 1670, 27, 34, 42, 51, 61, 72, - 84, 97, 111, 126, 142, 159, 177, 196, 216, 237, 259, 282, 306, - 331, 357, 384, 412, 441, 471, 502, 534, 566, 598, 630, 662, 694, - 726, 758, 790, 822, 854, 886, 918, 950, 982, 1014, 1046, 1078, 1110, - 1142, 1174, 1206, 1238, 1270, 1302, 1334, 1366, 1398, 1430, 1462, 1494, 1526, - 1557, 1587, 1616, 1644, 1671, 1697, 35, 43, 52, 62, 73, 85, 98, - 112, 127, 143, 160, 178, 197, 217, 238, 260, 283, 307, 332, 358, - 385, 413, 442, 472, 503, 535, 567, 599, 631, 663, 695, 727, 759, - 791, 823, 855, 887, 919, 951, 983, 1015, 1047, 1079, 1111, 1143, 1175, - 1207, 1239, 1271, 1303, 1335, 1367, 1399, 1431, 1463, 1495, 1527, 1558, 1588, - 1617, 1645, 1672, 1698, 1723, 44, 53, 63, 74, 86, 99, 113, 128, - 144, 161, 179, 198, 218, 239, 261, 284, 308, 333, 359, 386, 414, - 443, 473, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, - 856, 888, 920, 952, 984, 1016, 1048, 1080, 1112, 1144, 1176, 1208, 1240, - 1272, 1304, 1336, 1368, 1400, 1432, 1464, 1496, 1528, 1559, 1589, 1618, 1646, - 1673, 1699, 1724, 1748, 54, 64, 75, 87, 100, 114, 129, 145, 162, - 180, 199, 219, 240, 262, 285, 309, 334, 360, 387, 415, 444, 474, - 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, - 921, 953, 985, 1017, 1049, 1081, 1113, 1145, 1177, 1209, 1241, 1273, 1305, - 1337, 1369, 1401, 1433, 1465, 1497, 1529, 1560, 1590, 1619, 1647, 1674, 1700, - 1725, 1749, 1772, 65, 76, 88, 101, 115, 130, 146, 163, 181, 200, - 220, 241, 263, 286, 310, 335, 361, 388, 416, 445, 475, 506, 538, - 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922, 954, - 986, 1018, 1050, 1082, 1114, 1146, 1178, 1210, 1242, 1274, 1306, 1338, 1370, - 1402, 1434, 1466, 1498, 1530, 1561, 1591, 1620, 1648, 1675, 1701, 1726, 1750, - 1773, 1795, 77, 89, 102, 116, 131, 147, 164, 182, 201, 221, 242, - 264, 287, 311, 336, 362, 389, 417, 446, 476, 507, 539, 571, 603, - 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 
1019, - 1051, 1083, 1115, 1147, 1179, 1211, 1243, 1275, 1307, 1339, 1371, 1403, 1435, - 1467, 1499, 1531, 1562, 1592, 1621, 1649, 1676, 1702, 1727, 1751, 1774, 1796, - 1817, 90, 103, 117, 132, 148, 165, 183, 202, 222, 243, 265, 288, - 312, 337, 363, 390, 418, 447, 477, 508, 540, 572, 604, 636, 668, - 700, 732, 764, 796, 828, 860, 892, 924, 956, 988, 1020, 1052, 1084, - 1116, 1148, 1180, 1212, 1244, 1276, 1308, 1340, 1372, 1404, 1436, 1468, 1500, - 1532, 1563, 1593, 1622, 1650, 1677, 1703, 1728, 1752, 1775, 1797, 1818, 1838, - 104, 118, 133, 149, 166, 184, 203, 223, 244, 266, 289, 313, 338, - 364, 391, 419, 448, 478, 509, 541, 573, 605, 637, 669, 701, 733, - 765, 797, 829, 861, 893, 925, 957, 989, 1021, 1053, 1085, 1117, 1149, - 1181, 1213, 1245, 1277, 1309, 1341, 1373, 1405, 1437, 1469, 1501, 1533, 1564, - 1594, 1623, 1651, 1678, 1704, 1729, 1753, 1776, 1798, 1819, 1839, 1858, 119, - 134, 150, 167, 185, 204, 224, 245, 267, 290, 314, 339, 365, 392, - 420, 449, 479, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798, - 830, 862, 894, 926, 958, 990, 1022, 1054, 1086, 1118, 1150, 1182, 1214, - 1246, 1278, 1310, 1342, 1374, 1406, 1438, 1470, 1502, 1534, 1565, 1595, 1624, - 1652, 1679, 1705, 1730, 1754, 1777, 1799, 1820, 1840, 1859, 1877, 135, 151, - 168, 186, 205, 225, 246, 268, 291, 315, 340, 366, 393, 421, 450, - 480, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, - 895, 927, 959, 991, 1023, 1055, 1087, 1119, 1151, 1183, 1215, 1247, 1279, - 1311, 1343, 1375, 1407, 1439, 1471, 1503, 1535, 1566, 1596, 1625, 1653, 1680, - 1706, 1731, 1755, 1778, 1800, 1821, 1841, 1860, 1878, 1895, 152, 169, 187, - 206, 226, 247, 269, 292, 316, 341, 367, 394, 422, 451, 481, 512, - 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, - 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, - 1376, 1408, 1440, 1472, 1504, 1536, 1567, 1597, 1626, 1654, 1681, 1707, 1732, - 1756, 1779, 1801, 1822, 1842, 1861, 1879, 1896, 1912, 170, 188, 207, 227, - 
248, 270, 293, 317, 342, 368, 395, 423, 452, 482, 513, 545, 577, - 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961, 993, - 1025, 1057, 1089, 1121, 1153, 1185, 1217, 1249, 1281, 1313, 1345, 1377, 1409, - 1441, 1473, 1505, 1537, 1568, 1598, 1627, 1655, 1682, 1708, 1733, 1757, 1780, - 1802, 1823, 1843, 1862, 1880, 1897, 1913, 1928, 189, 208, 228, 249, 271, - 294, 318, 343, 369, 396, 424, 453, 483, 514, 546, 578, 610, 642, - 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 1026, 1058, - 1090, 1122, 1154, 1186, 1218, 1250, 1282, 1314, 1346, 1378, 1410, 1442, 1474, - 1506, 1538, 1569, 1599, 1628, 1656, 1683, 1709, 1734, 1758, 1781, 1803, 1824, - 1844, 1863, 1881, 1898, 1914, 1929, 1943, 209, 229, 250, 272, 295, 319, - 344, 370, 397, 425, 454, 484, 515, 547, 579, 611, 643, 675, 707, - 739, 771, 803, 835, 867, 899, 931, 963, 995, 1027, 1059, 1091, 1123, - 1155, 1187, 1219, 1251, 1283, 1315, 1347, 1379, 1411, 1443, 1475, 1507, 1539, - 1570, 1600, 1629, 1657, 1684, 1710, 1735, 1759, 1782, 1804, 1825, 1845, 1864, - 1882, 1899, 1915, 1930, 1944, 1957, 230, 251, 273, 296, 320, 345, 371, - 398, 426, 455, 485, 516, 548, 580, 612, 644, 676, 708, 740, 772, - 804, 836, 868, 900, 932, 964, 996, 1028, 1060, 1092, 1124, 1156, 1188, - 1220, 1252, 1284, 1316, 1348, 1380, 1412, 1444, 1476, 1508, 1540, 1571, 1601, - 1630, 1658, 1685, 1711, 1736, 1760, 1783, 1805, 1826, 1846, 1865, 1883, 1900, - 1916, 1931, 1945, 1958, 1970, 252, 274, 297, 321, 346, 372, 399, 427, - 456, 486, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, - 869, 901, 933, 965, 997, 1029, 1061, 1093, 1125, 1157, 1189, 1221, 1253, - 1285, 1317, 1349, 1381, 1413, 1445, 1477, 1509, 1541, 1572, 1602, 1631, 1659, - 1686, 1712, 1737, 1761, 1784, 1806, 1827, 1847, 1866, 1884, 1901, 1917, 1932, - 1946, 1959, 1971, 1982, 275, 298, 322, 347, 373, 400, 428, 457, 487, - 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, - 934, 966, 998, 1030, 1062, 1094, 1126, 1158, 1190, 1222, 1254, 1286, 1318, - 
1350, 1382, 1414, 1446, 1478, 1510, 1542, 1573, 1603, 1632, 1660, 1687, 1713, - 1738, 1762, 1785, 1807, 1828, 1848, 1867, 1885, 1902, 1918, 1933, 1947, 1960, - 1972, 1983, 1993, 299, 323, 348, 374, 401, 429, 458, 488, 519, 551, - 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967, - 999, 1031, 1063, 1095, 1127, 1159, 1191, 1223, 1255, 1287, 1319, 1351, 1383, - 1415, 1447, 1479, 1511, 1543, 1574, 1604, 1633, 1661, 1688, 1714, 1739, 1763, - 1786, 1808, 1829, 1849, 1868, 1886, 1903, 1919, 1934, 1948, 1961, 1973, 1984, - 1994, 2003, 324, 349, 375, 402, 430, 459, 489, 520, 552, 584, 616, - 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968, 1000, 1032, - 1064, 1096, 1128, 1160, 1192, 1224, 1256, 1288, 1320, 1352, 1384, 1416, 1448, - 1480, 1512, 1544, 1575, 1605, 1634, 1662, 1689, 1715, 1740, 1764, 1787, 1809, - 1830, 1850, 1869, 1887, 1904, 1920, 1935, 1949, 1962, 1974, 1985, 1995, 2004, - 2012, 350, 376, 403, 431, 460, 490, 521, 553, 585, 617, 649, 681, - 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 1033, 1065, 1097, - 1129, 1161, 1193, 1225, 1257, 1289, 1321, 1353, 1385, 1417, 1449, 1481, 1513, - 1545, 1576, 1606, 1635, 1663, 1690, 1716, 1741, 1765, 1788, 1810, 1831, 1851, - 1870, 1888, 1905, 1921, 1936, 1950, 1963, 1975, 1986, 1996, 2005, 2013, 2020, - 377, 404, 432, 461, 491, 522, 554, 586, 618, 650, 682, 714, 746, - 778, 810, 842, 874, 906, 938, 970, 1002, 1034, 1066, 1098, 1130, 1162, - 1194, 1226, 1258, 1290, 1322, 1354, 1386, 1418, 1450, 1482, 1514, 1546, 1577, - 1607, 1636, 1664, 1691, 1717, 1742, 1766, 1789, 1811, 1832, 1852, 1871, 1889, - 1906, 1922, 1937, 1951, 1964, 1976, 1987, 1997, 2006, 2014, 2021, 2027, 405, - 433, 462, 492, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, - 843, 875, 907, 939, 971, 1003, 1035, 1067, 1099, 1131, 1163, 1195, 1227, - 1259, 1291, 1323, 1355, 1387, 1419, 1451, 1483, 1515, 1547, 1578, 1608, 1637, - 1665, 1692, 1718, 1743, 1767, 1790, 1812, 1833, 1853, 1872, 1890, 1907, 1923, - 1938, 1952, 1965, 1977, 
1988, 1998, 2007, 2015, 2022, 2028, 2033, 434, 463, - 493, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, - 908, 940, 972, 1004, 1036, 1068, 1100, 1132, 1164, 1196, 1228, 1260, 1292, - 1324, 1356, 1388, 1420, 1452, 1484, 1516, 1548, 1579, 1609, 1638, 1666, 1693, - 1719, 1744, 1768, 1791, 1813, 1834, 1854, 1873, 1891, 1908, 1924, 1939, 1953, - 1966, 1978, 1989, 1999, 2008, 2016, 2023, 2029, 2034, 2038, 464, 494, 525, - 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, - 973, 1005, 1037, 1069, 1101, 1133, 1165, 1197, 1229, 1261, 1293, 1325, 1357, - 1389, 1421, 1453, 1485, 1517, 1549, 1580, 1610, 1639, 1667, 1694, 1720, 1745, - 1769, 1792, 1814, 1835, 1855, 1874, 1892, 1909, 1925, 1940, 1954, 1967, 1979, - 1990, 2000, 2009, 2017, 2024, 2030, 2035, 2039, 2042, 495, 526, 558, 590, - 622, 654, 686, 718, 750, 782, 814, 846, 878, 910, 942, 974, 1006, - 1038, 1070, 1102, 1134, 1166, 1198, 1230, 1262, 1294, 1326, 1358, 1390, 1422, - 1454, 1486, 1518, 1550, 1581, 1611, 1640, 1668, 1695, 1721, 1746, 1770, 1793, - 1815, 1836, 1856, 1875, 1893, 1910, 1926, 1941, 1955, 1968, 1980, 1991, 2001, - 2010, 2018, 2025, 2031, 2036, 2040, 2043, 2045, 527, 559, 591, 623, 655, - 687, 719, 751, 783, 815, 847, 879, 911, 943, 975, 1007, 1039, 1071, - 1103, 1135, 1167, 1199, 1231, 1263, 1295, 1327, 1359, 1391, 1423, 1455, 1487, - 1519, 1551, 1582, 1612, 1641, 1669, 1696, 1722, 1747, 1771, 1794, 1816, 1837, - 1857, 1876, 1894, 1911, 1927, 1942, 1956, 1969, 1981, 1992, 2002, 2011, 2019, - 2026, 2032, 2037, 2041, 2044, 2046, 2047, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_64x64[4096]) = { - 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121, - 142, 166, 189, 214, 239, 269, 300, 331, 363, 400, 435, 471, 510, - 553, 598, 640, 683, 732, 780, 833, 884, 937, 995, 1048, 1107, 1165, - 1230, 1293, 1353, 1422, 1489, 1562, 1632, 1701, 1776, 1850, 1929, 2006, 2091, - 2173, 2252, 2339, 2421, 2516, 2603, 2694, 2786, 2879, 2978, 3076, 3175, 2, - 3, 6, 11, 17, 
26, 35, 45, 58, 73, 90, 106, 123, 146, - 168, 193, 216, 243, 271, 302, 335, 365, 402, 437, 473, 516, 557, - 600, 642, 687, 736, 782, 835, 886, 941, 999, 1050, 1111, 1167, 1234, - 1297, 1357, 1424, 1491, 1564, 1636, 1703, 1778, 1852, 1931, 2012, 2095, 2177, - 2256, 2341, 2425, 2518, 2605, 2698, 2788, 2883, 2982, 3078, 3177, 5, 7, - 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150, 170, - 195, 218, 249, 277, 304, 337, 369, 406, 441, 478, 520, 559, 604, - 646, 689, 740, 788, 841, 890, 945, 1001, 1052, 1115, 1173, 1236, 1301, - 1362, 1428, 1497, 1568, 1638, 1707, 1786, 1858, 1935, 2016, 2097, 2181, 2260, - 2343, 2431, 2520, 2613, 2702, 2790, 2889, 2984, 3082, 3181, 10, 12, 14, - 19, 23, 31, 41, 52, 65, 81, 96, 113, 133, 152, 175, 201, - 224, 253, 279, 310, 341, 375, 410, 445, 484, 524, 563, 606, 648, - 697, 746, 793, 843, 896, 949, 1005, 1060, 1119, 1181, 1242, 1303, 1366, - 1436, 1503, 1572, 1640, 1713, 1790, 1865, 1943, 2018, 2103, 2183, 2266, 2347, - 2437, 2526, 2617, 2708, 2800, 2893, 2992, 3086, 3189, 16, 18, 21, 24, - 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181, 203, 230, - 255, 286, 316, 347, 380, 414, 451, 490, 530, 571, 612, 656, 705, - 750, 799, 849, 898, 959, 1009, 1066, 1127, 1184, 1246, 1307, 1376, 1440, - 1509, 1578, 1644, 1723, 1794, 1871, 1947, 2024, 2109, 2185, 2270, 2361, 2443, - 2536, 2619, 2710, 2806, 2899, 2998, 3090, 3193, 25, 27, 29, 32, 40, - 46, 54, 67, 79, 94, 109, 127, 143, 164, 185, 210, 236, 263, - 292, 320, 353, 388, 422, 459, 496, 533, 579, 618, 665, 711, 754, - 809, 857, 910, 961, 1015, 1074, 1131, 1194, 1254, 1315, 1384, 1448, 1517, - 1584, 1655, 1731, 1802, 1875, 1959, 2034, 2115, 2197, 2280, 2367, 2452, 2538, - 2625, 2722, 2816, 2907, 3004, 3100, 3203, 34, 36, 38, 42, 49, 55, - 64, 76, 87, 102, 117, 135, 154, 176, 197, 222, 247, 272, 298, - 329, 361, 392, 427, 465, 504, 545, 585, 626, 671, 717, 766, 813, - 862, 916, 971, 1028, 1084, 1139, 1200, 1264, 1325, 1390, 1452, 1523, 1594, - 1667, 1737, 1806, 1887, 1963, 2046, 2123, 2202, 
2290, 2371, 2462, 2548, 2641, - 2732, 2822, 2917, 3010, 3111, 3211, 44, 47, 51, 53, 60, 68, 77, - 85, 98, 114, 131, 147, 162, 183, 208, 232, 256, 283, 314, 343, - 373, 408, 442, 475, 511, 551, 592, 638, 681, 726, 772, 821, 874, - 926, 979, 1034, 1088, 1153, 1214, 1271, 1335, 1396, 1469, 1533, 1600, 1673, - 1745, 1824, 1897, 1973, 2054, 2131, 2216, 2300, 2383, 2468, 2558, 2649, 2740, - 2829, 2923, 3022, 3123, 3221, 57, 61, 63, 66, 70, 80, 88, 99, - 112, 124, 140, 159, 179, 199, 219, 240, 267, 294, 322, 354, 386, - 418, 455, 492, 528, 567, 608, 649, 695, 742, 786, 836, 882, 933, - 989, 1046, 1101, 1161, 1216, 1279, 1343, 1410, 1479, 1543, 1614, 1687, 1758, - 1832, 1905, 1980, 2066, 2141, 2226, 2306, 2395, 2484, 2566, 2659, 2750, 2845, - 2939, 3032, 3133, 3225, 72, 74, 78, 82, 84, 95, 103, 115, 125, - 139, 156, 173, 190, 211, 234, 259, 281, 311, 339, 366, 394, 433, - 466, 500, 543, 581, 622, 667, 707, 752, 803, 853, 899, 955, 1007, - 1064, 1117, 1175, 1237, 1299, 1354, 1420, 1485, 1556, 1624, 1697, 1770, 1842, - 1919, 1998, 2074, 2155, 2234, 2319, 2409, 2492, 2581, 2671, 2760, 2859, 2949, - 3046, 3145, 3245, 89, 91, 93, 97, 101, 110, 118, 132, 141, 157, - 171, 186, 206, 228, 251, 273, 296, 324, 351, 384, 415, 447, 482, - 521, 554, 593, 636, 677, 722, 770, 815, 866, 914, 967, 1022, 1078, - 1135, 1195, 1252, 1313, 1378, 1444, 1507, 1576, 1642, 1714, 1788, 1860, 1933, - 2013, 2085, 2169, 2250, 2337, 2417, 2502, 2597, 2683, 2778, 2869, 2960, 3060, - 3157, 3256, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, - 205, 225, 244, 265, 290, 317, 344, 370, 398, 431, 463, 498, 534, - 573, 616, 654, 698, 743, 783, 831, 880, 928, 983, 1036, 1092, 1149, - 1208, 1266, 1333, 1394, 1457, 1524, 1590, 1665, 1733, 1804, 1879, 1953, 2030, - 2111, 2189, 2271, 2357, 2441, 2534, 2615, 2704, 2791, 2887, 2979, 3072, 3167, - 3270, 122, 126, 130, 134, 138, 144, 155, 163, 180, 191, 207, 226, - 238, 261, 287, 308, 332, 359, 390, 419, 449, 485, 518, 549, 587, - 630, 672, 715, 760, 805, 855, 
900, 953, 1003, 1053, 1108, 1163, 1220, - 1287, 1345, 1408, 1473, 1541, 1608, 1677, 1749, 1826, 1898, 1971, 2048, 2127, - 2208, 2294, 2373, 2458, 2542, 2631, 2726, 2818, 2908, 3002, 3094, 3199, 3286, - 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 229, 245, 262, - 284, 305, 327, 355, 382, 411, 438, 469, 501, 539, 577, 613, 652, - 690, 730, 776, 822, 872, 922, 973, 1024, 1079, 1132, 1188, 1250, 1305, - 1367, 1432, 1492, 1560, 1626, 1693, 1766, 1838, 1911, 1992, 2068, 2149, 2228, - 2307, 2393, 2478, 2564, 2655, 2742, 2833, 2927, 3020, 3119, 3219, 3298, 167, - 169, 172, 178, 182, 188, 198, 209, 220, 235, 252, 266, 288, 306, - 326, 349, 378, 403, 428, 461, 494, 526, 560, 594, 632, 675, 713, - 755, 801, 845, 892, 942, 990, 1042, 1096, 1155, 1212, 1267, 1329, 1391, - 1450, 1519, 1582, 1650, 1724, 1792, 1862, 1936, 2007, 2083, 2167, 2246, 2329, - 2413, 2496, 2585, 2675, 2761, 2855, 2947, 3040, 3135, 3233, 3320, 192, 194, - 196, 202, 204, 213, 223, 233, 241, 260, 274, 291, 309, 328, 350, - 376, 395, 425, 457, 488, 512, 547, 583, 619, 659, 699, 737, 778, - 819, 868, 917, 965, 1013, 1072, 1123, 1176, 1231, 1289, 1351, 1414, 1474, - 1539, 1604, 1674, 1741, 1816, 1891, 1961, 2040, 2116, 2191, 2276, 2353, 2438, - 2524, 2606, 2689, 2784, 2871, 2968, 3062, 3161, 3257, 3334, 215, 217, 221, - 227, 231, 237, 248, 257, 268, 282, 297, 318, 333, 356, 379, 396, - 424, 452, 479, 508, 541, 574, 609, 643, 679, 719, 764, 806, 850, - 894, 938, 987, 1038, 1089, 1145, 1204, 1258, 1316, 1379, 1438, 1501, 1565, - 1628, 1694, 1764, 1836, 1907, 1981, 2060, 2137, 2220, 2298, 2377, 2464, 2549, - 2635, 2724, 2812, 2903, 2999, 3088, 3185, 3278, 3350, 242, 246, 250, 254, - 258, 264, 275, 285, 295, 312, 325, 345, 360, 383, 404, 426, 453, - 476, 506, 535, 568, 601, 634, 669, 708, 748, 789, 829, 875, 923, - 968, 1016, 1068, 1120, 1168, 1224, 1280, 1341, 1402, 1465, 1531, 1591, 1661, - 1729, 1795, 1867, 1937, 2004, 2079, 2159, 2242, 2320, 2405, 2488, 2573, 2661, - 2744, 2839, 2933, 3023, 3117, 3215, 
3296, 3373, 270, 276, 278, 280, 289, - 293, 299, 315, 323, 340, 352, 371, 391, 412, 429, 458, 480, 507, - 532, 564, 590, 627, 663, 703, 733, 773, 816, 859, 906, 950, 993, - 1043, 1094, 1147, 1201, 1256, 1311, 1372, 1429, 1486, 1550, 1618, 1685, 1751, - 1827, 1895, 1965, 2042, 2119, 2192, 2268, 2348, 2429, 2512, 2599, 2684, 2772, - 2863, 2951, 3048, 3143, 3239, 3324, 3393, 301, 303, 307, 313, 319, 321, - 330, 346, 357, 367, 385, 399, 420, 439, 462, 489, 509, 536, 565, - 589, 624, 661, 691, 727, 768, 810, 846, 887, 929, 977, 1029, 1076, - 1128, 1177, 1226, 1283, 1339, 1397, 1461, 1521, 1585, 1648, 1715, 1779, 1848, - 1923, 1996, 2069, 2142, 2224, 2302, 2381, 2465, 2544, 2627, 2720, 2807, 2895, - 2985, 3073, 3163, 3264, 3338, 3413, 334, 336, 338, 342, 348, 358, 362, - 374, 387, 397, 416, 432, 450, 470, 495, 513, 542, 569, 591, 625, - 657, 684, 723, 762, 797, 837, 878, 920, 963, 1010, 1054, 1105, 1157, - 1206, 1262, 1317, 1374, 1433, 1483, 1545, 1615, 1681, 1743, 1812, 1885, 1954, - 2025, 2101, 2174, 2248, 2330, 2411, 2490, 2579, 2663, 2745, 2835, 2924, 3018, - 3115, 3205, 3290, 3363, 3431, 364, 368, 372, 377, 381, 389, 393, 409, - 421, 434, 448, 464, 486, 502, 527, 548, 575, 602, 628, 662, 685, - 721, 756, 794, 827, 869, 912, 956, 996, 1040, 1086, 1137, 1189, 1243, - 1291, 1349, 1404, 1466, 1525, 1588, 1645, 1711, 1774, 1843, 1909, 1988, 2058, - 2132, 2209, 2288, 2368, 2445, 2527, 2607, 2687, 2780, 2865, 2953, 3049, 3139, - 3237, 3318, 3387, 3451, 401, 405, 407, 413, 417, 423, 430, 443, 456, - 467, 483, 499, 519, 540, 561, 584, 610, 635, 664, 692, 724, 757, - 792, 825, 863, 908, 946, 985, 1032, 1080, 1125, 1169, 1217, 1275, 1330, - 1386, 1441, 1498, 1554, 1619, 1683, 1746, 1810, 1883, 1949, 2019, 2086, 2165, - 2238, 2314, 2399, 2479, 2562, 2645, 2733, 2820, 2904, 2996, 3083, 3168, 3268, - 3339, 3407, 3474, 436, 440, 444, 446, 454, 460, 468, 477, 493, 503, - 522, 537, 550, 578, 595, 620, 644, 670, 704, 728, 763, 795, 826, - 861, 901, 935, 980, 1025, 1069, 1112, 1159, 
1209, 1260, 1309, 1363, 1418, - 1475, 1534, 1598, 1656, 1721, 1780, 1846, 1912, 1982, 2056, 2129, 2199, 2278, - 2358, 2432, 2508, 2593, 2677, 2762, 2851, 2941, 3030, 3124, 3216, 3294, 3365, - 3433, 3488, 472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555, - 576, 588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 864, 902, - 932, 975, 1020, 1061, 1102, 1150, 1198, 1247, 1294, 1346, 1400, 1455, 1513, - 1573, 1629, 1689, 1755, 1820, 1888, 1955, 2022, 2092, 2163, 2235, 2312, 2389, - 2472, 2554, 2632, 2716, 2804, 2884, 2974, 3063, 3153, 3250, 3326, 3395, 3454, - 3512, 515, 517, 523, 525, 531, 538, 546, 552, 570, 582, 596, 617, - 631, 653, 676, 700, 720, 749, 774, 811, 838, 870, 909, 936, 976, - 1017, 1058, 1099, 1143, 1192, 1238, 1284, 1336, 1388, 1445, 1493, 1546, 1610, - 1671, 1734, 1796, 1856, 1925, 1994, 2062, 2133, 2206, 2281, 2354, 2426, 2503, - 2587, 2669, 2754, 2843, 2928, 3016, 3105, 3201, 3284, 3351, 3421, 3480, 3534, - 556, 558, 562, 566, 572, 580, 586, 597, 611, 623, 637, 655, 673, - 693, 714, 738, 765, 790, 817, 847, 879, 913, 947, 981, 1021, 1059, - 1097, 1140, 1185, 1227, 1277, 1327, 1380, 1425, 1481, 1537, 1595, 1651, 1708, - 1771, 1834, 1901, 1966, 2035, 2107, 2170, 2244, 2315, 2396, 2474, 2552, 2628, - 2711, 2792, 2875, 2966, 3056, 3146, 3234, 3314, 3383, 3445, 3504, 3559, 599, - 603, 605, 607, 615, 621, 629, 639, 650, 668, 678, 701, 716, 731, - 758, 779, 807, 830, 860, 888, 921, 957, 986, 1026, 1062, 1100, 1141, - 1183, 1221, 1272, 1323, 1368, 1416, 1471, 1526, 1580, 1633, 1691, 1752, 1817, - 1876, 1944, 2002, 2072, 2143, 2218, 2291, 2363, 2435, 2509, 2589, 2672, 2752, - 2840, 2921, 3008, 3095, 3190, 3274, 3344, 3409, 3470, 3526, 3577, 641, 645, - 647, 651, 658, 666, 674, 682, 696, 710, 725, 744, 761, 777, 802, - 820, 851, 876, 907, 930, 964, 997, 1033, 1070, 1103, 1144, 1186, 1222, - 1270, 1318, 1360, 1411, 1463, 1515, 1569, 1622, 1678, 1739, 1800, 1853, 1917, - 1983, 2052, 2121, 2186, 2253, 2331, 2406, 2482, 2559, 2639, 2717, 2798, 2877, - 2961, 
3052, 3137, 3226, 3306, 3379, 3437, 3492, 3553, 3601, 686, 688, 694, - 702, 706, 712, 718, 729, 745, 753, 771, 784, 808, 823, 848, 871, - 895, 924, 951, 978, 1011, 1041, 1081, 1113, 1151, 1193, 1228, 1273, 1319, - 1358, 1406, 1458, 1510, 1557, 1612, 1669, 1727, 1781, 1839, 1903, 1969, 2031, - 2098, 2160, 2232, 2304, 2375, 2453, 2528, 2601, 2679, 2758, 2846, 2929, 3011, - 3098, 3186, 3271, 3340, 3401, 3466, 3522, 3571, 3620, 735, 739, 741, 747, - 751, 759, 767, 775, 787, 804, 818, 832, 856, 873, 893, 918, 939, - 969, 994, 1030, 1055, 1087, 1126, 1160, 1199, 1239, 1278, 1324, 1361, 1407, - 1453, 1505, 1551, 1605, 1663, 1716, 1768, 1830, 1893, 1951, 2008, 2075, 2139, - 2214, 2284, 2349, 2418, 2494, 2571, 2653, 2734, 2810, 2890, 2972, 3058, 3147, - 3231, 3310, 3375, 3435, 3490, 3545, 3595, 3642, 781, 785, 791, 796, 800, - 812, 814, 824, 839, 854, 867, 881, 903, 925, 943, 966, 988, 1018, - 1044, 1077, 1106, 1138, 1170, 1210, 1248, 1285, 1328, 1369, 1412, 1459, 1506, - 1549, 1601, 1657, 1704, 1762, 1821, 1880, 1938, 1999, 2063, 2125, 2193, 2257, - 2327, 2401, 2475, 2545, 2620, 2691, 2776, 2860, 2942, 3024, 3109, 3197, 3276, - 3345, 3403, 3468, 3520, 3569, 3616, 3664, 834, 840, 842, 844, 852, 858, - 865, 877, 883, 904, 915, 931, 954, 974, 991, 1014, 1039, 1071, 1095, - 1129, 1158, 1190, 1218, 1261, 1295, 1337, 1381, 1417, 1464, 1511, 1552, 1602, - 1654, 1699, 1759, 1813, 1872, 1927, 1990, 2049, 2113, 2178, 2239, 2308, 2378, - 2450, 2521, 2594, 2667, 2746, 2824, 2909, 2990, 3070, 3154, 3243, 3316, 3381, - 3441, 3493, 3547, 3597, 3640, 3682, 885, 889, 891, 897, 905, 911, 919, - 927, 934, 958, 970, 984, 1004, 1027, 1045, 1073, 1090, 1121, 1148, 1178, - 1207, 1244, 1276, 1310, 1347, 1389, 1426, 1472, 1516, 1558, 1606, 1658, 1700, - 1757, 1807, 1868, 1920, 1978, 2043, 2104, 2157, 2229, 2296, 2364, 2422, 2498, - 2574, 2650, 2727, 2801, 2872, 2954, 3038, 3129, 3212, 3288, 3352, 3419, 3475, - 3524, 3573, 3621, 3668, 3707, 940, 944, 948, 952, 960, 962, 972, 982, - 992, 1008, 1023, 
1037, 1056, 1082, 1098, 1124, 1146, 1171, 1202, 1229, 1263, - 1292, 1331, 1364, 1401, 1446, 1482, 1527, 1570, 1613, 1664, 1705, 1760, 1808, - 1863, 1915, 1976, 2036, 2087, 2153, 2221, 2286, 2344, 2414, 2486, 2556, 2623, - 2699, 2773, 2853, 2937, 3012, 3091, 3169, 3260, 3330, 3391, 3447, 3505, 3555, - 3603, 3646, 3684, 3727, 998, 1000, 1002, 1006, 1012, 1019, 1031, 1035, 1047, - 1065, 1083, 1093, 1109, 1133, 1156, 1179, 1205, 1225, 1257, 1286, 1320, 1350, - 1387, 1419, 1456, 1494, 1538, 1581, 1623, 1670, 1717, 1763, 1814, 1869, 1916, - 1974, 2028, 2081, 2150, 2212, 2272, 2335, 2403, 2469, 2539, 2608, 2680, 2755, - 2827, 2915, 2986, 3068, 3151, 3229, 3300, 3366, 3427, 3484, 3532, 3581, 3630, - 3672, 3709, 3745, 1049, 1051, 1057, 1063, 1067, 1075, 1085, 1091, 1104, 1118, - 1136, 1152, 1164, 1191, 1213, 1232, 1259, 1281, 1312, 1340, 1375, 1405, 1442, - 1476, 1514, 1547, 1596, 1634, 1679, 1728, 1769, 1822, 1873, 1921, 1977, 2029, - 2078, 2144, 2203, 2264, 2325, 2390, 2459, 2529, 2591, 2665, 2738, 2813, 2880, - 2957, 3041, 3127, 3206, 3282, 3348, 3399, 3460, 3513, 3565, 3609, 3650, 3695, - 3733, 3768, 1110, 1114, 1116, 1122, 1130, 1134, 1142, 1154, 1162, 1180, 1196, - 1211, 1223, 1251, 1268, 1290, 1321, 1342, 1373, 1398, 1434, 1467, 1499, 1535, - 1574, 1611, 1652, 1692, 1740, 1782, 1831, 1881, 1928, 1979, 2037, 2082, 2145, - 2200, 2261, 2321, 2387, 2454, 2513, 2583, 2656, 2730, 2793, 2867, 2945, 3025, - 3101, 3178, 3262, 3328, 3388, 3443, 3494, 3543, 3591, 3636, 3678, 3715, 3754, - 3790, 1166, 1172, 1174, 1182, 1187, 1197, 1203, 1215, 1219, 1240, 1253, 1269, - 1288, 1306, 1332, 1352, 1382, 1403, 1430, 1462, 1484, 1528, 1555, 1599, 1630, - 1672, 1709, 1753, 1801, 1840, 1894, 1939, 1991, 2044, 2088, 2151, 2204, 2262, - 2318, 2384, 2448, 2504, 2577, 2646, 2712, 2782, 2856, 2934, 3006, 3079, 3158, - 3240, 3307, 3371, 3425, 3481, 3530, 3575, 3618, 3660, 3701, 3741, 3774, 3807, - 1233, 1235, 1241, 1245, 1249, 1255, 1265, 1274, 1282, 1300, 1314, 1334, 1348, - 1370, 1392, 1415, 
1439, 1468, 1487, 1522, 1548, 1589, 1620, 1659, 1690, 1735, - 1772, 1818, 1854, 1904, 1952, 2000, 2050, 2105, 2154, 2213, 2265, 2322, 2385, - 2446, 2500, 2569, 2642, 2705, 2770, 2849, 2919, 2993, 3064, 3140, 3223, 3292, - 3353, 3414, 3464, 3516, 3561, 3607, 3648, 3687, 3725, 3762, 3796, 3827, 1296, - 1298, 1302, 1304, 1308, 1322, 1326, 1338, 1344, 1355, 1383, 1395, 1409, 1435, - 1451, 1477, 1502, 1532, 1553, 1586, 1616, 1646, 1684, 1722, 1756, 1797, 1835, - 1877, 1918, 1970, 2009, 2064, 2114, 2158, 2222, 2273, 2326, 2388, 2449, 2501, - 2567, 2636, 2695, 2768, 2836, 2910, 2976, 3053, 3131, 3209, 3279, 3336, 3397, - 3449, 3500, 3549, 3593, 3634, 3676, 3713, 3747, 3784, 3817, 3845, 1356, 1359, - 1365, 1371, 1377, 1385, 1393, 1399, 1413, 1421, 1447, 1460, 1478, 1495, 1520, - 1540, 1566, 1592, 1621, 1649, 1682, 1712, 1747, 1783, 1823, 1857, 1902, 1945, - 1984, 2032, 2076, 2126, 2179, 2230, 2287, 2336, 2391, 2455, 2505, 2570, 2637, - 2692, 2763, 2830, 2901, 2969, 3044, 3120, 3194, 3265, 3331, 3385, 3439, 3486, - 3536, 3582, 3626, 3665, 3703, 3739, 3772, 3802, 3835, 3864, 1423, 1427, 1431, - 1437, 1443, 1449, 1454, 1470, 1480, 1488, 1508, 1529, 1542, 1561, 1583, 1607, - 1631, 1662, 1686, 1718, 1744, 1775, 1811, 1847, 1889, 1926, 1967, 2003, 2053, - 2099, 2140, 2194, 2240, 2297, 2345, 2404, 2460, 2514, 2578, 2643, 2696, 2764, - 2826, 2897, 2962, 3036, 3112, 3182, 3254, 3321, 3376, 3429, 3478, 3527, 3567, - 3611, 3652, 3693, 3731, 3764, 3794, 3825, 3853, 3882, 1490, 1496, 1500, 1504, - 1512, 1518, 1530, 1536, 1544, 1559, 1577, 1593, 1609, 1627, 1653, 1675, 1695, - 1730, 1754, 1784, 1815, 1844, 1884, 1913, 1956, 1995, 2038, 2073, 2122, 2161, - 2215, 2258, 2309, 2365, 2415, 2470, 2530, 2584, 2647, 2706, 2769, 2831, 2898, - 2959, 3033, 3106, 3170, 3252, 3312, 3367, 3423, 3471, 3518, 3563, 3605, 3644, - 3680, 3717, 3755, 3788, 3819, 3847, 3874, 3898, 1563, 1567, 1571, 1575, 1579, - 1587, 1597, 1603, 1617, 1625, 1643, 1666, 1680, 1696, 1725, 1742, 1765, 1798, - 1828, 1849, 1886, 
1910, 1950, 1985, 2023, 2065, 2108, 2146, 2187, 2233, 2285, - 2328, 2379, 2423, 2487, 2540, 2592, 2657, 2713, 2771, 2837, 2902, 2963, 3034, - 3104, 3164, 3248, 3304, 3361, 3417, 3462, 3510, 3557, 3598, 3638, 3674, 3711, - 3743, 3776, 3811, 3839, 3868, 3892, 3917, 1635, 1637, 1639, 1641, 1647, 1660, - 1668, 1676, 1688, 1698, 1719, 1736, 1750, 1767, 1793, 1819, 1837, 1870, 1896, - 1924, 1957, 1989, 2020, 2057, 2093, 2134, 2171, 2219, 2254, 2305, 2350, 2402, - 2451, 2499, 2557, 2609, 2666, 2731, 2783, 2850, 2911, 2970, 3037, 3107, 3165, - 3246, 3301, 3359, 3410, 3458, 3508, 3551, 3589, 3632, 3670, 3705, 3737, 3770, - 3800, 3829, 3858, 3886, 3911, 3933, 1702, 1706, 1710, 1720, 1726, 1732, 1738, - 1748, 1761, 1773, 1789, 1805, 1829, 1841, 1864, 1892, 1908, 1940, 1968, 1997, - 2026, 2059, 2089, 2130, 2164, 2207, 2245, 2292, 2332, 2376, 2419, 2476, 2522, - 2575, 2624, 2681, 2739, 2794, 2857, 2920, 2977, 3045, 3113, 3171, 3249, 3302, - 3358, 3404, 3455, 3502, 3541, 3587, 3628, 3661, 3699, 3735, 3766, 3797, 3823, - 3851, 3876, 3903, 3927, 3950, 1777, 1785, 1787, 1791, 1799, 1803, 1809, 1825, - 1833, 1845, 1861, 1882, 1899, 1914, 1941, 1962, 1986, 2005, 2045, 2070, 2102, - 2135, 2166, 2201, 2236, 2282, 2316, 2366, 2407, 2456, 2495, 2546, 2595, 2651, - 2700, 2756, 2814, 2868, 2935, 2994, 3054, 3121, 3183, 3253, 3305, 3360, 3405, - 3453, 3498, 3539, 3585, 3622, 3658, 3697, 3728, 3760, 3792, 3821, 3849, 3872, - 3896, 3919, 3942, 3964, 1851, 1855, 1859, 1866, 1874, 1878, 1890, 1900, 1906, - 1922, 1934, 1958, 1972, 1993, 2010, 2041, 2061, 2080, 2120, 2147, 2175, 2210, - 2241, 2279, 2313, 2355, 2397, 2436, 2483, 2531, 2572, 2621, 2668, 2728, 2774, - 2828, 2881, 2946, 3007, 3065, 3132, 3195, 3255, 3313, 3362, 3411, 3456, 3499, - 3538, 3579, 3614, 3656, 3691, 3723, 3758, 3786, 3815, 3843, 3870, 3894, 3915, - 3937, 3956, 3975, 1930, 1932, 1942, 1946, 1948, 1960, 1964, 1975, 1987, 2001, - 2014, 2033, 2051, 2071, 2084, 2117, 2138, 2162, 2195, 2225, 2249, 2289, 2317, - 2359, 2392, 2427, 
2477, 2510, 2560, 2602, 2654, 2693, 2747, 2802, 2854, 2916, - 2958, 3026, 3080, 3141, 3210, 3266, 3322, 3368, 3418, 3459, 3503, 3540, 3580, - 3613, 3654, 3688, 3721, 3752, 3782, 3813, 3841, 3865, 3890, 3913, 3935, 3954, - 3972, 3989, 2011, 2015, 2017, 2021, 2027, 2039, 2047, 2055, 2067, 2077, 2090, - 2112, 2128, 2152, 2168, 2196, 2223, 2243, 2269, 2303, 2333, 2369, 2400, 2433, - 2473, 2506, 2553, 2590, 2640, 2682, 2735, 2777, 2825, 2873, 2938, 2987, 3042, - 3102, 3159, 3224, 3280, 3332, 3377, 3424, 3463, 3509, 3542, 3586, 3615, 3655, - 3685, 3719, 3750, 3780, 3809, 3836, 3862, 3888, 3909, 3931, 3952, 3970, 3987, - 4003, 2094, 2096, 2100, 2106, 2110, 2118, 2124, 2136, 2148, 2156, 2172, 2190, - 2211, 2231, 2247, 2277, 2299, 2323, 2351, 2382, 2412, 2447, 2480, 2511, 2555, - 2588, 2629, 2673, 2718, 2759, 2811, 2861, 2912, 2955, 3013, 3069, 3128, 3179, - 3241, 3293, 3337, 3386, 3430, 3472, 3511, 3552, 3588, 3623, 3657, 3689, 3720, - 3749, 3778, 3805, 3833, 3860, 3884, 3907, 3929, 3948, 3968, 3985, 4001, 4016, - 2176, 2180, 2182, 2184, 2188, 2198, 2205, 2217, 2227, 2237, 2251, 2274, 2295, - 2310, 2334, 2356, 2380, 2408, 2430, 2466, 2491, 2532, 2563, 2596, 2633, 2670, - 2714, 2753, 2799, 2847, 2891, 2943, 2991, 3039, 3092, 3152, 3207, 3263, 3308, - 3354, 3398, 3440, 3479, 3519, 3558, 3590, 3629, 3659, 3692, 3722, 3751, 3779, - 3804, 3831, 3856, 3880, 3905, 3925, 3946, 3966, 3983, 3999, 4014, 4028, 2255, - 2259, 2263, 2267, 2275, 2283, 2293, 2301, 2311, 2324, 2338, 2360, 2374, 2394, - 2416, 2439, 2467, 2489, 2515, 2547, 2580, 2610, 2648, 2678, 2719, 2757, 2795, - 2841, 2878, 2930, 2973, 3027, 3071, 3130, 3172, 3230, 3283, 3329, 3372, 3415, - 3450, 3487, 3528, 3564, 3599, 3633, 3662, 3698, 3724, 3753, 3781, 3806, 3832, - 3855, 3878, 3901, 3923, 3944, 3962, 3981, 3997, 4012, 4026, 4039, 2340, 2342, - 2346, 2352, 2362, 2370, 2372, 2386, 2398, 2410, 2420, 2442, 2461, 2481, 2497, - 2525, 2550, 2576, 2600, 2630, 2664, 2688, 2736, 2765, 2805, 2844, 2876, 2922, - 2964, 3014, 3059, 
3110, 3155, 3213, 3261, 3303, 3349, 3389, 3426, 3465, 3501, - 3537, 3568, 3606, 3639, 3671, 3700, 3729, 3759, 3783, 3810, 3834, 3857, 3879, - 3900, 3921, 3940, 3960, 3979, 3995, 4010, 4024, 4037, 4049, 2424, 2428, 2434, - 2440, 2444, 2457, 2463, 2471, 2485, 2493, 2507, 2535, 2543, 2565, 2586, 2611, - 2638, 2662, 2685, 2721, 2748, 2781, 2821, 2852, 2885, 2931, 2967, 3009, 3055, - 3099, 3148, 3198, 3244, 3289, 3333, 3369, 3400, 3444, 3482, 3517, 3550, 3583, - 3612, 3645, 3675, 3706, 3736, 3761, 3787, 3814, 3837, 3861, 3881, 3902, 3922, - 3939, 3958, 3977, 3993, 4008, 4022, 4035, 4047, 4058, 2517, 2519, 2523, 2533, - 2537, 2541, 2551, 2561, 2568, 2582, 2598, 2616, 2634, 2658, 2676, 2690, 2725, - 2749, 2775, 2808, 2838, 2866, 2905, 2944, 2975, 3017, 3057, 3096, 3138, 3187, - 3232, 3277, 3317, 3355, 3392, 3428, 3461, 3495, 3531, 3562, 3594, 3627, 3653, - 3681, 3712, 3738, 3767, 3793, 3816, 3842, 3863, 3885, 3906, 3924, 3941, 3959, - 3974, 3991, 4006, 4020, 4033, 4045, 4056, 4066, 2604, 2612, 2614, 2618, 2622, - 2626, 2644, 2652, 2660, 2674, 2686, 2707, 2729, 2743, 2766, 2785, 2815, 2842, - 2864, 2896, 2925, 2956, 2997, 3031, 3066, 3108, 3149, 3191, 3227, 3272, 3311, - 3346, 3382, 3420, 3448, 3485, 3514, 3544, 3576, 3608, 3635, 3666, 3694, 3718, - 3744, 3771, 3798, 3822, 3844, 3866, 3889, 3908, 3926, 3945, 3961, 3978, 3992, - 4005, 4018, 4031, 4043, 4054, 4064, 4073, 2697, 2701, 2703, 2709, 2715, 2723, - 2737, 2741, 2751, 2767, 2779, 2796, 2819, 2834, 2858, 2874, 2906, 2936, 2952, - 2988, 3019, 3050, 3084, 3125, 3156, 3202, 3235, 3275, 3309, 3341, 3378, 3406, - 3442, 3476, 3506, 3533, 3566, 3592, 3619, 3649, 3677, 3704, 3732, 3756, 3777, - 3801, 3824, 3850, 3871, 3891, 3910, 3930, 3947, 3963, 3980, 3994, 4007, 4019, - 4030, 4041, 4052, 4062, 4071, 4079, 2787, 2789, 2797, 2803, 2809, 2817, 2823, - 2832, 2848, 2862, 2870, 2888, 2913, 2932, 2948, 2971, 3000, 3028, 3051, 3074, - 3116, 3142, 3173, 3217, 3251, 3285, 3315, 3347, 3380, 3402, 3436, 3469, 3496, - 3525, 3556, 3584, 
3610, 3637, 3663, 3690, 3714, 3740, 3765, 3789, 3812, 3830, - 3852, 3873, 3895, 3914, 3932, 3949, 3967, 3982, 3996, 4009, 4021, 4032, 4042, - 4051, 4060, 4069, 4077, 4084, 2882, 2886, 2892, 2894, 2900, 2914, 2918, 2926, - 2940, 2950, 2965, 2980, 3003, 3021, 3043, 3067, 3089, 3118, 3144, 3166, 3208, - 3238, 3269, 3295, 3327, 3356, 3384, 3412, 3438, 3467, 3491, 3521, 3548, 3574, - 3604, 3631, 3651, 3679, 3702, 3726, 3748, 3773, 3795, 3820, 3840, 3859, 3877, - 3897, 3916, 3936, 3953, 3969, 3984, 3998, 4011, 4023, 4034, 4044, 4053, 4061, - 4068, 4075, 4082, 4088, 2981, 2983, 2989, 2995, 3001, 3005, 3015, 3029, 3035, - 3047, 3061, 3075, 3097, 3122, 3136, 3162, 3188, 3218, 3242, 3267, 3291, 3319, - 3342, 3370, 3396, 3422, 3446, 3473, 3497, 3523, 3546, 3570, 3600, 3624, 3647, - 3673, 3696, 3716, 3742, 3763, 3785, 3803, 3826, 3848, 3869, 3887, 3904, 3920, - 3938, 3955, 3971, 3986, 4000, 4013, 4025, 4036, 4046, 4055, 4063, 4070, 4076, - 4081, 4086, 4091, 3077, 3081, 3085, 3087, 3093, 3103, 3114, 3126, 3134, 3150, - 3160, 3174, 3200, 3220, 3236, 3258, 3281, 3297, 3325, 3343, 3364, 3390, 3408, - 3434, 3457, 3483, 3507, 3529, 3554, 3572, 3596, 3617, 3641, 3669, 3686, 3710, - 3734, 3757, 3775, 3799, 3818, 3838, 3854, 3875, 3893, 3912, 3928, 3943, 3957, - 3973, 3988, 4002, 4015, 4027, 4038, 4048, 4057, 4065, 4072, 4078, 4083, 4087, - 4090, 4093, 3176, 3180, 3184, 3192, 3196, 3204, 3214, 3222, 3228, 3247, 3259, - 3273, 3287, 3299, 3323, 3335, 3357, 3374, 3394, 3416, 3432, 3452, 3477, 3489, - 3515, 3535, 3560, 3578, 3602, 3625, 3643, 3667, 3683, 3708, 3730, 3746, 3769, - 3791, 3808, 3828, 3846, 3867, 3883, 3899, 3918, 3934, 3951, 3965, 3976, 3990, - 4004, 4017, 4029, 4040, 4050, 4059, 4067, 4074, 4080, 4085, 4089, 4092, 4094, - 4095, -}; -#endif // CONFIG_TX64X64 + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, + 91, 119, 120, 152, 153, 189, 190, 230, 231, 275, 276, 324, 325, + 377, 378, 434, 435, 495, 496, 2, 4, 7, 13, 16, 26, 29, + 43, 46, 64, 67, 89, 92, 118, 121, 151, 154, 
188, 191, 229, + 232, 274, 277, 323, 326, 376, 379, 433, 436, 494, 497, 558, 3, + 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122, + 150, 155, 187, 192, 228, 233, 273, 278, 322, 327, 375, 380, 432, + 437, 493, 498, 557, 559, 9, 11, 18, 24, 31, 41, 48, 62, + 69, 87, 94, 116, 123, 149, 156, 186, 193, 227, 234, 272, 279, + 321, 328, 374, 381, 431, 438, 492, 499, 556, 560, 617, 10, 19, + 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 148, 157, 185, + 194, 226, 235, 271, 280, 320, 329, 373, 382, 430, 439, 491, 500, + 555, 561, 616, 618, 20, 22, 33, 39, 50, 60, 71, 85, 96, + 114, 125, 147, 158, 184, 195, 225, 236, 270, 281, 319, 330, 372, + 383, 429, 440, 490, 501, 554, 562, 615, 619, 672, 21, 34, 38, + 51, 59, 72, 84, 97, 113, 126, 146, 159, 183, 196, 224, 237, + 269, 282, 318, 331, 371, 384, 428, 441, 489, 502, 553, 563, 614, + 620, 671, 673, 35, 37, 52, 58, 73, 83, 98, 112, 127, 145, + 160, 182, 197, 223, 238, 268, 283, 317, 332, 370, 385, 427, 442, + 488, 503, 552, 564, 613, 621, 670, 674, 723, 36, 53, 57, 74, + 82, 99, 111, 128, 144, 161, 181, 198, 222, 239, 267, 284, 316, + 333, 369, 386, 426, 443, 487, 504, 551, 565, 612, 622, 669, 675, + 722, 724, 54, 56, 75, 81, 100, 110, 129, 143, 162, 180, 199, + 221, 240, 266, 285, 315, 334, 368, 387, 425, 444, 486, 505, 550, + 566, 611, 623, 668, 676, 721, 725, 770, 55, 76, 80, 101, 109, + 130, 142, 163, 179, 200, 220, 241, 265, 286, 314, 335, 367, 388, + 424, 445, 485, 506, 549, 567, 610, 624, 667, 677, 720, 726, 769, + 771, 77, 79, 102, 108, 131, 141, 164, 178, 201, 219, 242, 264, + 287, 313, 336, 366, 389, 423, 446, 484, 507, 548, 568, 609, 625, + 666, 678, 719, 727, 768, 772, 813, 78, 103, 107, 132, 140, 165, + 177, 202, 218, 243, 263, 288, 312, 337, 365, 390, 422, 447, 483, + 508, 547, 569, 608, 626, 665, 679, 718, 728, 767, 773, 812, 814, + 104, 106, 133, 139, 166, 176, 203, 217, 244, 262, 289, 311, 338, + 364, 391, 421, 448, 482, 509, 546, 570, 607, 627, 664, 680, 717, + 729, 766, 774, 811, 815, 852, 105, 134, 138, 
167, 175, 204, 216, + 245, 261, 290, 310, 339, 363, 392, 420, 449, 481, 510, 545, 571, + 606, 628, 663, 681, 716, 730, 765, 775, 810, 816, 851, 853, 135, + 137, 168, 174, 205, 215, 246, 260, 291, 309, 340, 362, 393, 419, + 450, 480, 511, 544, 572, 605, 629, 662, 682, 715, 731, 764, 776, + 809, 817, 850, 854, 887, 136, 169, 173, 206, 214, 247, 259, 292, + 308, 341, 361, 394, 418, 451, 479, 512, 543, 573, 604, 630, 661, + 683, 714, 732, 763, 777, 808, 818, 849, 855, 886, 888, 170, 172, + 207, 213, 248, 258, 293, 307, 342, 360, 395, 417, 452, 478, 513, + 542, 574, 603, 631, 660, 684, 713, 733, 762, 778, 807, 819, 848, + 856, 885, 889, 918, 171, 208, 212, 249, 257, 294, 306, 343, 359, + 396, 416, 453, 477, 514, 541, 575, 602, 632, 659, 685, 712, 734, + 761, 779, 806, 820, 847, 857, 884, 890, 917, 919, 209, 211, 250, + 256, 295, 305, 344, 358, 397, 415, 454, 476, 515, 540, 576, 601, + 633, 658, 686, 711, 735, 760, 780, 805, 821, 846, 858, 883, 891, + 916, 920, 945, 210, 251, 255, 296, 304, 345, 357, 398, 414, 455, + 475, 516, 539, 577, 600, 634, 657, 687, 710, 736, 759, 781, 804, + 822, 845, 859, 882, 892, 915, 921, 944, 946, 252, 254, 297, 303, + 346, 356, 399, 413, 456, 474, 517, 538, 578, 599, 635, 656, 688, + 709, 737, 758, 782, 803, 823, 844, 860, 881, 893, 914, 922, 943, + 947, 968, 253, 298, 302, 347, 355, 400, 412, 457, 473, 518, 537, + 579, 598, 636, 655, 689, 708, 738, 757, 783, 802, 824, 843, 861, + 880, 894, 913, 923, 942, 948, 967, 969, 299, 301, 348, 354, 401, + 411, 458, 472, 519, 536, 580, 597, 637, 654, 690, 707, 739, 756, + 784, 801, 825, 842, 862, 879, 895, 912, 924, 941, 949, 966, 970, + 987, 300, 349, 353, 402, 410, 459, 471, 520, 535, 581, 596, 638, + 653, 691, 706, 740, 755, 785, 800, 826, 841, 863, 878, 896, 911, + 925, 940, 950, 965, 971, 986, 988, 350, 352, 403, 409, 460, 470, + 521, 534, 582, 595, 639, 652, 692, 705, 741, 754, 786, 799, 827, + 840, 864, 877, 897, 910, 926, 939, 951, 964, 972, 985, 989, 1002, + 351, 404, 408, 461, 469, 522, 
533, 583, 594, 640, 651, 693, 704, + 742, 753, 787, 798, 828, 839, 865, 876, 898, 909, 927, 938, 952, + 963, 973, 984, 990, 1001, 1003, 405, 407, 462, 468, 523, 532, 584, + 593, 641, 650, 694, 703, 743, 752, 788, 797, 829, 838, 866, 875, + 899, 908, 928, 937, 953, 962, 974, 983, 991, 1000, 1004, 1013, 406, + 463, 467, 524, 531, 585, 592, 642, 649, 695, 702, 744, 751, 789, + 796, 830, 837, 867, 874, 900, 907, 929, 936, 954, 961, 975, 982, + 992, 999, 1005, 1012, 1014, 464, 466, 525, 530, 586, 591, 643, 648, + 696, 701, 745, 750, 790, 795, 831, 836, 868, 873, 901, 906, 930, + 935, 955, 960, 976, 981, 993, 998, 1006, 1011, 1015, 1020, 465, 526, + 529, 587, 590, 644, 647, 697, 700, 746, 749, 791, 794, 832, 835, + 869, 872, 902, 905, 931, 934, 956, 959, 977, 980, 994, 997, 1007, + 1010, 1016, 1019, 1021, 527, 528, 588, 589, 645, 646, 698, 699, 747, + 748, 792, 793, 833, 834, 870, 871, 903, 904, 932, 933, 957, 958, + 978, 979, 995, 996, 1008, 1009, 1017, 1018, 1022, 1023 +}; const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, -#endif { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, -#if CONFIG_TX64X64 - { default_scan_64x64, av1_default_iscan_64x64, default_scan_64x64_neighbors }, -#endif // CONFIG_TX64X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, }; -const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES] = { -#if CONFIG_CHROMA_2X2 - { - // TX_2X2 - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, -#if CONFIG_EXT_TX - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, -#endif // CONFIG_EXT_TX - }, -#endif +const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { // TX_4X4 { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, -#if CONFIG_EXT_TX - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, 
av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, - { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors }, - { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_8X8 - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors }, - { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, -#if CONFIG_EXT_TX - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors }, - { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors }, - { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors }, - { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors }, - { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors }, - { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_16X16 - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors }, - { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors 
}, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, -#if CONFIG_EXT_TX - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors }, - { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors }, - { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors }, - { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors }, - { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors }, - { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_32X32 - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors }, - { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, -#if CONFIG_EXT_TX - { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors }, - { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { 
mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, -#endif // CONFIG_EXT_TX - }, -#if CONFIG_TX64X64 - { - // TX_64X64 - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, -#if CONFIG_EXT_TX - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, -#endif // CONFIG_EXT_TX - }, -#endif // CONFIG_TX64X64 - { - // TX_4X8 - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, -#if CONFIG_EXT_TX 
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_8X4 - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, -#if CONFIG_EXT_TX - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, 
mcol_scan_8x4_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_8X16 - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, -#if CONFIG_EXT_TX - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_16X8 - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, -#if CONFIG_EXT_TX - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, 
av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_16X32 - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, -#if CONFIG_EXT_TX - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_32X16 - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, 
av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, -#if CONFIG_EXT_TX - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, -#endif // CONFIG_EXT_TX - }, -#if CONFIG_TX64X64 - { - // TX_32X64 - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, -#if CONFIG_EXT_TX - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { 
default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, -#endif // CONFIG_EXT_TX - }, - { - // TX_64X32 - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, -#if CONFIG_EXT_TX - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, -#endif // CONFIG_EXT_TX - 
} -#endif // CONFIG_TX64X64 -}; - -const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { -#if CONFIG_CHROMA_2X2 - { - // TX_2X2 - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, - { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors }, -#if CONFIG_EXT_TX - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, -#endif // CONFIG_EXT_TX - }, -#endif - { - // TX_4X4 { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, -#if CONFIG_EXT_TX { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_4x4, av1_default_iscan_4x4, 
default_scan_4x4_neighbors }, { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_8X8 @@ -7453,20 +3236,18 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, -#if CONFIG_EXT_TX { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, + { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_16X16 @@ -7478,7 +3259,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x16_neighbors }, { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, -#if CONFIG_EXT_TX 
{ default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_16x16, av1_default_iscan_16x16, @@ -7489,96 +3269,93 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x16_neighbors }, { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, + { default_scan_16x16, av1_default_iscan_16x16, + default_scan_16x16_neighbors }, { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_32X32 { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, - { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors }, - { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, -#if CONFIG_EXT_TX - { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors }, - { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { 
default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, -#endif // CONFIG_EXT_TX }, -#if CONFIG_TX64X64 { // TX_64X64 - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, -#if CONFIG_EXT_TX - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { 
default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, - { default_scan_64x64, av1_default_iscan_64x64, - default_scan_64x64_neighbors }, -#endif // CONFIG_EXT_TX + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, }, -#endif // CONFIG_TX64X64 { // TX_4X8 { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, -#if CONFIG_EXT_TX { 
default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, + { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_8X4 @@ -7586,20 +3363,18 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, -#if CONFIG_EXT_TX { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, + { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, { mcol_scan_8x4, av1_mcol_iscan_8x4, 
mcol_scan_8x4_neighbors }, { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_8X16 @@ -7611,7 +3386,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_8x16_neighbors }, { default_scan_8x16, av1_default_iscan_8x16, default_scan_8x16_neighbors }, -#if CONFIG_EXT_TX { default_scan_8x16, av1_default_iscan_8x16, default_scan_8x16_neighbors }, { default_scan_8x16, av1_default_iscan_8x16, @@ -7622,14 +3396,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_8x16_neighbors }, { default_scan_8x16, av1_default_iscan_8x16, default_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, + { default_scan_8x16, av1_default_iscan_8x16, + default_scan_8x16_neighbors }, { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_16X8 @@ -7641,7 +3415,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x8_neighbors }, { default_scan_16x8, av1_default_iscan_16x8, default_scan_16x8_neighbors }, -#if CONFIG_EXT_TX { default_scan_16x8, av1_default_iscan_16x8, default_scan_16x8_neighbors }, { default_scan_16x8, av1_default_iscan_16x8, @@ -7652,14 +3425,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x8_neighbors }, { default_scan_16x8, av1_default_iscan_16x8, default_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, + { default_scan_16x8, av1_default_iscan_16x8, + 
default_scan_16x8_neighbors }, { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_16X32 @@ -7671,7 +3444,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x32_neighbors }, { default_scan_16x32, av1_default_iscan_16x32, default_scan_16x32_neighbors }, -#if CONFIG_EXT_TX { default_scan_16x32, av1_default_iscan_16x32, default_scan_16x32_neighbors }, { default_scan_16x32, av1_default_iscan_16x32, @@ -7682,14 +3454,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x32_neighbors }, { default_scan_16x32, av1_default_iscan_16x32, default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_32X16 @@ -7701,7 +3473,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_32x16_neighbors }, { default_scan_32x16, av1_default_iscan_32x16, default_scan_32x16_neighbors }, -#if CONFIG_EXT_TX { default_scan_32x16, av1_default_iscan_32x16, default_scan_32x16_neighbors }, { default_scan_32x16, av1_default_iscan_32x16, @@ -7712,91 
+3483,77 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_32x16_neighbors }, { default_scan_32x16, av1_default_iscan_32x16, default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, -#endif // CONFIG_EXT_TX }, -#if CONFIG_TX64X64 { // TX_32X64 - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, -#if CONFIG_EXT_TX - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - 
default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, - { default_scan_32x64, av1_default_iscan_32x64, - default_scan_32x64_neighbors }, -#endif // CONFIG_EXT_TX + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, }, { // TX_64X32 - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, -#if CONFIG_EXT_TX - { default_scan_64x32, 
av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, - { default_scan_64x32, av1_default_iscan_64x32, - default_scan_64x32_neighbors }, -#endif // CONFIG_EXT_TX + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32, + default_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, + { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, }, -#endif // CONFIG_TX64X64 { // TX_4X16 { default_scan_4x16, av1_default_iscan_4x16, @@ -7807,7 +3564,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_4x16_neighbors }, { default_scan_4x16, av1_default_iscan_4x16, default_scan_4x16_neighbors }, -#if CONFIG_EXT_TX { default_scan_4x16, av1_default_iscan_4x16, default_scan_4x16_neighbors }, { default_scan_4x16, av1_default_iscan_4x16, @@ -7818,14 +3574,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_4x16_neighbors }, { default_scan_4x16, av1_default_iscan_4x16, default_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, + { default_scan_4x16, av1_default_iscan_4x16, 
+ default_scan_4x16_neighbors }, { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_16X4 @@ -7837,7 +3593,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x4_neighbors }, { default_scan_16x4, av1_default_iscan_16x4, default_scan_16x4_neighbors }, -#if CONFIG_EXT_TX { default_scan_16x4, av1_default_iscan_16x4, default_scan_16x4_neighbors }, { default_scan_16x4, av1_default_iscan_16x4, @@ -7848,14 +3603,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_16x4_neighbors }, { default_scan_16x4, av1_default_iscan_16x4, default_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, + { default_scan_16x4, av1_default_iscan_16x4, + default_scan_16x4_neighbors }, { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_8X32 @@ -7867,7 +3622,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_8x32_neighbors }, { default_scan_8x32, av1_default_iscan_8x32, default_scan_8x32_neighbors }, -#if CONFIG_EXT_TX { default_scan_8x32, av1_default_iscan_8x32, default_scan_8x32_neighbors }, { default_scan_8x32, av1_default_iscan_8x32, @@ -7878,14 +3632,14 @@ const SCAN_ORDER 
av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_8x32_neighbors }, { default_scan_8x32, av1_default_iscan_8x32, default_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, + { default_scan_8x32, av1_default_iscan_8x32, + default_scan_8x32_neighbors }, { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, -#endif // CONFIG_EXT_TX }, { // TX_32X8 @@ -7897,7 +3651,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_32x8_neighbors }, { default_scan_32x8, av1_default_iscan_32x8, default_scan_32x8_neighbors }, -#if CONFIG_EXT_TX { default_scan_32x8, av1_default_iscan_32x8, default_scan_32x8_neighbors }, { default_scan_32x8, av1_default_iscan_32x8, @@ -7908,679 +3661,75 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { default_scan_32x8_neighbors }, { default_scan_32x8, av1_default_iscan_32x8, default_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, + { default_scan_32x8, av1_default_iscan_32x8, + default_scan_32x8_neighbors }, { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, -#endif // CONFIG_EXT_TX + }, + { + // TX_16X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32, + default_scan_16x32_neighbors }, + { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, + { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, + { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, + { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + }, + { + // TX_64X16 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16, + default_scan_32x16_neighbors }, + { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, + { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, + { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, + { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, }, }; - -#if CONFIG_ADAPT_SCAN -// TX_32X32 will has 1024 coefficients whose indexes can be represented in 10 -// bits -#define COEFF_IDX_BITS (10 + CONFIG_TX64X64) -#define COEFF_IDX_SIZE (1 << COEFF_IDX_BITS) -#define COEFF_IDX_MASK (COEFF_IDX_SIZE - 1) - -static uint32_t *get_non_zero_prob(FRAME_CONTEXT *fc, TX_SIZE tx_size, - TX_TYPE tx_type) { - switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return fc->non_zero_prob_2x2[tx_type]; -#endif - case TX_4X4: return fc->non_zero_prob_4X4[tx_type]; - case TX_8X8: return fc->non_zero_prob_8X8[tx_type]; - case TX_16X16: return fc->non_zero_prob_16X16[tx_type]; - case TX_32X32: return fc->non_zero_prob_32X32[tx_type]; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) 
- case TX_4X8: return fc->non_zero_prob_4X8[tx_type]; - case TX_8X4: return fc->non_zero_prob_8X4[tx_type]; - case TX_8X16: return fc->non_zero_prob_8X16[tx_type]; - case TX_16X8: return fc->non_zero_prob_16X8[tx_type]; - case TX_16X32: return fc->non_zero_prob_16X32[tx_type]; - case TX_32X16: return fc->non_zero_prob_32X16[tx_type]; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - default: assert(0); return NULL; - } -} - -static int16_t *get_adapt_scan(FRAME_CONTEXT *fc, TX_SIZE tx_size, - TX_TYPE tx_type) { - switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return fc->scan_2x2[tx_type]; -#endif - case TX_4X4: return fc->scan_4X4[tx_type]; - case TX_8X8: return fc->scan_8X8[tx_type]; - case TX_16X16: return fc->scan_16X16[tx_type]; - case TX_32X32: return fc->scan_32X32[tx_type]; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X8: return fc->scan_4X8[tx_type]; - case TX_8X4: return fc->scan_8X4[tx_type]; - case TX_8X16: return fc->scan_8X16[tx_type]; - case TX_16X8: return fc->scan_16X8[tx_type]; - case TX_16X32: return fc->scan_16X32[tx_type]; - case TX_32X16: return fc->scan_32X16[tx_type]; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - default: assert(0); return NULL; - } -} - -static int16_t *get_adapt_iscan(FRAME_CONTEXT *fc, TX_SIZE tx_size, - TX_TYPE tx_type) { - switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return fc->iscan_2x2[tx_type]; -#endif - case TX_4X4: return fc->iscan_4X4[tx_type]; - case TX_8X8: return fc->iscan_8X8[tx_type]; - case TX_16X16: return fc->iscan_16X16[tx_type]; - case TX_32X32: return fc->iscan_32X32[tx_type]; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X8: return fc->iscan_4X8[tx_type]; - case TX_8X4: return fc->iscan_8X4[tx_type]; - case TX_8X16: return fc->iscan_8X16[tx_type]; - case TX_16X8: return fc->iscan_16X8[tx_type]; - case TX_16X32: return fc->iscan_16X32[tx_type]; - case TX_32X16: return fc->iscan_32X16[tx_type]; -#endif // 
CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - default: assert(0); return NULL; - } -} - -static int16_t *get_adapt_nb(FRAME_CONTEXT *fc, TX_SIZE tx_size, - TX_TYPE tx_type) { - switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return fc->nb_2x2[tx_type]; -#endif - case TX_4X4: return fc->nb_4X4[tx_type]; - case TX_8X8: return fc->nb_8X8[tx_type]; - case TX_16X16: return fc->nb_16X16[tx_type]; - case TX_32X32: return fc->nb_32X32[tx_type]; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X8: return fc->nb_4X8[tx_type]; - case TX_8X4: return fc->nb_8X4[tx_type]; - case TX_8X16: return fc->nb_8X16[tx_type]; - case TX_16X8: return fc->nb_16X8[tx_type]; - case TX_16X32: return fc->nb_16X32[tx_type]; - case TX_32X16: return fc->nb_32X16[tx_type]; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - default: assert(0); return NULL; - } -} - -static uint32_t *get_non_zero_counts(FRAME_COUNTS *counts, TX_SIZE tx_size, - TX_TYPE tx_type) { - switch (tx_size) { -#if CONFIG_CHROMA_2X2 - case TX_2X2: return counts->non_zero_count_2x2[tx_type]; -#endif - case TX_4X4: return counts->non_zero_count_4X4[tx_type]; - case TX_8X8: return counts->non_zero_count_8X8[tx_type]; - case TX_16X16: return counts->non_zero_count_16X16[tx_type]; - case TX_32X32: return counts->non_zero_count_32X32[tx_type]; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X8: return counts->non_zero_count_4x8[tx_type]; - case TX_8X4: return counts->non_zero_count_8x4[tx_type]; - case TX_8X16: return counts->non_zero_count_8x16[tx_type]; - case TX_16X8: return counts->non_zero_count_16x8[tx_type]; - case TX_16X32: return counts->non_zero_count_16x32[tx_type]; - case TX_32X16: return counts->non_zero_count_32x16[tx_type]; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - default: assert(0); return NULL; - } -} - -static INLINE int clamp_64(int64_t value, int low, int high) { - return value < low ? low : (value > high ? 
high : (int)value); -} - -#if USE_2X2_PROB -static int do_down_sample(TX_SIZE tx_size) { - const int tx_w = tx_size_wide[tx_size]; - const int tx_h = tx_size_high[tx_size]; - if (tx_w > 8 || tx_h > 8) { - return 1; - } else { - return 0; - } -} - -void av1_down_sample_scan_count(uint32_t *non_zero_count_ds, - const uint32_t *non_zero_count, - TX_SIZE tx_size) { - const int tx_w = tx_size_wide[tx_size]; - const int tx_h = tx_size_high[tx_size]; - if (tx_w > 8 && tx_h > 8) { - const int tx_w_ds = tx_w >> 1; - const int tx_h_ds = tx_h >> 1; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds << 1; - const int c = c_ds << 1; - const int ci = r * tx_w + c; - non_zero_count_ds[ci_ds] = non_zero_count[ci]; - } - } - } else if (tx_w > 8 && tx_h <= 8) { - const int tx_w_ds = tx_w >> 1; - const int tx_h_ds = tx_h; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds; - const int c = c_ds << 1; - const int ci = r * tx_w + c; - non_zero_count_ds[ci_ds] = non_zero_count[ci]; - } - } - } else if (tx_w <= 8 && tx_h > 8) { - const int tx_w_ds = tx_w; - const int tx_h_ds = tx_h >> 1; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds << 1; - const int c = c_ds; - const int ci = r * tx_w + c; - non_zero_count_ds[ci_ds] = non_zero_count[ci]; - } - } - } else { - assert(0); - } -} - -void av1_up_sample_scan_count(uint32_t *non_zero_count, - const uint32_t *non_zero_count_ds, - TX_SIZE tx_size, unsigned int block_num) { - const int tx_w = tx_size_wide[tx_size]; - const int tx_h = tx_size_high[tx_size]; - if (tx_w > 8 && tx_h > 8) { - const int tx_w_ds = tx_w >> 1; - const int tx_h_ds = tx_h >> 1; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < 
tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds << 1; - const int c = c_ds << 1; - const int ci = r * tx_w + c; - non_zero_count[ci] = non_zero_count_ds[ci_ds]; - if (c_ds + 1 < tx_w_ds) { - uint32_t count = - non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + 1]; - count = ROUND_POWER_OF_TWO(count, 1); - count = clamp32u(count, 0, block_num); - non_zero_count[ci + 1] = count; - } else { - non_zero_count[ci + 1] = non_zero_count_ds[ci_ds]; - } - } - } - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c = 0; c < tx_w; ++c) { - const int r = r_ds << 1; - const int ci = r * tx_w + c; - if (r + 2 < tx_h) { - uint32_t count = non_zero_count[ci] + non_zero_count[ci + 2 * tx_w]; - count = ROUND_POWER_OF_TWO(count, 1); - count = clamp32u(count, 0, block_num); - non_zero_count[ci + tx_w] = count; - } else { - non_zero_count[ci + tx_w] = non_zero_count[ci]; - } - } - } - } else if (tx_w > 8 && tx_h <= 8) { - const int tx_w_ds = tx_w >> 1; - const int tx_h_ds = tx_h; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds; - const int c = c_ds << 1; - const int ci = r * tx_w + c; - non_zero_count[ci] = non_zero_count_ds[ci_ds]; - if (c_ds + 1 < tx_w_ds) { - uint32_t count = - non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + 1]; - count = ROUND_POWER_OF_TWO(count, 1); - count = clamp32u(count, 0, block_num); - non_zero_count[ci + 1] = count; - } else { - non_zero_count[ci + 1] = non_zero_count_ds[ci_ds]; - } - } - } - } else if (tx_w <= 8 && tx_h > 8) { - const int tx_w_ds = tx_w; - const int tx_h_ds = tx_h >> 1; - for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) { - for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) { - const int ci_ds = r_ds * tx_w_ds + c_ds; - const int r = r_ds << 1; - const int c = c_ds; - const int ci = r * tx_w + c; - non_zero_count[ci] = non_zero_count_ds[ci_ds]; - if (r_ds + 1 < tx_h_ds) { - uint32_t count = - 
non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + tx_w_ds]; - count = ROUND_POWER_OF_TWO(count, 1); - count = clamp32u(count, 0, block_num); - non_zero_count[ci + tx_w] = count; - } else { - non_zero_count[ci + tx_w] = non_zero_count_ds[ci_ds]; - } - } - } - } else { - assert(0); - } -} -#endif - -static void update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type, - int rate) { - FRAME_CONTEXT *pre_fc = cm->pre_fc; - uint32_t *prev_non_zero_prob = get_non_zero_prob(pre_fc, tx_size, tx_type); - uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type); - uint32_t *non_zero_count = get_non_zero_counts(&cm->counts, tx_size, tx_type); - const int tx2d_size = tx_size_2d[tx_size]; - unsigned int block_num = cm->counts.txb_count[tx_size][tx_type]; -#if USE_2X2_PROB -#if CONFIG_TX64X64 - DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[1024]); - assert((tx2d_size >> 2) <= 1024); -#else // CONFIG_TX64X64 - DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[256]); - assert((tx2d_size >> 2) <= 256); -#endif // CONFIG_TX64X64 - if (do_down_sample(tx_size)) { - av1_down_sample_scan_count(non_zero_count_ds, non_zero_count, tx_size); - av1_up_sample_scan_count(non_zero_count, non_zero_count_ds, tx_size, - block_num); - } -#endif - int i; - const int inv_precision = 30; - int32_t inv_block_num = block_num == 0 ? 0 : (1 << inv_precision) / block_num; - for (i = 0; i < tx2d_size; i++) { - int64_t curr_prob = - block_num == 0 ? 
0 : ((non_zero_count[i] * inv_block_num) >> - (inv_precision - ADAPT_SCAN_PROB_PRECISION)); - int64_t prev_prob = prev_non_zero_prob[i]; - int64_t pred_prob = - (curr_prob * rate + - prev_prob * ((1 << ADAPT_SCAN_PROB_PRECISION) - rate)) >> - ADAPT_SCAN_PROB_PRECISION; - // TODO(angiebird): reduce the bit usage of probabilities and remove - // clamp_64() - non_zero_prob[i] = - clamp_64(pred_prob, 0, (1 << ADAPT_SCAN_PROB_PRECISION) - 1); - } -} - -static void update_scan_count(int16_t *scan, int max_scan, - const tran_low_t *dqcoeffs, - uint32_t *non_zero_count) { - int i; - for (i = 0; i < max_scan; ++i) { - int coeff_idx = scan[i]; - non_zero_count[coeff_idx] += (dqcoeffs[coeff_idx] != 0); - } -} - -void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts, - TX_SIZE tx_size, TX_TYPE tx_type, - const tran_low_t *dqcoeffs, int max_scan) { - if (cm->use_adapt_scan && do_adapt_scan(tx_size, tx_type)) { - int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type); - uint32_t *non_zero_count = get_non_zero_counts(counts, tx_size, tx_type); - update_scan_count(scan, max_scan, dqcoeffs, non_zero_count); - ++counts->txb_count[tx_size][tx_type]; - } -} - -static int cmp_prob(const void *a, const void *b) { - return *(const uint32_t *)b > *(const uint32_t *)a ? 
1 : -1; -} - -void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob) { - // TODO(angiebird): check if we need is_inter here - const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0); - const int tx1d_wide = tx_size_wide[tx_size]; - const int tx1d_high = tx_size_high[tx_size]; - int r, c; - for (r = 0; r < tx1d_high; r++) { - for (c = 0; c < tx1d_wide; c++) { - const int idx = r * tx1d_wide + c; - const uint32_t mask_16 = ((1 << 16) - 1); - const uint32_t tie_breaker = ~((uint32_t)sc->iscan[idx]); - // prob[idx]: 16 bits dummy: 6 bits scan_idx: 10 bits - prob[idx] = (prob[idx] << 16) | (mask_16 & tie_breaker); - } - } -} - -void av1_update_neighbors(TX_SIZE tx_size, const int16_t *scan, - const int16_t *iscan, int16_t *neighbors) { - const int tx1d_wide = tx_size_wide[tx_size]; - const int tx1d_high = tx_size_high[tx_size]; - const int tx2d_size = tx_size_2d[tx_size]; - int scan_idx; - for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) { - const int coeff_idx = scan[scan_idx]; - const int r = coeff_idx / tx1d_wide; - const int c = coeff_idx % tx1d_wide; - const int nb_offset_r[5] = { -1, 0, -1, -1, 1 }; - const int nb_offset_c[5] = { 0, -1, -1, 1, -1 }; - const int nb_num = 5; - int nb_count = 0; - int nb_idx; - - for (nb_idx = 0; nb_idx < nb_num; ++nb_idx) { - if (nb_count < 2) { - int nb_r = r + nb_offset_r[nb_idx]; - int nb_c = c + nb_offset_c[nb_idx]; - int nb_coeff_idx = nb_r * tx1d_wide + nb_c; - int valid_pos = - nb_r >= 0 && nb_r < tx1d_high && nb_c >= 0 && nb_c < tx1d_wide; - if (valid_pos && iscan[nb_coeff_idx] < scan_idx) { - neighbors[scan_idx * MAX_NEIGHBORS + nb_count] = nb_coeff_idx; - ++nb_count; - } - } else { - break; - } - } - - if (nb_count == 1) { - neighbors[scan_idx * MAX_NEIGHBORS + 1] = - neighbors[scan_idx * MAX_NEIGHBORS + 0]; - } else if (nb_count == 0) { - neighbors[scan_idx * MAX_NEIGHBORS + 0] = scan[0]; - neighbors[scan_idx * MAX_NEIGHBORS + 1] = scan[0]; - } - } - neighbors[tx2d_size * MAX_NEIGHBORS + 0] = 
scan[0]; - neighbors[tx2d_size * MAX_NEIGHBORS + 1] = scan[0]; -} - -#if USE_LIMIT_SCAN_DISTANCE -typedef struct SCAN_NB_QUEUE { - int nb_ci_queue[COEFF_IDX_SIZE + 1]; - int pr_si_queue[COEFF_IDX_SIZE + 1]; - int size; - int start; - int end; -} SCAN_NB_QUEUE; - -static void assign_scan_idx(int16_t coeff_idx, int16_t *scan_idx, int tx_width, - int tx_height, int16_t *scan, int16_t *iscan, - int16_t *visit, SCAN_NB_QUEUE *queue) { - if (visit[coeff_idx] != 2) { - assert(*scan_idx < tx_width * tx_height); - scan[*scan_idx] = coeff_idx; - iscan[coeff_idx] = *scan_idx; - visit[coeff_idx] = 2; - int row = coeff_idx / tx_width; - int col = coeff_idx % tx_width; - int right_ci = coeff_idx + 1; - if (col + 1 < tx_width && visit[right_ci] == 0) { - visit[right_ci] = 1; - queue->pr_si_queue[queue->end] = *scan_idx; - queue->nb_ci_queue[queue->end] = right_ci; - queue->end = (queue->end + 1) % queue->size; - } - int down_ci = coeff_idx + tx_width; - if (row + 1 < tx_height && visit[down_ci] == 0) { - visit[down_ci] = 1; - queue->pr_si_queue[queue->end] = *scan_idx; - queue->nb_ci_queue[queue->end] = down_ci; - queue->end = (queue->end + 1) % queue->size; - } - ++(*scan_idx); - } -} -static void limit_nb_scan_distance(TX_SIZE tx_size, int16_t *scan, - int16_t *iscan) { - const int tx2d_size = tx_size_2d[tx_size]; - int16_t visit[COEFF_IDX_SIZE] = { 0 }; - int16_t org_scan[COEFF_IDX_SIZE]; - memcpy(org_scan, scan, tx2d_size * sizeof(*scan)); - const int tx_width = tx_size_wide[tx_size]; - const int tx_height = tx_size_high[tx_size]; - const int limit = 2 * AOMMAX(tx_width, tx_height); - SCAN_NB_QUEUE queue; - queue.size = tx2d_size; - queue.start = 0; - queue.end = 0; - int16_t new_si = 0; - for (int16_t si = 0; si < tx2d_size; ++si) { - while (queue.start != queue.end && - queue.pr_si_queue[queue.start] + limit <= new_si) { - int nb_ci = queue.nb_ci_queue[queue.start]; - assign_scan_idx(nb_ci, &new_si, tx_width, tx_height, scan, iscan, visit, - &queue); - queue.start = 
(queue.start + 1) % queue.size; - } - - int16_t ci = org_scan[si]; - assign_scan_idx(ci, &new_si, tx_width, tx_height, scan, iscan, visit, - &queue); - } - assert(new_si == tx2d_size); -} -#endif // USE_LIMIT_SCAN_DISTANCE - -#if USE_TOPOLOGICAL_SORT -void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type, - const uint32_t *non_zero_prob, int16_t *sort_order) { - const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0); - uint32_t temp[COEFF_IDX_SIZE]; - const int tx2d_size = tx_size_2d[tx_size]; - int sort_idx; - assert(tx2d_size <= COEFF_IDX_SIZE); - memcpy(temp, non_zero_prob, tx2d_size * sizeof(*non_zero_prob)); - av1_augment_prob(tx_size, tx_type, temp); - qsort(temp, tx2d_size, sizeof(*temp), cmp_prob); - for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) { - const int default_scan_idx = - (temp[sort_idx] & COEFF_IDX_MASK) ^ COEFF_IDX_MASK; - const int coeff_idx = sc->scan[default_scan_idx]; - sort_order[sort_idx] = coeff_idx; - } -} - -// topological sort -static void dfs_scan(int tx1d_size, int *scan_idx, int coeff_idx, int16_t *scan, - int16_t *iscan) { - const int r = coeff_idx / tx1d_size; - const int c = coeff_idx % tx1d_size; - - if (iscan[coeff_idx] != -1) return; - - if (r > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - tx1d_size, scan, iscan); - - if (c > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - 1, scan, iscan); - - scan[*scan_idx] = coeff_idx; - iscan[coeff_idx] = *scan_idx; - ++(*scan_idx); -} - -void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan, - int16_t *iscan) { - int coeff_idx; - int scan_idx; - int sort_idx; - const int tx1d_size = tx_size_wide[tx_size]; - const int tx2d_size = tx_size_2d[tx_size]; - - for (coeff_idx = 0; coeff_idx < tx2d_size; ++coeff_idx) { - iscan[coeff_idx] = -1; - } - - scan_idx = 0; - for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) { - coeff_idx = sort_order[sort_idx]; - dfs_scan(tx1d_size, &scan_idx, coeff_idx, scan, iscan); - } -} -#else - -static void 
filter_prob(TX_SIZE tx_size, uint32_t *prob) { - const int tx1d_wide = tx_size_wide[tx_size]; - const int tx1d_high = tx_size_high[tx_size]; - for (int r = tx1d_high - 1; r >= 0; --r) { - for (int c = tx1d_wide - 1; c >= 0; --c) { - int idx = r * tx1d_wide + c; - uint32_t v = prob[idx]; - if (r > 0 && prob[idx - tx1d_wide] < v) prob[idx - tx1d_wide] = v; - if (c > 0 && prob[idx - 1] < v) prob[idx - 1] = v; - } - } -} - -void av1_update_scan_order(TX_SIZE tx_size, TX_TYPE tx_type, - uint32_t *non_zero_prob, int16_t *scan, - int16_t *iscan) { - const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0); - uint32_t temp[COEFF_IDX_SIZE]; - const int tx2d_size = tx_size_2d[tx_size]; - int scan_idx; - assert(tx2d_size <= COEFF_IDX_SIZE); - memcpy(temp, non_zero_prob, tx2d_size * sizeof(*non_zero_prob)); - filter_prob(tx_size, temp); - av1_augment_prob(tx_size, tx_type, temp); - qsort(temp, tx2d_size, sizeof(*temp), cmp_prob); - for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) { - const int default_scan_idx = - (temp[scan_idx] & COEFF_IDX_MASK) ^ COEFF_IDX_MASK; - const int coeff_idx = sc->scan[default_scan_idx]; - scan[scan_idx] = coeff_idx; - iscan[coeff_idx] = scan_idx; - } -} -#endif - -static void update_scan_order_facade(AV1_COMMON *cm, TX_SIZE tx_size, - TX_TYPE tx_type, int use_curr_frame) { -#if USE_TOPOLOGICAL_SORT - int16_t sort_order[COEFF_IDX_SIZE]; -#endif - uint32_t *non_zero_prob; - if (use_curr_frame) - non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type); - else - non_zero_prob = get_non_zero_prob(cm->pre_fc, tx_size, tx_type); - int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type); - int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type); - int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type); - assert(tx_size_2d[tx_size] <= COEFF_IDX_SIZE); -#if USE_TOPOLOGICAL_SORT - av1_update_sort_order(tx_size, tx_type, non_zero_prob, sort_order); - av1_update_scan_order(tx_size, sort_order, scan, iscan); -#else - av1_update_scan_order(tx_size, 
tx_type, non_zero_prob, scan, iscan); -#endif -#if USE_LIMIT_SCAN_DISTANCE - limit_nb_scan_distance(tx_size, scan, iscan); -#endif // USE_LIMIT_SCAN_DISTANCE - av1_update_neighbors(tx_size, scan, iscan, nb); -} - -static void update_eob_threshold(AV1_COMMON *cm, TX_SIZE tx_size, - TX_TYPE tx_type) { - int i, row, col, row_limit, col_limit, cal_idx = 0; - const int tx_width = tx_size_wide[tx_size]; - const int tx_height = tx_size_high[tx_size]; - - row_limit = tx_width >> 1; - col_limit = tx_height >> 1; - - if (tx_width >= 8 && tx_height >= 8) { - SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type]; - int16_t *threshold = &cm->fc->eob_threshold[tx_size][tx_type][0]; - const int tx2d_size = tx_size_2d[tx_size]; - - while (cal_idx < EOB_THRESHOLD_NUM) { - for (i = 0; i < tx2d_size; ++i) { - row = sc->scan[i] / tx_height; - col = sc->scan[i] % tx_width; - if (row >= row_limit || col >= col_limit) break; - } - row_limit >>= 1; - col_limit >>= 1; - threshold[cal_idx] = i; - cal_idx++; - } - } -} - -void av1_init_scan_order(AV1_COMMON *cm) { - TX_SIZE tx_size; - TX_TYPE tx_type; - for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) { -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (tx_size > TX_32X16) continue; -#else - if (tx_size >= TX_SIZES) continue; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { - if (do_adapt_scan(tx_size, tx_type)) { - uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type); - const int tx2d_size = tx_size_2d[tx_size]; - int i; - SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type]; - for (i = 0; i < tx2d_size; ++i) { - non_zero_prob[i] = (1 << ADAPT_SCAN_PROB_PRECISION) / - 2; // init non_zero_prob to 0.5 - } - update_scan_order_facade(cm, tx_size, tx_type, 1); - sc->scan = get_adapt_scan(cm->fc, tx_size, tx_type); - sc->iscan = get_adapt_iscan(cm->fc, tx_size, tx_type); - sc->neighbors = get_adapt_nb(cm->fc, tx_size, tx_type); - update_eob_threshold(cm, 
tx_size, tx_type); - } - } - } -} - -void av1_adapt_scan_order(AV1_COMMON *cm) { - if (cm->use_adapt_scan) { - TX_SIZE tx_size; -#if CACHE_SCAN_PROB - int use_curr_frame = 0; -#else // CACHE_SCAN_PROB - int use_curr_frame = 1; -#endif // CACHE_SCAN_PROB - - for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) { -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (tx_size > TX_32X16) continue; -#else - if (tx_size >= TX_SIZES) continue; -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - TX_TYPE tx_type; - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { - if (do_adapt_scan(tx_size, tx_type)) { - update_scan_prob(cm, tx_size, tx_type, ADAPT_SCAN_UPDATE_RATE); - update_scan_order_facade(cm, tx_size, tx_type, use_curr_frame); - update_eob_threshold(cm, tx_size, tx_type); - } - } - } - } -} - -void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd) { - xd->eob_threshold_md = (const EobThresholdMD *)cm->fc->eob_threshold; -} -#endif // CONFIG_ADAPT_SCAN diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h index 82d2e917f..c5cebc135 100644 --- a/third_party/aom/av1/common/scan.h +++ b/third_party/aom/av1/common/scan.h @@ -25,51 +25,18 @@ extern "C" { #define MAX_NEIGHBORS 2 -extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES]; -extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES]; -extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES]; - -#if CONFIG_ADAPT_SCAN -#define USE_2X2_PROB 1 -#define USE_TOPOLOGICAL_SORT 0 -#define USE_LIMIT_SCAN_DISTANCE 0 -void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts, - TX_SIZE tx_size, TX_TYPE tx_type, - const tran_low_t *dqcoeffs, int max_scan); - -// embed r + c and coeff_idx info with nonzero probabilities. 
When sorting the -// nonzero probabilities, if there is a tie, the coefficient with smaller r + c -// will be scanned first -void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob); +typedef enum SCAN_MODE { + SCAN_MODE_ZIG_ZAG, + SCAN_MODE_COL_DIAG, + SCAN_MODE_ROW_DIAG, + SCAN_MODE_COL_1D, + SCAN_MODE_ROW_1D, + SCAN_MODES +} SCAN_MODE; -#if USE_TOPOLOGICAL_SORT -// apply quick sort on nonzero probabilities to obtain a sort order -void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type, - const uint32_t *non_zero_prob, int16_t *sort_order); - -// apply topological sort on the nonzero probabilities sorting order to -// guarantee each to-be-scanned coefficient's upper and left coefficient will be -// scanned before the to-be-scanned coefficient. -void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan, - int16_t *iscan); -#else // USE_TOPOLOGICAL_SORT -void av1_update_scan_order(TX_SIZE tx_size, TX_TYPE tx_type, - uint32_t *non_zero_prob, int16_t *scan, - int16_t *iscan); -#endif // USE_TOPOLOGICAL_SORT +extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES]; +extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; -// For each coeff_idx in scan[], update its above and left neighbors in -// neighbors[] accordingly. 
-void av1_update_neighbors(TX_SIZE tx_size, const int16_t *scan, - const int16_t *iscan, int16_t *neighbors); -void av1_init_scan_order(AV1_COMMON *cm); -void av1_adapt_scan_order(AV1_COMMON *cm); -#if USE_2X2_PROB -void av1_down_sample_scan_count(uint32_t *non_zero_count_ds, - const uint32_t *non_zero_count, - TX_SIZE tx_size); -#endif // USE_2X2_PROB -#endif // CONFIG_ADAPT_SCAN void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd); static INLINE int get_coef_context(const int16_t *neighbors, @@ -80,52 +47,12 @@ static INLINE int get_coef_context(const int16_t *neighbors, } static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, - TX_TYPE tx_type, - int is_inter) { -#if CONFIG_EXT_TX || CONFIG_VAR_TX - return is_inter ? &av1_inter_scan_orders[tx_size][tx_type] - : &av1_intra_scan_orders[tx_size][tx_type]; -#else - (void)is_inter; - return &av1_intra_scan_orders[tx_size][tx_type]; -#endif // CONFIG_EXT_TX + TX_TYPE tx_type) { + return &av1_scan_orders[tx_size][tx_type]; } -static INLINE int do_adapt_scan(TX_SIZE tx_size, TX_TYPE tx_type) { - (void)tx_size; -#if CONFIG_EXT_TX - if (tx_size_2d[tx_size] >= 1024 && tx_type != DCT_DCT) return 0; - return tx_type < IDTX; -#else - (void)tx_type; - return 1; -#endif -} - -static INLINE const SCAN_ORDER *get_scan(const AV1_COMMON *cm, TX_SIZE tx_size, - TX_TYPE tx_type, - const MB_MODE_INFO *mbmi) { -#if CONFIG_MRC_TX - // use the DCT_DCT scan order for MRC_DCT for now - if (tx_type == MRC_DCT) tx_type = DCT_DCT; -#endif // CONFIG_MRC_TX -#if CONFIG_LGT_FROM_PRED - if (mbmi->use_lgt) tx_type = DCT_DCT; -#endif - const int is_inter = is_inter_block(mbmi); -#if CONFIG_ADAPT_SCAN - (void)mbmi; - (void)is_inter; -#if CONFIG_EXT_TX - if (!do_adapt_scan(tx_size, tx_type)) - return get_default_scan(tx_size, tx_type, is_inter); - else -#endif // CONFIG_EXT_TX - return &cm->fc->sc[tx_size][tx_type]; -#else // CONFIG_ADAPT_SCAN - (void)cm; - return get_default_scan(tx_size, tx_type, is_inter); -#endif // 
CONFIG_ADAPT_SCAN +static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { + return get_default_scan(tx_size, tx_type); } #ifdef __cplusplus diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c index 4603026bd..cd189ad76 100644 --- a/third_party/aom/av1/common/seg_common.c +++ b/third_party/aom/av1/common/seg_common.c @@ -16,18 +16,11 @@ #include "av1/common/seg_common.h" #include "av1/common/quant_common.h" -#if CONFIG_LOOPFILTER_LEVEL -static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 0, 0 }; +static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0 }; static const int seg_feature_data_max[SEG_LVL_MAX] = { - MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 0 + MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0 }; -#else -static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; - -static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, MAX_LOOP_FILTER, 3, - 0 }; -#endif // CONFIG_LOOPFILTER_LEVEL // These functions provide access to new segment level features. 
// Eventually these function may be "optimized out" but for the moment, @@ -39,6 +32,19 @@ void av1_clearall_segfeatures(struct segmentation *seg) { av1_zero(seg->feature_mask); } +void calculate_segdata(struct segmentation *seg) { + seg->segid_preskip = 0; + seg->last_active_segid = 0; + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { + if (seg->feature_mask[i] & (1 << j)) { + seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME); + seg->last_active_segid = i; + } + } + } +} + void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_mask[segment_id] |= 1 << feature_id; @@ -52,6 +58,17 @@ int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } +// The 'seg_data' given for each segment can be either deltas (from the default +// value chosen for the frame) or absolute values. +// +// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for +// SEGMENT_ALT_LF) +// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for +// SEGMENT_ALT_LF) +// +// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use +// the absolute values given). + void av1_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { if (seg_data < 0) { @@ -64,8 +81,4 @@ void av1_set_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = seg_data; } -const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = { - 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7 -}; - // TBD? 
Functions to read and write segment data with range / validity checking diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h index 6d16aedb6..c851d65fd 100644 --- a/third_party/aom/av1/common/seg_common.h +++ b/third_party/aom/av1/common/seg_common.h @@ -18,15 +18,12 @@ extern "C" { #endif -#define SEGMENT_DELTADATA 0 -#define SEGMENT_ABSDATA 1 - #define MAX_SEGMENTS 8 #define SEG_TREE_PROBS (MAX_SEGMENTS - 1) -#define PREDICTION_PROBS 3 +#define SEG_TEMPORAL_PRED_CTXS 3 +#define SPATIAL_PREDICTION_PROBS 3 -#if CONFIG_LOOPFILTER_LEVEL typedef enum { SEG_LVL_ALT_Q, // Use alternate Quantizer .... SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical @@ -35,47 +32,31 @@ typedef enum { SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane SEG_LVL_REF_FRAME, // Optional Segment reference frame SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode -#if CONFIG_SEGMENT_ZEROMV - SEG_LVL_ZEROMV, - SEG_LVL_MAX -#else + SEG_LVL_GLOBALMV, SEG_LVL_MAX -#endif -} SEG_LVL_FEATURES; -#else // CONFIG_LOOPFILTER_LEVEL -// Segment level features. -typedef enum { - SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... - SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... - SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame - SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode -#if CONFIG_SEGMENT_ZEROMV - SEG_LVL_ZEROMV = 4, - SEG_LVL_MAX = 5 -#else - SEG_LVL_MAX = 4 -#endif } SEG_LVL_FEATURES; -#endif // CONFIG_LOOPFILTER_LEVEL struct segmentation { uint8_t enabled; uint8_t update_map; uint8_t update_data; - uint8_t abs_delta; uint8_t temporal_update; int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; unsigned int feature_mask[MAX_SEGMENTS]; + int last_active_segid; // The highest numbered segment id that has some + // enabled feature. + uint8_t segid_preskip; // Whether the segment id will be read before the + // skip syntax element. + // 1: the segment id will be read first. 
+ // 0: the skip syntax element will be read first. }; struct segmentation_probs { - aom_prob tree_probs[SEG_TREE_PROBS]; aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)]; - aom_prob pred_probs[PREDICTION_PROBS]; -#if CONFIG_NEW_MULTISYMBOL - aom_cdf_prob pred_cdf[PREDICTION_PROBS][CDF_SIZE(2)]; -#endif + aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)]; + aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS] + [CDF_SIZE(MAX_SEGMENTS)]; }; static INLINE int segfeature_active(const struct segmentation *seg, @@ -84,11 +65,26 @@ static INLINE int segfeature_active(const struct segmentation *seg, return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); } +static INLINE void segfeatures_copy(struct segmentation *dst, + const struct segmentation *src) { + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + dst->feature_mask[i] = src->feature_mask[i]; + for (j = 0; j < SEG_LVL_MAX; j++) { + dst->feature_data[i][j] = src->feature_data[i][j]; + } + } + dst->segid_preskip = src->segid_preskip; + dst->last_active_segid = src->last_active_segid; +} + void av1_clearall_segfeatures(struct segmentation *seg); void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); +void calculate_segdata(struct segmentation *seg); + int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); @@ -101,8 +97,6 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id, return seg->feature_data[segment_id][feature_id]; } -extern const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c index 4c9fa6962..3fa998a91 100644 --- a/third_party/aom/av1/common/thread_common.c +++ b/third_party/aom/av1/common/thread_common.c @@ -9,40 +9,158 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" +#include "av1/common/av1_loopfilter.h" #include "av1/common/entropymode.h" #include "av1/common/thread_common.h" #include "av1/common/reconinter.h" +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +static INLINE int get_lr_sync_range(int width) { +#if 0 + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; + return 1; +#endif +} + +// Allocate memory for lf row synchronization +static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; #if CONFIG_MULTITHREAD -static INLINE void mutex_lock(pthread_mutex_t *const mutex) { - const int kMaxTryLocks = 4000; - int locked = 0; - int i; + { + int i, j; + + for (j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], + aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); + if (lf_sync->mutex_[j]) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); + } + } - for (i = 0; i < kMaxTryLocks; ++i) { - if (!pthread_mutex_trylock(mutex)) { - locked = 1; - break; + CHECK_MEM_ERROR(cm, lf_sync->cond_[j], + aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); + if (lf_sync->cond_[j]) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lf_sync->job_mutex, + aom_malloc(sizeof(*(lf_sync->job_mutex)))); + if (lf_sync->job_mutex) { + 
pthread_mutex_init(lf_sync->job_mutex, NULL); } } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); + lf_sync->num_workers = num_workers; - if (!locked) pthread_mutex_lock(mutex); + for (int j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); + } + CHECK_MEM_ERROR( + cm, lf_sync->job_queue, + aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); } + +// Deallocate lf synchronization related mutex and data +void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { + if (lf_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lf_sync->mutex_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[j][i]); + } + aom_free(lf_sync->mutex_[j]); + } + if (lf_sync->cond_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[j][i]); + } + aom_free(lf_sync->cond_[j]); + } + } + if (lf_sync->job_mutex != NULL) { + pthread_mutex_destroy(lf_sync->job_mutex); + aom_free(lf_sync->job_mutex); + } #endif // CONFIG_MULTITHREAD + aom_free(lf_sync->lfdata); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lf_sync->cur_sb_col[j]); + } + + aom_free(lf_sync->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
+ av1_zero(*lf_sync); + } +} + +static void loop_filter_data_reset(LFWorkerData *lf_data, + YV12_BUFFER_CONFIG *frame_buffer, + struct AV1Common *cm, MACROBLOCKD *xd) { + struct macroblockd_plane *pd = xd->plane; + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->xd = xd; + for (int i = 0; i < MAX_MB_PLANE; i++) { + memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst)); + lf_data->planes[i].subsampling_x = pd[i].subsampling_x; + lf_data->planes[i].subsampling_y = pd[i].subsampling_y; + } +} -static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) { +static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, + int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; - mutex_lock(mutex); + pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); - while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -50,11 +168,12 @@ static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) { (void)lf_sync; (void)r; (void)c; + (void)plane; #endif // CONFIG_MULTITHREAD } static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, - const int sb_cols) { + const int sb_cols, int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; int cur; @@ -69,321 +188,156 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, } if (sig) { - mutex_lock(&lf_sync->mutex_[r]); + pthread_mutex_lock(&lf_sync->mutex_[plane][r]); - lf_sync->cur_sb_col[r] = cur; + lf_sync->cur_sb_col[plane][r] = cur; - pthread_cond_signal(&lf_sync->cond_[r]); - pthread_mutex_unlock(&lf_sync->mutex_[r]); + pthread_cond_broadcast(&lf_sync->cond_[plane][r]); + 
pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); } #else (void)lf_sync; (void)r; (void)c; (void)sb_cols; + (void)plane; #endif // CONFIG_MULTITHREAD } -#if !CONFIG_EXT_PARTITION_TYPES -static INLINE enum lf_path get_loop_filter_path( - int y_only, struct macroblockd_plane *planes) { - if (y_only) - return LF_PATH_444; - else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) - return LF_PATH_420; - else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) - return LF_PATH_444; - else - return LF_PATH_SLOW; -} +static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, + int stop, int plane_start, int plane_end) { + int mi_row, plane, dir; + AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; + lf_sync->jobs_enqueued = 0; + lf_sync->jobs_dequeued = 0; -static INLINE void loop_filter_block_plane_ver( - AV1_COMMON *cm, struct macroblockd_plane *planes, int plane, - MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, - LOOP_FILTER_MASK *lfm) { - if (plane == 0) { - av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm); - } else { - switch (path) { - case LF_PATH_420: - av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm); - break; - case LF_PATH_444: - av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm); - break; - case LF_PATH_SLOW: - av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row, - mi_col, plane); + for (dir = 0; dir < 2; dir++) { + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + lf_job_queue->mi_row = mi_row; + lf_job_queue->plane = plane; + lf_job_queue->dir = dir; + lf_job_queue++; + lf_sync->jobs_enqueued++; + } } } } -static INLINE void loop_filter_block_plane_hor( - AV1_COMMON *cm, 
struct macroblockd_plane *planes, int plane, - MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, - LOOP_FILTER_MASK *lfm) { - if (plane == 0) { - av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm); - } else { - switch (path) { - case LF_PATH_420: - av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm); - break; - case LF_PATH_444: - av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm); - break; - case LF_PATH_SLOW: - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row, - mi_col, plane); - break; - } +AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { + AV1LfMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->job_mutex); + + if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) { + cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued; + lf_sync->jobs_dequeued++; } -} -#endif -// Row-based multi-threaded loopfilter hook -#if CONFIG_PARALLEL_DEBLOCKING -static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { - const int num_planes = lf_data->y_only ? 
1 : MAX_MB_PLANE; - int mi_row, mi_col; -#if !CONFIG_EXT_PARTITION_TYPES - enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); -#endif - for (mi_row = lf_data->start; mi_row < lf_data->stop; - mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { - MODE_INFO **const mi = - lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; - - for (mi_col = 0; mi_col < lf_data->cm->mi_cols; - mi_col += lf_data->cm->mib_size) { - LOOP_FILTER_MASK lfm; - int plane; - - av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, - lf_data->frame_buffer, mi_row, mi_col); - av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, - lf_data->cm->mi_stride, &lfm); - -#if CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) - av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); -#else - for (plane = 0; plane < num_planes; ++plane) - loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); + pthread_mutex_unlock(lf_sync->job_mutex); +#else + (void)lf_sync; #endif - } - } - return 1; + + return cur_job_info; } -static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { - const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; +// Implement row loopfiltering for each thread. 
+static INLINE void thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { const int sb_cols = - mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; - int mi_row, mi_col; -#if !CONFIG_EXT_PARTITION_TYPES - enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); -#endif - - for (mi_row = lf_data->start; mi_row < lf_data->stop; - mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { - MODE_INFO **const mi = - lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; - - for (mi_col = 0; mi_col < lf_data->cm->mi_cols; - mi_col += lf_data->cm->mib_size) { - const int r = mi_row >> lf_data->cm->mib_size_log2; - const int c = mi_col >> lf_data->cm->mib_size_log2; - LOOP_FILTER_MASK lfm; - int plane; - - // TODO(wenhao.zhang@intel.com): For better parallelization, reorder - // the outer loop to column-based and remove the synchronizations here. 
- sync_read(lf_sync, r, c); - - av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, - lf_data->frame_buffer, mi_row, mi_col); - av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, - lf_data->cm->mi_stride, &lfm); -#if CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) - av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); -#else - for (plane = 0; plane < num_planes; ++plane) - loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); -#endif - sync_write(lf_sync, r, c, sb_cols); + ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MAX_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + } + } else { + break; } } - return 1; } -#else // CONFIG_PARALLEL_DEBLOCKING + +// Row-based 
multi-threaded loopfilter hook static int loop_filter_row_worker(AV1LfSync *const lf_sync, LFWorkerData *const lf_data) { - const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; - const int sb_cols = - mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; - int mi_row, mi_col; -#if !CONFIG_EXT_PARTITION_TYPES - enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); -#endif // !CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_EXT_PARTITION - printf( - "STOPPING: This code has not been modified to work with the " - "extended coding unit size experiment"); - exit(EXIT_FAILURE); -#endif // CONFIG_EXT_PARTITION - - for (mi_row = lf_data->start; mi_row < lf_data->stop; - mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { - MODE_INFO **const mi = - lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; - - for (mi_col = 0; mi_col < lf_data->cm->mi_cols; - mi_col += lf_data->cm->mib_size) { - const int r = mi_row >> lf_data->cm->mib_size_log2; - const int c = mi_col >> lf_data->cm->mib_size_log2; -#if !CONFIG_EXT_PARTITION_TYPES - LOOP_FILTER_MASK lfm; -#endif - int plane; - - sync_read(lf_sync, r, c); - - av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, - lf_data->frame_buffer, mi_row, mi_col); -#if CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) { - av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); - av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], - mi + mi_col, mi_row, mi_col, plane); - } -#else - av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, - lf_data->cm->mi_stride, &lfm); - - for (plane = 0; plane < num_planes; ++plane) { - loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); - loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, - mi + mi_col, mi_row, mi_col, path, &lfm); - } -#endif // CONFIG_EXT_PARTITION_TYPES - 
sync_write(lf_sync, r, c, sb_cols); - } - } + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->xd, lf_sync); return 1; } -#endif // CONFIG_PARALLEL_DEBLOCKING static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - struct macroblockd_plane *planes, int start, - int stop, int y_only, AVxWorker *workers, - int nworkers, AV1LfSync *lf_sync) { -#if CONFIG_EXT_PARTITION - printf( - "STOPPING: This code has not been modified to work with the " - "extended coding unit size experiment"); - exit(EXIT_FAILURE); -#endif // CONFIG_EXT_PARTITION - + MACROBLOCKD *xd, int start, int stop, + int plane_start, int plane_end, + AVxWorker *workers, int nworkers, + AV1LfSync *lf_sync) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); // Number of superblock rows and cols - const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2; - // Decoder may allocate more threads than number of tiles based on user's - // input. - const int tile_cols = cm->tile_cols; - const int num_workers = AOMMIN(nworkers, tile_cols); + const int sb_rows = + ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + const int num_workers = nworkers; int i; if (!lf_sync->sync_range || sb_rows != lf_sync->rows || num_workers > lf_sync->num_workers) { av1_loop_filter_dealloc(lf_sync); - av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } -// Set up loopfilter thread data. -// The decoder is capping num_workers because it has been observed that using -// more threads on the loopfilter than there are cores will hurt performance -// on Android. This is because the system will only schedule the tile decode -// workers on cores equal to the number of tile columns. Then if the decoder -// tries to use more threads for the loopfilter, it will hurt performance -// because of contention. 
If the multithreading code changes in the future -// then the number of workers used by the loopfilter should be revisited. - -#if CONFIG_PARALLEL_DEBLOCKING // Initialize cur_sb_col to -1 for all SB rows. - memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); - - // Filter all the vertical edges in the whole frame - for (i = 0; i < num_workers; ++i) { - AVxWorker *const worker = &workers[i]; - LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - - worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker; - worker->data1 = lf_sync; - worker->data2 = lf_data; - - // Loopfilter data - av1_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = start + i * cm->mib_size; - lf_data->stop = stop; - lf_data->y_only = y_only; - - // Start loopfiltering - if (i == num_workers - 1) { - winterface->execute(worker); - } else { - winterface->launch(worker); - } - } - - // Wait till all rows are finished - for (i = 0; i < num_workers; ++i) { - winterface->sync(&workers[i]); - } - - memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); - // Filter all the horizontal edges in the whole frame - for (i = 0; i < num_workers; ++i) { - AVxWorker *const worker = &workers[i]; - LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - - worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker; - worker->data1 = lf_sync; - worker->data2 = lf_data; - - // Loopfilter data - av1_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = start + i * cm->mib_size; - lf_data->stop = stop; - lf_data->y_only = y_only; - - // Start loopfiltering - if (i == num_workers - 1) { - winterface->execute(worker); - } else { - winterface->launch(worker); - } + for (i = 0; i < MAX_MB_PLANE; i++) { + memset(lf_sync->cur_sb_col[i], -1, + sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); } - // Wait till all rows are finished - for (i = 0; i < num_workers; ++i) { - winterface->sync(&workers[i]); - } -#else // CONFIG_PARALLEL_DEBLOCKING - // Initialize 
cur_sb_col to -1 for all SB rows. - memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end); + // Set up loopfilter thread data. for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; @@ -393,10 +347,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, worker->data2 = lf_data; // Loopfilter data - av1_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = start + i * cm->mib_size; - lf_data->stop = stop; - lf_data->y_only = y_only; + loop_filter_data_reset(lf_data, frame, cm, xd); // Start loopfiltering if (i == num_workers - 1) { @@ -410,21 +361,14 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, for (i = 0; i < num_workers; ++i) { winterface->sync(&workers[i]); } -#endif // CONFIG_PARALLEL_DEBLOCKING } void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - struct macroblockd_plane *planes, - int frame_filter_level, -#if CONFIG_LOOPFILTER_LEVEL - int frame_filter_level_r, -#endif - int y_only, int partial_frame, AVxWorker *workers, + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame, AVxWorker *workers, int num_workers, AV1LfSync *lf_sync) { int start_mi_row, end_mi_row, mi_rows_to_filter; - if (!frame_filter_level) return; - start_mi_row = 0; mi_rows_to_filter = cm->mi_rows; if (partial_frame && cm->mi_rows > 8) { @@ -433,103 +377,406 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; -#if CONFIG_LOOPFILTER_LEVEL - av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level_r, - y_only); + av1_loop_filter_frame_init(cm, plane_start, plane_end); + + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, workers, num_workers, lf_sync); +} + 
+static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } #else - av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level); -#endif // CONFIG_LOOPFILTER_LEVEL - loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only, - workers, num_workers, lf_sync); + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD } -// Set up nsync by width. -static INLINE int get_sync_range(int width) { - // nsync numbers are picked by testing. For example, for 4k - // video, using 4 gives best performance. - if (width < 640) - return 1; - else if (width <= 1280) - return 2; - else if (width <= 4096) - return 4; - else - return 8; +static INLINE void lr_sync_write(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
+ int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); + + loop_res_sync->cur_sb_col[plane][r] = cur; + + pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); + pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD } -// Allocate memory for lf row synchronization -void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, - int width, int num_workers) { - lf_sync->rows = rows; +// Allocate memory for loop restoration row synchronization +static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width) { + lr_sync->rows = num_rows_lr; + lr_sync->num_planes = num_planes; #if CONFIG_MULTITHREAD { - int i; + int i, j; + + for (j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], + aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); + if (lr_sync->mutex_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); + } + } - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - aom_malloc(sizeof(*lf_sync->mutex_) * rows)); - if (lf_sync->mutex_) { - for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + CHECK_MEM_ERROR(cm, lr_sync->cond_[j], + aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); + if (lr_sync->cond_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_cond_init(&lr_sync->cond_[j][i], NULL); + } } } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - aom_malloc(sizeof(*lf_sync->cond_) * rows)); - if (lf_sync->cond_) { - for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); - } + CHECK_MEM_ERROR(cm, lr_sync->job_mutex, + aom_malloc(sizeof(*(lr_sync->job_mutex)))); + if (lr_sync->job_mutex) { + pthread_mutex_init(lr_sync->job_mutex, NULL); } 
} #endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, + aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata)))); - CHECK_MEM_ERROR(cm, lf_sync->lfdata, - aom_malloc(num_workers * sizeof(*lf_sync->lfdata))); - lf_sync->num_workers = num_workers; + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + if (worker_idx < num_workers - 1) { + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, + aom_malloc(sizeof(RestorationLineBuffers))); + + } else { + lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; + } + } - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, - aom_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + lr_sync->num_workers = num_workers; + for (int j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR( + cm, lr_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); + } + CHECK_MEM_ERROR( + cm, lr_sync->job_queue, + aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes)); // Set up nsync. 
- lf_sync->sync_range = get_sync_range(width); + lr_sync->sync_range = get_lr_sync_range(width); } -// Deallocate lf synchronization related mutex and data -void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { - if (lf_sync != NULL) { +// Deallocate loop restoration synchronization related mutex and data +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) { + if (lr_sync != NULL) { + int j; #if CONFIG_MULTITHREAD int i; - - if (lf_sync->mutex_ != NULL) { - for (i = 0; i < lf_sync->rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lr_sync->mutex_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_mutex_destroy(&lr_sync->mutex_[j][i]); + } + aom_free(lr_sync->mutex_[j]); } - aom_free(lf_sync->mutex_); - } - if (lf_sync->cond_ != NULL) { - for (i = 0; i < lf_sync->rows; ++i) { - pthread_cond_destroy(&lf_sync->cond_[i]); + if (lr_sync->cond_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_cond_destroy(&lr_sync->cond_[j][i]); + } + aom_free(lr_sync->cond_[j]); } - aom_free(lf_sync->cond_); + } + if (lr_sync->job_mutex != NULL) { + pthread_mutex_destroy(lr_sync->job_mutex); + aom_free(lr_sync->job_mutex); } #endif // CONFIG_MULTITHREAD - aom_free(lf_sync->lfdata); - aom_free(lf_sync->cur_sb_col); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lr_sync->cur_sb_col[j]); + } + + aom_free(lr_sync->job_queue); + + if (lr_sync->lrworkerdata) { + for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) { + LRWorkerData *const workerdata_data = + lr_sync->lrworkerdata + worker_idx; + + aom_free(workerdata_data->rst_tmpbuf); + aom_free(workerdata_data->rlbs); + } + aom_free(lr_sync->lrworkerdata); + } + // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. 
- av1_zero(*lf_sync); + av1_zero(*lr_sync); + } +} + +static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, + AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; + lr_sync->jobs_enqueued = 0; + lr_sync->jobs_dequeued = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->subsampling_y; + + AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + + const int tile_h = tile_rect.bottom - tile_rect.top; + const int ext_size = unit_size * 3 / 2; + + int y0 = 0, i = 0; + while (y0 < tile_h) { + int remaining_h = tile_h - y0; + int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = tile_rect.top + y0; + limits.v_end = tile_rect.top + y0 + h; + assert(limits.v_end <= tile_rect.bottom); + // Offset the tile upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset); + if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset; + + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top; + } + if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) { + assert(limits.v_end == tile_rect.bottom); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom); + } + lr_job_counter[i & 1]++; + lr_sync->jobs_enqueued++; + + y0 += h; + ++i; + } + } +} + +AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { + AV1LrMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lr_sync->job_mutex); + + if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { + cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; + lr_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lr_sync->job_mutex); +#else + 
(void)lr_sync; +#endif + + return cur_job_info; +} + +// Implement row loop restoration for each thread. +static int loop_restoration_row_worker(AV1LrSync *const lr_sync, + LRWorkerData *lrworkerdata) { + AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + int lr_unit_row; + int plane; + const int tile_row = LR_TILE_ROW; + const int tile_col = LR_TILE_COL; + const int tile_cols = LR_TILE_COLS; + const int tile_idx = tile_col + tile_row * tile_cols; + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { + aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v + }; + + while (1) { + AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); + if (cur_job_info != NULL) { + RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; + limits.v_start = cur_job_info->v_start; + limits.v_end = cur_job_info->v_end; + lr_unit_row = cur_job_info->lr_unit_row; + plane = cur_job_info->plane; + const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? 
lr_sync_write + : av1_lr_sync_write_dummy; + + av1_foreach_rest_unit_in_row( + &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row, + ctxt[plane].rsi->restoration_unit_size, unit_idx0, + ctxt[plane].rsi->horz_units_per_tile, + ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync); + + copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left, + ctxt[plane].tile_rect.right, cur_job_info->v_copy_start, + cur_job_info->v_copy_end); + } else { + break; + } + } + return 1; +} + +static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, + AVxWorker *workers, int nworkers, + AV1LrSync *lr_sync, AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_rows_lr = 0; + + for (int plane = 0; plane < num_planes; plane++) { + const AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + + num_rows_lr = + AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h)); + } + + const int num_workers = nworkers; + int i; + assert(MAX_MB_PLANE == 3); + + if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows || + num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync, num_workers); + loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, + cm->width); + } + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < num_planes; i++) { + memset(lr_sync->cur_sb_col[i], -1, + sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); + } + + enqueue_lr_jobs(lr_sync, lr_ctxt, cm); + + // Set up looprestoration thread data. 
+ for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; + worker->hook = (AVxWorkerHook)loop_restoration_row_worker; + worker->data1 = lr_sync; + worker->data2 = &lr_sync->lrworkerdata[i]; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); } } -// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' -// members, so we treat it as an array, and sum over the whole length. -void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, - FRAME_COUNTS *counts) { - unsigned int *const acc = (unsigned int *)acc_counts; - const unsigned int *const cnt = (unsigned int *)counts; +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, void *lr_ctxt) { + assert(!cm->all_lossless); + + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; - const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); - unsigned int i; + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); - for (i = 0; i < n_counts; i++) acc[i] += cnt[i]; + foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, + cm); } diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h index 7eddc662c..4b0d5d2b8 100644 --- a/third_party/aom/av1/common/thread_common.h +++ b/third_party/aom/av1/common/thread_common.h @@ -11,7 +11,9 @@ #ifndef AV1_COMMON_LOOPFILTER_THREAD_H_ #define AV1_COMMON_LOOPFILTER_THREAD_H_ -#include "./aom_config.h" + +#include "config/aom_config.h" + #include "av1/common/av1_loopfilter.h" #include "aom_util/aom_thread.h" @@ -20,16 +22,21 @@ 
extern "C" { #endif struct AV1Common; -struct FRAME_COUNTS; + +typedef struct AV1LfMTInfo { + int mi_row; + int plane; + int dir; +} AV1LfMTInfo; // Loopfilter row synchronization typedef struct AV1LfSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; #endif // Allocate memory to store the loop-filtered superblock index in each row. - int *cur_sb_col; + int *cur_sb_col[MAX_MB_PLANE]; // The optimal sync_range for different resolution and platform should be // determined by testing. Currently, it is chosen to be a power-of-2 number. int sync_range; @@ -38,27 +45,72 @@ typedef struct AV1LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + AV1LfMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; } AV1LfSync; -// Allocate memory for loopfilter row synchronization. -void av1_loop_filter_alloc(AV1LfSync *lf_sync, struct AV1Common *cm, int rows, - int width, int num_workers); +typedef struct AV1LrMTInfo { + int v_start; + int v_end; + int lr_unit_row; + int plane; + int sync_mode; + int v_copy_start; + int v_copy_end; +} AV1LrMTInfo; + +typedef struct LoopRestorationWorkerData { + int32_t *rst_tmpbuf; + void *rlbs; + void *lr_ctxt; +} LRWorkerData; + +// Looprestoration row synchronization +typedef struct AV1LrSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-restoration block index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. 
+ int sync_range; + int rows; + int num_planes; + + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + // Row-based parallel loopfilter data + LRWorkerData *lrworkerdata; + + AV1LrMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; +} AV1LrSync; // Deallocate loopfilter synchronization related mutex and data. void av1_loop_filter_dealloc(AV1LfSync *lf_sync); -// Multi-threaded loopfilter that uses the tile threads. void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd_plane *planes, - int frame_filter_level, -#if CONFIG_LOOPFILTER_LEVEL - int frame_filter_level_r, -#endif - int y_only, int partial_frame, AVxWorker *workers, - int num_workers, AV1LfSync *lf_sync); - -void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, - struct FRAME_COUNTS *counts); + struct macroblockd *mbd, int plane_start, + int plane_end, int partial_frame, + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync); +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, AVxWorker *workers, + int num_workers, AV1LrSync *lr_sync, + void *lr_ctxt); +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c index 507a01265..9a43ab29a 100644 --- a/third_party/aom/av1/common/tile_common.c +++ b/third_party/aom/av1/common/tile_common.c @@ -11,32 +11,14 @@ #include "av1/common/tile_common.h" #include "av1/common/onyxc_int.h" +#include "av1/common/resize.h" #include "aom_dsp/aom_dsp_common.h" -#if CONFIG_DEPENDENT_HORZTILES -void av1_tile_set_tg_boundary(TileInfo *tile, const AV1_COMMON *const cm, - int row, int col) { - const int tg_start_row = cm->tile_group_start_row[row][col]; - const int tg_start_col = cm->tile_group_start_col[row][col]; - tile->tg_horz_boundary = ((row == tg_start_row && col >= 
tg_start_col) || - (row == tg_start_row + 1 && col < tg_start_col)); -#if CONFIG_MAX_TILE - if (cm->tile_row_independent[row]) { - tile->tg_horz_boundary = 1; // this tile row is independent - } -#endif -} -#endif void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { av1_tile_set_row(tile, cm, row); av1_tile_set_col(tile, cm, col); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(tile, cm, row, col); -#endif } -#if CONFIG_MAX_TILE - // Find smallest k>=0 such that (blk_size << k) >= target static int tile_log2(int blk_size, int target) { int k; @@ -46,25 +28,27 @@ static int tile_log2(int blk_size, int target) { } void av1_get_tile_limits(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2; - int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2; + int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; + int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; - cm->min_log2_tile_cols = tile_log2(MAX_TILE_WIDTH_SB, sb_cols); + int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2; + cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); + + cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols); cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); - cm->min_log2_tiles = tile_log2(MAX_TILE_AREA_SB, sb_cols * sb_rows); + cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows); cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols); - // TODO(dominic.symes@arm.com): - // Add in levelMinLog2Tiles as a lower limit when levels are defined } void 
av1_calculate_tile_cols(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2; - int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2; + int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; + int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; int i; if (cm->uniform_tile_spacing_flag) { @@ -80,24 +64,27 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) { cm->tile_col_start_sb[i] = sb_cols; cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0); cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows; + + cm->tile_width = size_sb << cm->seq_params.mib_size_log2; + cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); } else { int max_tile_area_sb = (sb_rows * sb_cols); - int max_tile_width_sb = 0; + int widest_tile_sb = 1; cm->log2_tile_cols = tile_log2(1, cm->tile_cols); for (i = 0; i < cm->tile_cols; i++) { int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; - max_tile_width_sb = AOMMAX(max_tile_width_sb, size_sb); + widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); } if (cm->min_log2_tiles) { max_tile_area_sb >>= (cm->min_log2_tiles + 1); } - cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / max_tile_width_sb, 1); + cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); } } void av1_calculate_tile_rows(AV1_COMMON *const cm) { - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; int start_sb, size_sb, i; if (cm->uniform_tile_spacing_flag) { @@ -110,106 +97,34 @@ void av1_calculate_tile_rows(AV1_COMMON *const 
cm) { } cm->tile_rows = i; cm->tile_row_start_sb[i] = sb_rows; + + cm->tile_height = size_sb << cm->seq_params.mib_size_log2; + cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); } else { cm->log2_tile_rows = tile_log2(1, cm->tile_rows); } - -#if CONFIG_DEPENDENT_HORZTILES - // Record which tile rows must be indpendent for parallelism - for (i = 0, start_sb = 0; i < cm->tile_rows; i++) { - cm->tile_row_independent[i] = 0; - if (cm->tile_row_start_sb[i + 1] - start_sb > cm->max_tile_height_sb) { - cm->tile_row_independent[i] = 1; - start_sb = cm->tile_row_start_sb[i]; - } - } -#endif } void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { assert(row < cm->tile_rows); - int mi_row_start = cm->tile_row_start_sb[row] << MAX_MIB_SIZE_LOG2; - int mi_row_end = cm->tile_row_start_sb[row + 1] << MAX_MIB_SIZE_LOG2; + int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2; + int mi_row_end = cm->tile_row_start_sb[row + 1] + << cm->seq_params.mib_size_log2; + tile->tile_row = row; tile->mi_row_start = mi_row_start; tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows); + assert(tile->mi_row_end > tile->mi_row_start); } void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(col < cm->tile_cols); - int mi_col_start = cm->tile_col_start_sb[col] << MAX_MIB_SIZE_LOG2; - int mi_col_end = cm->tile_col_start_sb[col + 1] << MAX_MIB_SIZE_LOG2; + int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2; + int mi_col_end = cm->tile_col_start_sb[col + 1] + << cm->seq_params.mib_size_log2; + tile->tile_col = col; tile->mi_col_start = mi_col_start; tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols); -} - -#else - -void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { - tile->mi_row_start = row * cm->tile_height; - tile->mi_row_end = AOMMIN(tile->mi_row_start + cm->tile_height, cm->mi_rows); -} - -void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { - tile->mi_col_start = 
col * cm->tile_width; - tile->mi_col_end = AOMMIN(tile->mi_col_start + cm->tile_width, cm->mi_cols); -} - -#if CONFIG_EXT_PARTITION -#define MIN_TILE_WIDTH_MAX_SB 2 -#define MAX_TILE_WIDTH_MAX_SB 32 -#else -#define MIN_TILE_WIDTH_MAX_SB 4 -#define MAX_TILE_WIDTH_MAX_SB 64 -#endif // CONFIG_EXT_PARTITION - -static int get_min_log2_tile_cols(int max_sb_cols) { - int min_log2 = 0; - while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols) ++min_log2; - return min_log2; -} - -static int get_max_log2_tile_cols(int max_sb_cols) { - int max_log2 = 1; - while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB) ++max_log2; - return max_log2 - 1; -} - -void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, - int *max_log2_tile_cols) { - const int max_sb_cols = - ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; - *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols); - *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols); - assert(*min_log2_tile_cols <= *max_log2_tile_cols); -} -#endif // CONFIG_MAX_TILE - -void av1_setup_frame_boundary_info(const AV1_COMMON *const cm) { - MODE_INFO *mi = cm->mi; - int col; - for (col = 0; col < cm->mi_cols; ++col) { - mi->mbmi.boundary_info |= FRAME_ABOVE_BOUNDARY | TILE_ABOVE_BOUNDARY; - mi += 1; - } - - mi = cm->mi; - int row; - for (row = 0; row < cm->mi_rows; ++row) { - mi->mbmi.boundary_info |= FRAME_LEFT_BOUNDARY | TILE_LEFT_BOUNDARY; - mi += cm->mi_stride; - } - - mi = cm->mi + (cm->mi_rows - 1) * cm->mi_stride; - for (col = 0; col < cm->mi_cols; ++col) { - mi->mbmi.boundary_info |= FRAME_BOTTOM_BOUNDARY | TILE_BOTTOM_BOUNDARY; - mi += 1; - } - - mi = cm->mi + cm->mi_cols - 1; - for (row = 0; row < cm->mi_rows; ++row) { - mi->mbmi.boundary_info |= FRAME_RIGHT_BOUNDARY | TILE_RIGHT_BOUNDARY; - mi += cm->mi_stride; - } + assert(tile->mi_col_end > tile->mi_col_start); } int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { @@ -236,56 +151,41 @@ int get_tile_size(int mi_frame_size, 
int log2_tile_num, int *ntiles) { return mi_tile_size; } -#if CONFIG_LOOPFILTERING_ACROSS_TILES -void av1_setup_across_tile_boundary_info(const AV1_COMMON *const cm, - const TileInfo *const tile_info) { - if (cm->tile_cols * cm->tile_rows > 1) { - const int mi_row = tile_info->mi_row_start; - const int mi_col = tile_info->mi_col_start; - MODE_INFO *const mi_start = cm->mi + mi_row * cm->mi_stride + mi_col; - assert(mi_start < cm->mip + cm->mi_alloc_size); - MODE_INFO *mi = 0; - const int row_diff = tile_info->mi_row_end - tile_info->mi_row_start; - const int col_diff = tile_info->mi_col_end - tile_info->mi_col_start; - int row, col; - -#if CONFIG_DEPENDENT_HORZTILES - if (!cm->dependent_horz_tiles || tile_info->tg_horz_boundary) -#endif // CONFIG_DEPENDENT_HORZTILES - { - mi = mi_start; - for (col = 0; col < col_diff; ++col) { - mi->mbmi.boundary_info |= TILE_ABOVE_BOUNDARY; - mi += 1; - } - } - - mi = mi_start; - for (row = 0; row < row_diff; ++row) { - mi->mbmi.boundary_info |= TILE_LEFT_BOUNDARY; - mi += cm->mi_stride; - } +AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, + int is_uv) { + AV1PixelRect r; + + // Calculate position in the Y plane + r.left = tile_info->mi_col_start * MI_SIZE; + r.right = tile_info->mi_col_end * MI_SIZE; + r.top = tile_info->mi_row_start * MI_SIZE; + r.bottom = tile_info->mi_row_end * MI_SIZE; + + // If upscaling is enabled, the tile limits need scaling to match the + // upscaled frame where the restoration units live. To do this, scale up the + // top-left and bottom-right of the tile. 
+ if (av1_superres_scaled(cm)) { + av1_calculate_unscaled_superres_size(&r.left, &r.top, + cm->superres_scale_denominator); + av1_calculate_unscaled_superres_size(&r.right, &r.bottom, + cm->superres_scale_denominator); + } - mi = mi_start + (row_diff - 1) * cm->mi_stride; + const int frame_w = cm->superres_upscaled_width; + const int frame_h = cm->superres_upscaled_height; - // explicit bounds checking - assert(mi + col_diff <= cm->mip + cm->mi_alloc_size); + // Make sure we don't fall off the bottom-right of the frame. + r.right = AOMMIN(r.right, frame_w); + r.bottom = AOMMIN(r.bottom, frame_h); - for (col = 0; col < col_diff; ++col) { - mi->mbmi.boundary_info |= TILE_BOTTOM_BOUNDARY; - mi += 1; - } + // Convert to coordinates in the appropriate plane + const int ss_x = is_uv && cm->subsampling_x; + const int ss_y = is_uv && cm->subsampling_y; - mi = mi_start + col_diff - 1; - for (row = 0; row < row_diff; ++row) { - mi->mbmi.boundary_info |= TILE_RIGHT_BOUNDARY; - mi += cm->mi_stride; - } - } -} + r.left = ROUND_POWER_OF_TWO(r.left, ss_x); + r.right = ROUND_POWER_OF_TWO(r.right, ss_x); + r.top = ROUND_POWER_OF_TWO(r.top, ss_y); + r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); -int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm) { - return (!cm->loop_filter_across_tiles_enabled && - (cm->tile_cols * cm->tile_rows > 1)); + return r; } -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h index be21e1482..be037fb17 100644 --- a/third_party/aom/av1/common/tile_common.h +++ b/third_party/aom/av1/common/tile_common.h @@ -16,7 +16,7 @@ extern "C" { #endif -#include "./aom_config.h" +#include "config/aom_config.h" struct AV1Common; @@ -26,6 +26,8 @@ typedef struct TileInfo { int mi_row_start, mi_row_end; int mi_col_start, mi_col_end; int tg_horz_boundary; + int tile_row; + int tile_col; } TileInfo; // initializes 'tile->mi_(row|col)_(start|end)' for (row, col) 
based on @@ -35,39 +37,30 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); -#if CONFIG_DEPENDENT_HORZTILES -void av1_tile_set_tg_boundary(TileInfo *tile, const struct AV1Common *const cm, - int row, int col); -#endif void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); -void av1_setup_frame_boundary_info(const struct AV1Common *const cm); - // Calculate the correct tile size (width or height) for (1 << log2_tile_num) // tiles horizontally or vertically in the frame. int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); -#if CONFIG_LOOPFILTERING_ACROSS_TILES -void av1_setup_across_tile_boundary_info(const struct AV1Common *const cm, - const TileInfo *const tile_info); -int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES +typedef struct { + int left, top, right, bottom; +} AV1PixelRect; -#if CONFIG_MAX_TILE +// Return the pixel extents of the given tile +AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, + const struct AV1Common *cm, int is_uv); // Define tile maximum width and area // There is no maximum height since height is limited by area and width limits // The minimum tile width or height is fixed at one superblock -#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels -#define MAX_TILE_WIDTH_SB (MAX_TILE_WIDTH >> MAX_SB_SIZE_LOG2) +#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels #define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels -#define MAX_TILE_AREA_SB (MAX_TILE_AREA >> (2 * MAX_SB_SIZE_LOG2)) void av1_get_tile_limits(struct AV1Common *const cm); void av1_calculate_tile_cols(struct AV1Common *const cm); void av1_calculate_tile_rows(struct AV1Common *const cm); -#endif #ifdef __cplusplus } // extern "C" diff --git 
a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c new file mode 100644 index 000000000..5ff538ae1 --- /dev/null +++ b/third_party/aom/av1/common/timing.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/timing.h" + +/* Tables for AV1 max bitrates for different levels of main and high tier. + * The tables are in Kbps instead of Mbps in the specification. + * Note that depending on the profile, a multiplier is needed. + */ + +/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t main_kbps[1 << LEVEL_BITS] = { + 1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0, + 0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26) +}; + +/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. 
*/ +static int32_t high_kbps[1 << LEVEL_BITS] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 30000, 50000, 0, 0, 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, (1 << 26) +}; + +/* BitrateProfileFactor */ +static int bitrate_profile_factor[1 << PROFILE_BITS] = { + 1, 2, 3, 0, 0, 0, 0, 0 +}; + +int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { + int64_t bitrate; + + if (seq_tier) { + bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } else { + bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } + + return bitrate * 1000; +} + +void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { + decoder_model->encoder_decoder_buffer_delay_length = 16; + decoder_model->buffer_removal_delay_length = 10; + decoder_model->frame_presentation_delay_length = 10; +} + +void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 1; + op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->low_delay_mode_flag = 0; + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} + +void set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 0; + op_params->decoder_buffer_delay = + 70000; // Resource availability mode default + op_params->encoder_buffer_delay = + 20000; // Resource availability mode default + op_params->low_delay_mode_flag = 0; // Resource availability mode default + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h new file mode 100644 index 000000000..d31f4b7fc --- /dev/null +++ b/third_party/aom/av1/common/timing.h @@ -0,0 +1,59 
@@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TIMING_H_ +#define AOM_TIMING_H_ + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" + +#define MAX_NUM_OP_POINTS 32 + +typedef struct aom_timing { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + int equal_picture_interval; + uint32_t num_ticks_per_picture; +} aom_timing_info_t; + +typedef struct aom_dec_model_info { + uint32_t num_units_in_decoding_tick; + int encoder_decoder_buffer_delay_length; + int buffer_removal_delay_length; + int frame_presentation_delay_length; +} aom_dec_model_info_t; + +typedef struct aom_dec_model_op_parameters { + int decoder_model_param_present_flag; + int64_t bitrate; + int64_t buffer_size; + int decoder_buffer_delay; + int encoder_buffer_delay; + int low_delay_mode_flag; + int display_model_param_present_flag; + int initial_display_delay; +} aom_dec_model_op_parameters_t; + +typedef struct aom_op_timing_info_t { + int64_t buffer_removal_delay; +} aom_op_timing_info_t; + +void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); + +void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); + +void set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params); + +int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); + +#endif // AOM_TIMING_H_ diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h index c4f0f94c0..9a6b454ac 100644 --- 
a/third_party/aom/av1/common/token_cdfs.h +++ b/third_party/aom/av1/common/token_cdfs.h @@ -9,5245 +9,3542 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/entropy.h" -/* clang-format off */ -static const coeff_cdf_model -av1_default_coef_head_cdfs_q0[TX_SIZES][PLANE_TYPES] = { - { // TX 4X4 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(21029), AOM_ICDF(21848), AOM_ICDF(26326), AOM_ICDF(29423), - AOM_ICDF(30610), AOM_ICDF(32768), }, - {AOM_ICDF(10066), AOM_ICDF(12716), AOM_ICDF(18523), AOM_ICDF(23277), - AOM_ICDF(24780), AOM_ICDF(32768), }, - {AOM_ICDF(1655), AOM_ICDF(4793), AOM_ICDF(6429), AOM_ICDF(11430), - AOM_ICDF(12206), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(10364), AOM_ICDF(14773), AOM_ICDF(25084), AOM_ICDF(25599), - AOM_ICDF(32768), }, - {AOM_ICDF(10060), AOM_ICDF(14834), AOM_ICDF(24695), AOM_ICDF(25188), - AOM_ICDF(32768), }, - {AOM_ICDF(8279), AOM_ICDF(11106), AOM_ICDF(21159), AOM_ICDF(21671), - AOM_ICDF(32768), }, - {AOM_ICDF(5914), AOM_ICDF(6961), AOM_ICDF(15824), AOM_ICDF(16314), - AOM_ICDF(32768), }, - {AOM_ICDF(3542), AOM_ICDF(3935), AOM_ICDF(10073), AOM_ICDF(10456), - AOM_ICDF(32768), }, - {AOM_ICDF(1492), AOM_ICDF(1808), AOM_ICDF(4428), AOM_ICDF(4747), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(15783), AOM_ICDF(19657), AOM_ICDF(28753), AOM_ICDF(29248), - AOM_ICDF(32768), }, - {AOM_ICDF(12047), AOM_ICDF(15766), AOM_ICDF(26989), AOM_ICDF(27464), - AOM_ICDF(32768), }, - {AOM_ICDF(8412), AOM_ICDF(9971), AOM_ICDF(21538), AOM_ICDF(22026), - AOM_ICDF(32768), }, - {AOM_ICDF(5438), AOM_ICDF(6039), AOM_ICDF(15108), AOM_ICDF(15570), - AOM_ICDF(32768), }, - {AOM_ICDF(3247), AOM_ICDF(3593), AOM_ICDF(9495), AOM_ICDF(9859), - AOM_ICDF(32768), }, - {AOM_ICDF(1428), AOM_ICDF(1742), AOM_ICDF(4322), AOM_ICDF(4638), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(18469), AOM_ICDF(21675), AOM_ICDF(30172), AOM_ICDF(30563), - 
AOM_ICDF(32768), }, - {AOM_ICDF(12582), AOM_ICDF(16559), AOM_ICDF(27995), AOM_ICDF(28423), - AOM_ICDF(32768), }, - {AOM_ICDF(8183), AOM_ICDF(9915), AOM_ICDF(21836), AOM_ICDF(22336), - AOM_ICDF(32768), }, - {AOM_ICDF(5255), AOM_ICDF(5845), AOM_ICDF(15137), AOM_ICDF(15593), - AOM_ICDF(32768), }, - {AOM_ICDF(3140), AOM_ICDF(3478), AOM_ICDF(9376), AOM_ICDF(9739), - AOM_ICDF(32768), }, - {AOM_ICDF(1549), AOM_ICDF(1864), AOM_ICDF(4660), AOM_ICDF(4984), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(18319), AOM_ICDF(23757), AOM_ICDF(30989), AOM_ICDF(31399), - AOM_ICDF(32768), }, - {AOM_ICDF(12864), AOM_ICDF(18051), AOM_ICDF(28729), AOM_ICDF(29218), - AOM_ICDF(32768), }, - {AOM_ICDF(8090), AOM_ICDF(10047), AOM_ICDF(22011), AOM_ICDF(22680), - AOM_ICDF(32768), }, - {AOM_ICDF(5061), AOM_ICDF(5688), AOM_ICDF(14783), AOM_ICDF(15379), - AOM_ICDF(32768), }, - {AOM_ICDF(3425), AOM_ICDF(3784), AOM_ICDF(9565), AOM_ICDF(9998), - AOM_ICDF(32768), }, - {AOM_ICDF(1564), AOM_ICDF(1884), AOM_ICDF(4703), AOM_ICDF(5054), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(8329), AOM_ICDF(23625), AOM_ICDF(30376), AOM_ICDF(31182), - AOM_ICDF(32768), }, - {AOM_ICDF(7265), AOM_ICDF(19981), AOM_ICDF(27965), AOM_ICDF(29333), - AOM_ICDF(32768), }, - {AOM_ICDF(5797), AOM_ICDF(12014), AOM_ICDF(21143), AOM_ICDF(23728), - AOM_ICDF(32768), }, - {AOM_ICDF(4525), AOM_ICDF(7029), AOM_ICDF(14661), AOM_ICDF(17493), - AOM_ICDF(32768), }, - {AOM_ICDF(3200), AOM_ICDF(4082), AOM_ICDF(9679), AOM_ICDF(11816), - AOM_ICDF(32768), }, - {AOM_ICDF(1930), AOM_ICDF(2344), AOM_ICDF(5504), AOM_ICDF(6684), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(12366), AOM_ICDF(20513), AOM_ICDF(22133), AOM_ICDF(29810), - AOM_ICDF(30422), AOM_ICDF(32768), }, - {AOM_ICDF(7182), AOM_ICDF(16662), AOM_ICDF(18633), AOM_ICDF(27443), - AOM_ICDF(28120), AOM_ICDF(32768), }, - {AOM_ICDF(1791), AOM_ICDF(10613), AOM_ICDF(11616), AOM_ICDF(21520), - AOM_ICDF(22191), AOM_ICDF(32768), }, - }, - { // Band 1 - 
{AOM_ICDF(18943), AOM_ICDF(19755), AOM_ICDF(30340), AOM_ICDF(30674), - AOM_ICDF(32768), }, - {AOM_ICDF(15702), AOM_ICDF(17160), AOM_ICDF(28778), AOM_ICDF(29115), - AOM_ICDF(32768), }, - {AOM_ICDF(9337), AOM_ICDF(10054), AOM_ICDF(22492), AOM_ICDF(22845), - AOM_ICDF(32768), }, - {AOM_ICDF(6550), AOM_ICDF(7019), AOM_ICDF(17068), AOM_ICDF(17420), - AOM_ICDF(32768), }, - {AOM_ICDF(4169), AOM_ICDF(4566), AOM_ICDF(11849), AOM_ICDF(12185), - AOM_ICDF(32768), }, - {AOM_ICDF(2495), AOM_ICDF(2839), AOM_ICDF(6895), AOM_ICDF(7221), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20241), AOM_ICDF(21593), AOM_ICDF(31083), AOM_ICDF(31425), - AOM_ICDF(32768), }, - {AOM_ICDF(15276), AOM_ICDF(16626), AOM_ICDF(28787), AOM_ICDF(29136), - AOM_ICDF(32768), }, - {AOM_ICDF(7656), AOM_ICDF(8102), AOM_ICDF(20347), AOM_ICDF(20700), - AOM_ICDF(32768), }, - {AOM_ICDF(4527), AOM_ICDF(4880), AOM_ICDF(13482), AOM_ICDF(13819), - AOM_ICDF(32768), }, - {AOM_ICDF(2538), AOM_ICDF(2860), AOM_ICDF(7975), AOM_ICDF(8298), - AOM_ICDF(32768), }, - {AOM_ICDF(1394), AOM_ICDF(1707), AOM_ICDF(3770), AOM_ICDF(4086), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19968), AOM_ICDF(21872), AOM_ICDF(30812), AOM_ICDF(31172), - AOM_ICDF(32768), }, - {AOM_ICDF(15081), AOM_ICDF(16805), AOM_ICDF(28957), AOM_ICDF(29326), - AOM_ICDF(32768), }, - {AOM_ICDF(8196), AOM_ICDF(8748), AOM_ICDF(21434), AOM_ICDF(21825), - AOM_ICDF(32768), }, - {AOM_ICDF(5297), AOM_ICDF(5675), AOM_ICDF(15007), AOM_ICDF(15385), - AOM_ICDF(32768), }, - {AOM_ICDF(3102), AOM_ICDF(3429), AOM_ICDF(9255), AOM_ICDF(9607), - AOM_ICDF(32768), }, - {AOM_ICDF(1502), AOM_ICDF(1815), AOM_ICDF(4662), AOM_ICDF(4983), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(19362), AOM_ICDF(22537), AOM_ICDF(31260), AOM_ICDF(31624), - AOM_ICDF(32768), }, - {AOM_ICDF(14450), AOM_ICDF(17789), AOM_ICDF(29362), AOM_ICDF(29788), - AOM_ICDF(32768), }, - {AOM_ICDF(7957), AOM_ICDF(8982), AOM_ICDF(21542), AOM_ICDF(22120), - AOM_ICDF(32768), }, - {AOM_ICDF(4819), 
AOM_ICDF(5280), AOM_ICDF(14199), AOM_ICDF(14724), - AOM_ICDF(32768), }, - {AOM_ICDF(2831), AOM_ICDF(3180), AOM_ICDF(8511), AOM_ICDF(8950), - AOM_ICDF(32768), }, - {AOM_ICDF(1385), AOM_ICDF(1700), AOM_ICDF(4300), AOM_ICDF(4633), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(14134), AOM_ICDF(22252), AOM_ICDF(31119), AOM_ICDF(31577), - AOM_ICDF(32768), }, - {AOM_ICDF(11383), AOM_ICDF(19847), AOM_ICDF(29451), AOM_ICDF(30205), - AOM_ICDF(32768), }, - {AOM_ICDF(7338), AOM_ICDF(11314), AOM_ICDF(22338), AOM_ICDF(24028), - AOM_ICDF(32768), }, - {AOM_ICDF(5071), AOM_ICDF(6634), AOM_ICDF(15379), AOM_ICDF(17178), - AOM_ICDF(32768), }, - {AOM_ICDF(2969), AOM_ICDF(3703), AOM_ICDF(9896), AOM_ICDF(11246), - AOM_ICDF(32768), }, - {AOM_ICDF(1809), AOM_ICDF(2173), AOM_ICDF(5573), AOM_ICDF(6229), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(25213), AOM_ICDF(26007), AOM_ICDF(29751), AOM_ICDF(31199), - AOM_ICDF(31688), AOM_ICDF(32768), }, - {AOM_ICDF(13781), AOM_ICDF(16489), AOM_ICDF(23298), AOM_ICDF(27505), - AOM_ICDF(28405), AOM_ICDF(32768), }, - {AOM_ICDF(4621), AOM_ICDF(9194), AOM_ICDF(12764), AOM_ICDF(19842), - AOM_ICDF(20708), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(12686), AOM_ICDF(19031), AOM_ICDF(28910), AOM_ICDF(29358), - AOM_ICDF(32768), }, - {AOM_ICDF(12732), AOM_ICDF(18729), AOM_ICDF(28346), AOM_ICDF(28824), - AOM_ICDF(32768), }, - {AOM_ICDF(9753), AOM_ICDF(12954), AOM_ICDF(24344), AOM_ICDF(24920), - AOM_ICDF(32768), }, - {AOM_ICDF(6853), AOM_ICDF(7851), AOM_ICDF(18601), AOM_ICDF(19110), - AOM_ICDF(32768), }, - {AOM_ICDF(3697), AOM_ICDF(4071), AOM_ICDF(11373), AOM_ICDF(11743), - AOM_ICDF(32768), }, - {AOM_ICDF(1738), AOM_ICDF(2057), AOM_ICDF(5307), AOM_ICDF(5627), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18299), AOM_ICDF(24455), AOM_ICDF(30930), AOM_ICDF(31398), - AOM_ICDF(32768), }, - {AOM_ICDF(14316), AOM_ICDF(19083), AOM_ICDF(29266), AOM_ICDF(29766), - AOM_ICDF(32768), }, - {AOM_ICDF(9584), 
AOM_ICDF(11344), AOM_ICDF(23898), AOM_ICDF(24407), - AOM_ICDF(32768), }, - {AOM_ICDF(6076), AOM_ICDF(6645), AOM_ICDF(16805), AOM_ICDF(17237), - AOM_ICDF(32768), }, - {AOM_ICDF(3535), AOM_ICDF(3885), AOM_ICDF(10393), AOM_ICDF(10746), - AOM_ICDF(32768), }, - {AOM_ICDF(1909), AOM_ICDF(2222), AOM_ICDF(5010), AOM_ICDF(5328), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(21106), AOM_ICDF(25258), AOM_ICDF(31172), AOM_ICDF(31576), - AOM_ICDF(32768), }, - {AOM_ICDF(14944), AOM_ICDF(20229), AOM_ICDF(29903), AOM_ICDF(30361), - AOM_ICDF(32768), }, - {AOM_ICDF(10454), AOM_ICDF(13063), AOM_ICDF(25548), AOM_ICDF(26138), - AOM_ICDF(32768), }, - {AOM_ICDF(7667), AOM_ICDF(8529), AOM_ICDF(20025), AOM_ICDF(20588), - AOM_ICDF(32768), }, - {AOM_ICDF(4813), AOM_ICDF(5176), AOM_ICDF(13672), AOM_ICDF(14085), - AOM_ICDF(32768), }, - {AOM_ICDF(2450), AOM_ICDF(2763), AOM_ICDF(7515), AOM_ICDF(7873), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(18297), AOM_ICDF(25980), AOM_ICDF(31547), AOM_ICDF(31946), - AOM_ICDF(32768), }, - {AOM_ICDF(13370), AOM_ICDF(21048), AOM_ICDF(30193), AOM_ICDF(30703), - AOM_ICDF(32768), }, - {AOM_ICDF(9326), AOM_ICDF(13020), AOM_ICDF(25206), AOM_ICDF(26074), - AOM_ICDF(32768), }, - {AOM_ICDF(6117), AOM_ICDF(7480), AOM_ICDF(18243), AOM_ICDF(19130), - AOM_ICDF(32768), }, - {AOM_ICDF(6408), AOM_ICDF(6819), AOM_ICDF(13596), AOM_ICDF(14098), - AOM_ICDF(32768), }, - {AOM_ICDF(2179), AOM_ICDF(2485), AOM_ICDF(7393), AOM_ICDF(7768), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(11255), AOM_ICDF(26931), AOM_ICDF(31505), AOM_ICDF(32033), - AOM_ICDF(32768), }, - {AOM_ICDF(9120), AOM_ICDF(23148), AOM_ICDF(30070), AOM_ICDF(31091), - AOM_ICDF(32768), }, - {AOM_ICDF(7927), AOM_ICDF(15909), AOM_ICDF(25162), AOM_ICDF(27329), - AOM_ICDF(32768), }, - {AOM_ICDF(6416), AOM_ICDF(10706), AOM_ICDF(19959), AOM_ICDF(22732), - AOM_ICDF(32768), }, - {AOM_ICDF(4232), AOM_ICDF(5545), AOM_ICDF(13107), AOM_ICDF(15118), - AOM_ICDF(32768), }, - {AOM_ICDF(2626), AOM_ICDF(2941), 
AOM_ICDF(8665), AOM_ICDF(9872), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(27618), AOM_ICDF(28976), AOM_ICDF(30940), AOM_ICDF(31993), - AOM_ICDF(32336), AOM_ICDF(32768), }, - {AOM_ICDF(16119), AOM_ICDF(21691), AOM_ICDF(26334), AOM_ICDF(30705), - AOM_ICDF(31244), AOM_ICDF(32768), }, - {AOM_ICDF(5114), AOM_ICDF(14755), AOM_ICDF(17865), AOM_ICDF(27048), - AOM_ICDF(27895), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(19468), AOM_ICDF(23767), AOM_ICDF(31339), AOM_ICDF(31674), - AOM_ICDF(32768), }, - {AOM_ICDF(16878), AOM_ICDF(20966), AOM_ICDF(30654), AOM_ICDF(31007), - AOM_ICDF(32768), }, - {AOM_ICDF(12213), AOM_ICDF(14415), AOM_ICDF(26909), AOM_ICDF(27338), - AOM_ICDF(32768), }, - {AOM_ICDF(9404), AOM_ICDF(10670), AOM_ICDF(22239), AOM_ICDF(22719), - AOM_ICDF(32768), }, - {AOM_ICDF(6856), AOM_ICDF(7784), AOM_ICDF(17127), AOM_ICDF(17609), - AOM_ICDF(32768), }, - {AOM_ICDF(5034), AOM_ICDF(5529), AOM_ICDF(13229), AOM_ICDF(13634), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21214), AOM_ICDF(25570), AOM_ICDF(31656), AOM_ICDF(31994), - AOM_ICDF(32768), }, - {AOM_ICDF(17012), AOM_ICDF(20535), AOM_ICDF(30403), AOM_ICDF(30787), - AOM_ICDF(32768), }, - {AOM_ICDF(10855), AOM_ICDF(12147), AOM_ICDF(25451), AOM_ICDF(25874), - AOM_ICDF(32768), }, - {AOM_ICDF(7055), AOM_ICDF(7837), AOM_ICDF(19116), AOM_ICDF(19553), - AOM_ICDF(32768), }, - {AOM_ICDF(4141), AOM_ICDF(4531), AOM_ICDF(11911), AOM_ICDF(12296), - AOM_ICDF(32768), }, - {AOM_ICDF(1706), AOM_ICDF(2041), AOM_ICDF(5622), AOM_ICDF(5957), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22092), AOM_ICDF(26330), AOM_ICDF(31642), AOM_ICDF(32015), - AOM_ICDF(32768), }, - {AOM_ICDF(16433), AOM_ICDF(20889), AOM_ICDF(30263), AOM_ICDF(30704), - AOM_ICDF(32768), }, - {AOM_ICDF(11015), AOM_ICDF(13045), AOM_ICDF(26253), AOM_ICDF(26743), - AOM_ICDF(32768), }, - {AOM_ICDF(9188), AOM_ICDF(9924), AOM_ICDF(21991), AOM_ICDF(22551), - AOM_ICDF(32768), }, - {AOM_ICDF(5259), AOM_ICDF(5634), 
AOM_ICDF(14131), AOM_ICDF(14627), - AOM_ICDF(32768), }, - {AOM_ICDF(1916), AOM_ICDF(2218), AOM_ICDF(6453), AOM_ICDF(6780), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(20331), AOM_ICDF(26854), AOM_ICDF(31896), AOM_ICDF(32255), - AOM_ICDF(32768), }, - {AOM_ICDF(15738), AOM_ICDF(22741), AOM_ICDF(31108), AOM_ICDF(31557), - AOM_ICDF(32768), }, - {AOM_ICDF(11693), AOM_ICDF(15508), AOM_ICDF(27435), AOM_ICDF(28103), - AOM_ICDF(32768), }, - {AOM_ICDF(8066), AOM_ICDF(9281), AOM_ICDF(20855), AOM_ICDF(21631), - AOM_ICDF(32768), }, - {AOM_ICDF(4427), AOM_ICDF(4860), AOM_ICDF(12951), AOM_ICDF(13543), - AOM_ICDF(32768), }, - {AOM_ICDF(1928), AOM_ICDF(2372), AOM_ICDF(5634), AOM_ICDF(6672), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(15605), AOM_ICDF(27749), AOM_ICDF(31907), AOM_ICDF(32303), - AOM_ICDF(32768), }, - {AOM_ICDF(11920), AOM_ICDF(24653), AOM_ICDF(31013), AOM_ICDF(31675), - AOM_ICDF(32768), }, - {AOM_ICDF(8007), AOM_ICDF(14898), AOM_ICDF(25377), AOM_ICDF(27353), - AOM_ICDF(32768), }, - {AOM_ICDF(6010), AOM_ICDF(8920), AOM_ICDF(18956), AOM_ICDF(21554), - AOM_ICDF(32768), }, - {AOM_ICDF(4573), AOM_ICDF(5611), AOM_ICDF(13522), AOM_ICDF(15795), - AOM_ICDF(32768), }, - {AOM_ICDF(4274), AOM_ICDF(6411), AOM_ICDF(11398), AOM_ICDF(14247), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 8X8 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(22195), AOM_ICDF(22830), AOM_ICDF(25684), AOM_ICDF(28569), - AOM_ICDF(30557), AOM_ICDF(32768), }, - {AOM_ICDF(9973), AOM_ICDF(12001), AOM_ICDF(15354), AOM_ICDF(20353), - AOM_ICDF(23020), AOM_ICDF(32768), }, - {AOM_ICDF(1514), AOM_ICDF(3998), AOM_ICDF(4873), AOM_ICDF(9182), - AOM_ICDF(9967), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(11609), AOM_ICDF(14013), AOM_ICDF(24609), AOM_ICDF(25092), - AOM_ICDF(32768), }, - {AOM_ICDF(10302), AOM_ICDF(15208), AOM_ICDF(24145), AOM_ICDF(24658), - AOM_ICDF(32768), }, - {AOM_ICDF(7991), AOM_ICDF(10895), AOM_ICDF(20438), AOM_ICDF(21146), - AOM_ICDF(32768), }, - {AOM_ICDF(5831), 
AOM_ICDF(7006), AOM_ICDF(15716), AOM_ICDF(16394), - AOM_ICDF(32768), }, - {AOM_ICDF(3536), AOM_ICDF(3969), AOM_ICDF(10117), AOM_ICDF(10569), - AOM_ICDF(32768), }, - {AOM_ICDF(1369), AOM_ICDF(1686), AOM_ICDF(4062), AOM_ICDF(4385), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(17334), AOM_ICDF(19416), AOM_ICDF(28420), AOM_ICDF(28798), - AOM_ICDF(32768), }, - {AOM_ICDF(13512), AOM_ICDF(15917), AOM_ICDF(26736), AOM_ICDF(27122), - AOM_ICDF(32768), }, - {AOM_ICDF(9322), AOM_ICDF(10491), AOM_ICDF(21892), AOM_ICDF(22281), - AOM_ICDF(32768), }, - {AOM_ICDF(6187), AOM_ICDF(6682), AOM_ICDF(15992), AOM_ICDF(16351), - AOM_ICDF(32768), }, - {AOM_ICDF(3733), AOM_ICDF(4073), AOM_ICDF(10406), AOM_ICDF(10735), - AOM_ICDF(32768), }, - {AOM_ICDF(1606), AOM_ICDF(1920), AOM_ICDF(4715), AOM_ICDF(5028), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(20589), AOM_ICDF(22106), AOM_ICDF(30065), AOM_ICDF(30422), - AOM_ICDF(32768), }, - {AOM_ICDF(14731), AOM_ICDF(16342), AOM_ICDF(27701), AOM_ICDF(28059), - AOM_ICDF(32768), }, - {AOM_ICDF(8554), AOM_ICDF(9080), AOM_ICDF(20831), AOM_ICDF(21182), - AOM_ICDF(32768), }, - {AOM_ICDF(5011), AOM_ICDF(5354), AOM_ICDF(13968), AOM_ICDF(14296), - AOM_ICDF(32768), }, - {AOM_ICDF(2867), AOM_ICDF(3184), AOM_ICDF(8524), AOM_ICDF(8840), - AOM_ICDF(32768), }, - {AOM_ICDF(1174), AOM_ICDF(1486), AOM_ICDF(3643), AOM_ICDF(3955), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(23439), AOM_ICDF(24729), AOM_ICDF(31199), AOM_ICDF(31537), - AOM_ICDF(32768), }, - {AOM_ICDF(15716), AOM_ICDF(17015), AOM_ICDF(28650), AOM_ICDF(28989), - AOM_ICDF(32768), }, - {AOM_ICDF(8381), AOM_ICDF(8812), AOM_ICDF(21032), AOM_ICDF(21369), - AOM_ICDF(32768), }, - {AOM_ICDF(4868), AOM_ICDF(5197), AOM_ICDF(13740), AOM_ICDF(14065), - AOM_ICDF(32768), }, - {AOM_ICDF(2744), AOM_ICDF(3058), AOM_ICDF(8333), AOM_ICDF(8648), - AOM_ICDF(32768), }, - {AOM_ICDF(1185), AOM_ICDF(1497), AOM_ICDF(3656), AOM_ICDF(3968), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(23980), 
AOM_ICDF(26041), AOM_ICDF(31566), AOM_ICDF(31904), - AOM_ICDF(32768), }, - {AOM_ICDF(16060), AOM_ICDF(18243), AOM_ICDF(29508), AOM_ICDF(29868), - AOM_ICDF(32768), }, - {AOM_ICDF(8844), AOM_ICDF(9414), AOM_ICDF(22119), AOM_ICDF(22496), - AOM_ICDF(32768), }, - {AOM_ICDF(5265), AOM_ICDF(5612), AOM_ICDF(14599), AOM_ICDF(14944), - AOM_ICDF(32768), }, - {AOM_ICDF(3058), AOM_ICDF(3375), AOM_ICDF(9028), AOM_ICDF(9351), - AOM_ICDF(32768), }, - {AOM_ICDF(1414), AOM_ICDF(1726), AOM_ICDF(4249), AOM_ICDF(4563), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(9994), AOM_ICDF(19506), AOM_ICDF(21744), AOM_ICDF(29408), - AOM_ICDF(30809), AOM_ICDF(32768), }, - {AOM_ICDF(3771), AOM_ICDF(14862), AOM_ICDF(16756), AOM_ICDF(26385), - AOM_ICDF(27927), AOM_ICDF(32768), }, - {AOM_ICDF(964), AOM_ICDF(10643), AOM_ICDF(11416), AOM_ICDF(21060), - AOM_ICDF(22316), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23263), AOM_ICDF(23761), AOM_ICDF(31250), AOM_ICDF(31580), - AOM_ICDF(32768), }, - {AOM_ICDF(19631), AOM_ICDF(21067), AOM_ICDF(30262), AOM_ICDF(30596), - AOM_ICDF(32768), }, - {AOM_ICDF(12419), AOM_ICDF(13646), AOM_ICDF(25959), AOM_ICDF(26329), - AOM_ICDF(32768), }, - {AOM_ICDF(9274), AOM_ICDF(10229), AOM_ICDF(21588), AOM_ICDF(21981), - AOM_ICDF(32768), }, - {AOM_ICDF(6778), AOM_ICDF(7496), AOM_ICDF(17069), AOM_ICDF(17469), - AOM_ICDF(32768), }, - {AOM_ICDF(4655), AOM_ICDF(5089), AOM_ICDF(12206), AOM_ICDF(12574), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24055), AOM_ICDF(24771), AOM_ICDF(31529), AOM_ICDF(31851), - AOM_ICDF(32768), }, - {AOM_ICDF(18300), AOM_ICDF(19177), AOM_ICDF(29983), AOM_ICDF(30310), - AOM_ICDF(32768), }, - {AOM_ICDF(9684), AOM_ICDF(10239), AOM_ICDF(23130), AOM_ICDF(23465), - AOM_ICDF(32768), }, - {AOM_ICDF(6112), AOM_ICDF(6511), AOM_ICDF(16539), AOM_ICDF(16874), - AOM_ICDF(32768), }, - {AOM_ICDF(3508), AOM_ICDF(3841), AOM_ICDF(10475), AOM_ICDF(10798), - AOM_ICDF(32768), }, - {AOM_ICDF(1647), AOM_ICDF(1963), AOM_ICDF(5379), 
AOM_ICDF(5693), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24875), AOM_ICDF(25551), AOM_ICDF(31757), AOM_ICDF(32078), - AOM_ICDF(32768), }, - {AOM_ICDF(18585), AOM_ICDF(19328), AOM_ICDF(30217), AOM_ICDF(30542), - AOM_ICDF(32768), }, - {AOM_ICDF(8948), AOM_ICDF(9350), AOM_ICDF(22251), AOM_ICDF(22577), - AOM_ICDF(32768), }, - {AOM_ICDF(5148), AOM_ICDF(5481), AOM_ICDF(14806), AOM_ICDF(15127), - AOM_ICDF(32768), }, - {AOM_ICDF(2852), AOM_ICDF(3169), AOM_ICDF(8930), AOM_ICDF(9249), - AOM_ICDF(32768), }, - {AOM_ICDF(1298), AOM_ICDF(1609), AOM_ICDF(4289), AOM_ICDF(4600), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25149), AOM_ICDF(25840), AOM_ICDF(31833), AOM_ICDF(32153), - AOM_ICDF(32768), }, - {AOM_ICDF(19051), AOM_ICDF(19689), AOM_ICDF(30461), AOM_ICDF(30785), - AOM_ICDF(32768), }, - {AOM_ICDF(8956), AOM_ICDF(9308), AOM_ICDF(22406), AOM_ICDF(22729), - AOM_ICDF(32768), }, - {AOM_ICDF(5001), AOM_ICDF(5325), AOM_ICDF(14586), AOM_ICDF(14906), - AOM_ICDF(32768), }, - {AOM_ICDF(2875), AOM_ICDF(3189), AOM_ICDF(8639), AOM_ICDF(8955), - AOM_ICDF(32768), }, - {AOM_ICDF(1311), AOM_ICDF(1623), AOM_ICDF(4261), AOM_ICDF(4572), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(25212), AOM_ICDF(26544), AOM_ICDF(31879), AOM_ICDF(32209), - AOM_ICDF(32768), }, - {AOM_ICDF(18967), AOM_ICDF(20523), AOM_ICDF(30778), AOM_ICDF(31126), - AOM_ICDF(32768), }, - {AOM_ICDF(9672), AOM_ICDF(10140), AOM_ICDF(23740), AOM_ICDF(24117), - AOM_ICDF(32768), }, - {AOM_ICDF(5732), AOM_ICDF(6079), AOM_ICDF(16067), AOM_ICDF(16423), - AOM_ICDF(32768), }, - {AOM_ICDF(3370), AOM_ICDF(3687), AOM_ICDF(10101), AOM_ICDF(10429), - AOM_ICDF(32768), }, - {AOM_ICDF(1696), AOM_ICDF(2007), AOM_ICDF(5320), AOM_ICDF(5648), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(26296), AOM_ICDF(26903), AOM_ICDF(30027), AOM_ICDF(31098), - AOM_ICDF(31851), AOM_ICDF(32768), }, - {AOM_ICDF(13982), AOM_ICDF(16223), AOM_ICDF(22840), AOM_ICDF(26540), - AOM_ICDF(28301), 
AOM_ICDF(32768), }, - {AOM_ICDF(5643), AOM_ICDF(9834), AOM_ICDF(13670), AOM_ICDF(20220), - AOM_ICDF(21734), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(14291), AOM_ICDF(20303), AOM_ICDF(29319), AOM_ICDF(29879), - AOM_ICDF(32768), }, - {AOM_ICDF(13407), AOM_ICDF(20905), AOM_ICDF(29052), AOM_ICDF(29644), - AOM_ICDF(32768), }, - {AOM_ICDF(10860), AOM_ICDF(15525), AOM_ICDF(25872), AOM_ICDF(26766), - AOM_ICDF(32768), }, - {AOM_ICDF(7801), AOM_ICDF(9554), AOM_ICDF(20530), AOM_ICDF(21309), - AOM_ICDF(32768), }, - {AOM_ICDF(4523), AOM_ICDF(4994), AOM_ICDF(12583), AOM_ICDF(13069), - AOM_ICDF(32768), }, - {AOM_ICDF(1784), AOM_ICDF(2110), AOM_ICDF(5198), AOM_ICDF(5511), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20153), AOM_ICDF(24114), AOM_ICDF(30802), AOM_ICDF(31195), - AOM_ICDF(32768), }, - {AOM_ICDF(16079), AOM_ICDF(19936), AOM_ICDF(29580), AOM_ICDF(29992), - AOM_ICDF(32768), }, - {AOM_ICDF(10977), AOM_ICDF(12993), AOM_ICDF(25245), AOM_ICDF(25687), - AOM_ICDF(32768), }, - {AOM_ICDF(7386), AOM_ICDF(8212), AOM_ICDF(19223), AOM_ICDF(19683), - AOM_ICDF(32768), }, - {AOM_ICDF(4797), AOM_ICDF(5164), AOM_ICDF(12928), AOM_ICDF(13288), - AOM_ICDF(32768), }, - {AOM_ICDF(2188), AOM_ICDF(2498), AOM_ICDF(6396), AOM_ICDF(6706), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24221), AOM_ICDF(26746), AOM_ICDF(31634), AOM_ICDF(31980), - AOM_ICDF(32768), }, - {AOM_ICDF(17492), AOM_ICDF(20348), AOM_ICDF(30067), AOM_ICDF(30432), - AOM_ICDF(32768), }, - {AOM_ICDF(10522), AOM_ICDF(11531), AOM_ICDF(24642), AOM_ICDF(25031), - AOM_ICDF(32768), }, - {AOM_ICDF(6567), AOM_ICDF(7006), AOM_ICDF(17688), AOM_ICDF(18036), - AOM_ICDF(32768), }, - {AOM_ICDF(4123), AOM_ICDF(4447), AOM_ICDF(11775), AOM_ICDF(12095), - AOM_ICDF(32768), }, - {AOM_ICDF(1770), AOM_ICDF(2065), AOM_ICDF(6491), AOM_ICDF(6786), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25862), AOM_ICDF(27744), AOM_ICDF(31611), AOM_ICDF(31969), - AOM_ICDF(32768), }, - {AOM_ICDF(17752), AOM_ICDF(20079), 
AOM_ICDF(30169), AOM_ICDF(30530), - AOM_ICDF(32768), }, - {AOM_ICDF(10588), AOM_ICDF(11308), AOM_ICDF(24834), AOM_ICDF(25180), - AOM_ICDF(32768), }, - {AOM_ICDF(7459), AOM_ICDF(7820), AOM_ICDF(17949), AOM_ICDF(18281), - AOM_ICDF(32768), }, - {AOM_ICDF(3984), AOM_ICDF(4294), AOM_ICDF(11863), AOM_ICDF(12173), - AOM_ICDF(32768), }, - {AOM_ICDF(2689), AOM_ICDF(2969), AOM_ICDF(11371), AOM_ICDF(11651), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27703), AOM_ICDF(29662), AOM_ICDF(31910), AOM_ICDF(32262), - AOM_ICDF(32768), }, - {AOM_ICDF(17904), AOM_ICDF(21878), AOM_ICDF(30510), AOM_ICDF(30969), - AOM_ICDF(32768), }, - {AOM_ICDF(10660), AOM_ICDF(12299), AOM_ICDF(24907), AOM_ICDF(25524), - AOM_ICDF(32768), }, - {AOM_ICDF(6972), AOM_ICDF(7545), AOM_ICDF(18660), AOM_ICDF(19251), - AOM_ICDF(32768), }, - {AOM_ICDF(5359), AOM_ICDF(5768), AOM_ICDF(14022), AOM_ICDF(14397), - AOM_ICDF(32768), }, - {AOM_ICDF(5030), AOM_ICDF(5487), AOM_ICDF(10364), AOM_ICDF(10973), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(27980), AOM_ICDF(28880), AOM_ICDF(31045), AOM_ICDF(31931), - AOM_ICDF(32370), AOM_ICDF(32768), }, - {AOM_ICDF(15958), AOM_ICDF(19891), AOM_ICDF(25963), AOM_ICDF(29601), - AOM_ICDF(30931), AOM_ICDF(32768), }, - {AOM_ICDF(3897), AOM_ICDF(12331), AOM_ICDF(15935), AOM_ICDF(24489), - AOM_ICDF(26773), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(21443), AOM_ICDF(24237), AOM_ICDF(31473), AOM_ICDF(31808), - AOM_ICDF(32768), }, - {AOM_ICDF(18617), AOM_ICDF(22378), AOM_ICDF(30958), AOM_ICDF(31301), - AOM_ICDF(32768), }, - {AOM_ICDF(14626), AOM_ICDF(17725), AOM_ICDF(28852), AOM_ICDF(29246), - AOM_ICDF(32768), }, - {AOM_ICDF(12155), AOM_ICDF(14598), AOM_ICDF(26000), AOM_ICDF(26506), - AOM_ICDF(32768), }, - {AOM_ICDF(10111), AOM_ICDF(12149), AOM_ICDF(23415), AOM_ICDF(24002), - AOM_ICDF(32768), }, - {AOM_ICDF(11352), AOM_ICDF(12864), AOM_ICDF(22589), AOM_ICDF(23010), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22855), AOM_ICDF(25401), 
AOM_ICDF(31675), AOM_ICDF(31999), - AOM_ICDF(32768), }, - {AOM_ICDF(19086), AOM_ICDF(21008), AOM_ICDF(30886), AOM_ICDF(31214), - AOM_ICDF(32768), }, - {AOM_ICDF(13477), AOM_ICDF(14473), AOM_ICDF(28104), AOM_ICDF(28450), - AOM_ICDF(32768), }, - {AOM_ICDF(9553), AOM_ICDF(10401), AOM_ICDF(23815), AOM_ICDF(24225), - AOM_ICDF(32768), }, - {AOM_ICDF(5795), AOM_ICDF(6172), AOM_ICDF(18068), AOM_ICDF(18445), - AOM_ICDF(32768), }, - {AOM_ICDF(4297), AOM_ICDF(5909), AOM_ICDF(10206), AOM_ICDF(11818), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24424), AOM_ICDF(26344), AOM_ICDF(31912), AOM_ICDF(32231), - AOM_ICDF(32768), }, - {AOM_ICDF(20229), AOM_ICDF(21775), AOM_ICDF(31283), AOM_ICDF(31610), - AOM_ICDF(32768), }, - {AOM_ICDF(14224), AOM_ICDF(14882), AOM_ICDF(28673), AOM_ICDF(29012), - AOM_ICDF(32768), }, - {AOM_ICDF(10881), AOM_ICDF(11494), AOM_ICDF(23829), AOM_ICDF(24238), - AOM_ICDF(32768), }, - {AOM_ICDF(6367), AOM_ICDF(6988), AOM_ICDF(15685), AOM_ICDF(16306), - AOM_ICDF(32768), }, - {AOM_ICDF(7447), AOM_ICDF(11916), AOM_ICDF(17873), AOM_ICDF(22342), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25536), AOM_ICDF(27216), AOM_ICDF(31570), AOM_ICDF(31916), - AOM_ICDF(32768), }, - {AOM_ICDF(19600), AOM_ICDF(21062), AOM_ICDF(30095), AOM_ICDF(30444), - AOM_ICDF(32768), }, - {AOM_ICDF(11491), AOM_ICDF(12044), AOM_ICDF(26170), AOM_ICDF(26497), - AOM_ICDF(32768), }, - {AOM_ICDF(9629), AOM_ICDF(9963), AOM_ICDF(23790), AOM_ICDF(24112), - AOM_ICDF(32768), }, - {AOM_ICDF(8073), AOM_ICDF(8359), AOM_ICDF(22212), AOM_ICDF(22498), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27425), AOM_ICDF(29611), AOM_ICDF(32005), AOM_ICDF(32347), - AOM_ICDF(32768), }, - {AOM_ICDF(20590), AOM_ICDF(24265), AOM_ICDF(31252), AOM_ICDF(31658), - AOM_ICDF(32768), }, - {AOM_ICDF(14072), AOM_ICDF(15705), AOM_ICDF(28945), AOM_ICDF(29389), - AOM_ICDF(32768), }, - {AOM_ICDF(11295), AOM_ICDF(11926), 
AOM_ICDF(26485), AOM_ICDF(26872), - AOM_ICDF(32768), }, - {AOM_ICDF(10627), AOM_ICDF(11292), AOM_ICDF(22141), AOM_ICDF(22805), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 16X16 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(9850), AOM_ICDF(11321), AOM_ICDF(13211), AOM_ICDF(18246), - AOM_ICDF(21613), AOM_ICDF(32768), }, - {AOM_ICDF(4128), AOM_ICDF(6155), AOM_ICDF(7367), AOM_ICDF(11928), - AOM_ICDF(14060), AOM_ICDF(32768), }, - {AOM_ICDF(932), AOM_ICDF(2794), AOM_ICDF(3234), AOM_ICDF(6647), - AOM_ICDF(7340), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(9101), AOM_ICDF(10823), AOM_ICDF(21291), AOM_ICDF(22109), - AOM_ICDF(32768), }, - {AOM_ICDF(8086), AOM_ICDF(13032), AOM_ICDF(21855), AOM_ICDF(22748), - AOM_ICDF(32768), }, - {AOM_ICDF(6563), AOM_ICDF(10137), AOM_ICDF(18484), AOM_ICDF(20069), - AOM_ICDF(32768), }, - {AOM_ICDF(4987), AOM_ICDF(6567), AOM_ICDF(14425), AOM_ICDF(15700), - AOM_ICDF(32768), }, - {AOM_ICDF(3399), AOM_ICDF(3947), AOM_ICDF(9950), AOM_ICDF(10738), - AOM_ICDF(32768), }, - {AOM_ICDF(1474), AOM_ICDF(1793), AOM_ICDF(4347), AOM_ICDF(4690), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(17035), AOM_ICDF(18650), AOM_ICDF(27401), AOM_ICDF(27793), - AOM_ICDF(32768), }, - {AOM_ICDF(13213), AOM_ICDF(16039), AOM_ICDF(26044), AOM_ICDF(26448), - AOM_ICDF(32768), }, - {AOM_ICDF(9916), AOM_ICDF(11812), AOM_ICDF(22497), AOM_ICDF(22945), - AOM_ICDF(32768), }, - {AOM_ICDF(7227), AOM_ICDF(8059), AOM_ICDF(17399), AOM_ICDF(17817), - AOM_ICDF(32768), }, - {AOM_ICDF(5144), AOM_ICDF(5572), AOM_ICDF(12546), AOM_ICDF(12892), - AOM_ICDF(32768), }, - {AOM_ICDF(2364), AOM_ICDF(2678), AOM_ICDF(6057), AOM_ICDF(6372), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19805), AOM_ICDF(21667), AOM_ICDF(29302), AOM_ICDF(29680), - AOM_ICDF(32768), }, - {AOM_ICDF(14232), AOM_ICDF(16314), AOM_ICDF(27120), AOM_ICDF(27515), - AOM_ICDF(32768), }, - {AOM_ICDF(8796), 
AOM_ICDF(9578), AOM_ICDF(21112), AOM_ICDF(21479), - AOM_ICDF(32768), }, - {AOM_ICDF(5203), AOM_ICDF(5552), AOM_ICDF(14231), AOM_ICDF(14563), - AOM_ICDF(32768), }, - {AOM_ICDF(2943), AOM_ICDF(3257), AOM_ICDF(8676), AOM_ICDF(8994), - AOM_ICDF(32768), }, - {AOM_ICDF(1363), AOM_ICDF(1675), AOM_ICDF(4064), AOM_ICDF(4376), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24214), AOM_ICDF(25083), AOM_ICDF(30916), AOM_ICDF(31249), - AOM_ICDF(32768), }, - {AOM_ICDF(15904), AOM_ICDF(17001), AOM_ICDF(28199), AOM_ICDF(28532), - AOM_ICDF(32768), }, - {AOM_ICDF(8324), AOM_ICDF(8717), AOM_ICDF(20480), AOM_ICDF(20808), - AOM_ICDF(32768), }, - {AOM_ICDF(4752), AOM_ICDF(5070), AOM_ICDF(13245), AOM_ICDF(13565), - AOM_ICDF(32768), }, - {AOM_ICDF(2729), AOM_ICDF(3042), AOM_ICDF(8218), AOM_ICDF(8530), - AOM_ICDF(32768), }, - {AOM_ICDF(1385), AOM_ICDF(1697), AOM_ICDF(4196), AOM_ICDF(4508), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(26956), AOM_ICDF(27719), AOM_ICDF(31679), AOM_ICDF(32005), - AOM_ICDF(32768), }, - {AOM_ICDF(16913), AOM_ICDF(17759), AOM_ICDF(29092), AOM_ICDF(29422), - AOM_ICDF(32768), }, - {AOM_ICDF(8166), AOM_ICDF(8510), AOM_ICDF(20577), AOM_ICDF(20901), - AOM_ICDF(32768), }, - {AOM_ICDF(4804), AOM_ICDF(5119), AOM_ICDF(13537), AOM_ICDF(13853), - AOM_ICDF(32768), }, - {AOM_ICDF(2951), AOM_ICDF(3263), AOM_ICDF(8766), AOM_ICDF(9079), - AOM_ICDF(32768), }, - {AOM_ICDF(1498), AOM_ICDF(1810), AOM_ICDF(4515), AOM_ICDF(4827), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(7335), AOM_ICDF(13463), AOM_ICDF(14286), AOM_ICDF(24588), - AOM_ICDF(29117), AOM_ICDF(32768), }, - {AOM_ICDF(3212), AOM_ICDF(9869), AOM_ICDF(10336), AOM_ICDF(20172), - AOM_ICDF(25029), AOM_ICDF(32768), }, - {AOM_ICDF(917), AOM_ICDF(6904), AOM_ICDF(7251), AOM_ICDF(15225), - AOM_ICDF(18595), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23988), AOM_ICDF(24467), AOM_ICDF(31033), AOM_ICDF(31407), - AOM_ICDF(32768), }, - {AOM_ICDF(20390), AOM_ICDF(23805), AOM_ICDF(30556), 
AOM_ICDF(30920), - AOM_ICDF(32768), }, - {AOM_ICDF(13566), AOM_ICDF(16666), AOM_ICDF(27478), AOM_ICDF(27995), - AOM_ICDF(32768), }, - {AOM_ICDF(10353), AOM_ICDF(12637), AOM_ICDF(23789), AOM_ICDF(24437), - AOM_ICDF(32768), }, - {AOM_ICDF(7956), AOM_ICDF(9364), AOM_ICDF(19994), AOM_ICDF(20621), - AOM_ICDF(32768), }, - {AOM_ICDF(6036), AOM_ICDF(6495), AOM_ICDF(15543), AOM_ICDF(16033), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(25643), AOM_ICDF(26692), AOM_ICDF(31634), AOM_ICDF(31957), - AOM_ICDF(32768), }, - {AOM_ICDF(18721), AOM_ICDF(20381), AOM_ICDF(30130), AOM_ICDF(30466), - AOM_ICDF(32768), }, - {AOM_ICDF(10914), AOM_ICDF(12337), AOM_ICDF(24817), AOM_ICDF(25177), - AOM_ICDF(32768), }, - {AOM_ICDF(7843), AOM_ICDF(8667), AOM_ICDF(19826), AOM_ICDF(20212), - AOM_ICDF(32768), }, - {AOM_ICDF(5080), AOM_ICDF(5484), AOM_ICDF(14225), AOM_ICDF(14587), - AOM_ICDF(32768), }, - {AOM_ICDF(2880), AOM_ICDF(3192), AOM_ICDF(7916), AOM_ICDF(8236), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26447), AOM_ICDF(27233), AOM_ICDF(31779), AOM_ICDF(32097), - AOM_ICDF(32768), }, - {AOM_ICDF(19042), AOM_ICDF(20153), AOM_ICDF(30217), AOM_ICDF(30540), - AOM_ICDF(32768), }, - {AOM_ICDF(9858), AOM_ICDF(10440), AOM_ICDF(23424), AOM_ICDF(23753), - AOM_ICDF(32768), }, - {AOM_ICDF(6276), AOM_ICDF(6657), AOM_ICDF(17158), AOM_ICDF(17489), - AOM_ICDF(32768), }, - {AOM_ICDF(3725), AOM_ICDF(4039), AOM_ICDF(10981), AOM_ICDF(11303), - AOM_ICDF(32768), }, - {AOM_ICDF(2041), AOM_ICDF(2345), AOM_ICDF(6069), AOM_ICDF(6373), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27189), AOM_ICDF(27737), AOM_ICDF(31897), AOM_ICDF(32213), - AOM_ICDF(32768), }, - {AOM_ICDF(19763), AOM_ICDF(20443), AOM_ICDF(30288), AOM_ICDF(30607), - AOM_ICDF(32768), }, - {AOM_ICDF(9033), AOM_ICDF(9393), AOM_ICDF(22097), AOM_ICDF(22415), - AOM_ICDF(32768), }, - {AOM_ICDF(5417), AOM_ICDF(5747), AOM_ICDF(15230), AOM_ICDF(15545), - AOM_ICDF(32768), }, - {AOM_ICDF(3397), AOM_ICDF(3709), AOM_ICDF(10342), AOM_ICDF(10655), 
- AOM_ICDF(32768), }, - {AOM_ICDF(2805), AOM_ICDF(3108), AOM_ICDF(6119), AOM_ICDF(6422), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27874), AOM_ICDF(28490), AOM_ICDF(31981), AOM_ICDF(32301), - AOM_ICDF(32768), }, - {AOM_ICDF(20112), AOM_ICDF(20724), AOM_ICDF(30607), AOM_ICDF(30935), - AOM_ICDF(32768), }, - {AOM_ICDF(9188), AOM_ICDF(9549), AOM_ICDF(22544), AOM_ICDF(22875), - AOM_ICDF(32768), }, - {AOM_ICDF(5590), AOM_ICDF(5918), AOM_ICDF(15550), AOM_ICDF(15878), - AOM_ICDF(32768), }, - {AOM_ICDF(3567), AOM_ICDF(4015), AOM_ICDF(10658), AOM_ICDF(10988), - AOM_ICDF(32768), }, - {AOM_ICDF(1950), AOM_ICDF(2388), AOM_ICDF(6246), AOM_ICDF(6681), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(25724), AOM_ICDF(26337), AOM_ICDF(28579), AOM_ICDF(29957), - AOM_ICDF(30927), AOM_ICDF(32768), }, - {AOM_ICDF(9657), AOM_ICDF(12074), AOM_ICDF(16790), AOM_ICDF(21738), - AOM_ICDF(23899), AOM_ICDF(32768), }, - {AOM_ICDF(4158), AOM_ICDF(7646), AOM_ICDF(10690), AOM_ICDF(16969), - AOM_ICDF(18800), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(14330), AOM_ICDF(19826), AOM_ICDF(28364), AOM_ICDF(29154), - AOM_ICDF(32768), }, - {AOM_ICDF(13503), AOM_ICDF(21352), AOM_ICDF(28714), AOM_ICDF(29534), - AOM_ICDF(32768), }, - {AOM_ICDF(11754), AOM_ICDF(16853), AOM_ICDF(25931), AOM_ICDF(27325), - AOM_ICDF(32768), }, - {AOM_ICDF(8311), AOM_ICDF(10581), AOM_ICDF(21258), AOM_ICDF(22633), - AOM_ICDF(32768), }, - {AOM_ICDF(5297), AOM_ICDF(5819), AOM_ICDF(14162), AOM_ICDF(14892), - AOM_ICDF(32768), }, - {AOM_ICDF(2887), AOM_ICDF(3208), AOM_ICDF(7455), AOM_ICDF(7768), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22005), AOM_ICDF(24480), AOM_ICDF(30925), AOM_ICDF(31309), - AOM_ICDF(32768), }, - {AOM_ICDF(17332), AOM_ICDF(20557), AOM_ICDF(29696), AOM_ICDF(30096), - AOM_ICDF(32768), }, - {AOM_ICDF(11930), AOM_ICDF(14337), AOM_ICDF(25931), AOM_ICDF(26358), - AOM_ICDF(32768), }, - {AOM_ICDF(8888), AOM_ICDF(10020), AOM_ICDF(20964), 
AOM_ICDF(21352), - AOM_ICDF(32768), }, - {AOM_ICDF(5694), AOM_ICDF(6135), AOM_ICDF(14997), AOM_ICDF(15376), - AOM_ICDF(32768), }, - {AOM_ICDF(2521), AOM_ICDF(2842), AOM_ICDF(7765), AOM_ICDF(8069), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(23993), AOM_ICDF(25546), AOM_ICDF(31427), AOM_ICDF(31762), - AOM_ICDF(32768), }, - {AOM_ICDF(18134), AOM_ICDF(20327), AOM_ICDF(29992), AOM_ICDF(30386), - AOM_ICDF(32768), }, - {AOM_ICDF(10997), AOM_ICDF(12057), AOM_ICDF(24719), AOM_ICDF(25141), - AOM_ICDF(32768), }, - {AOM_ICDF(5719), AOM_ICDF(6153), AOM_ICDF(16654), AOM_ICDF(17032), - AOM_ICDF(32768), }, - {AOM_ICDF(3637), AOM_ICDF(3953), AOM_ICDF(11392), AOM_ICDF(11696), - AOM_ICDF(32768), }, - {AOM_ICDF(1837), AOM_ICDF(2127), AOM_ICDF(5703), AOM_ICDF(5993), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26095), AOM_ICDF(26989), AOM_ICDF(31766), AOM_ICDF(32091), - AOM_ICDF(32768), }, - {AOM_ICDF(19524), AOM_ICDF(20820), AOM_ICDF(30413), AOM_ICDF(30738), - AOM_ICDF(32768), }, - {AOM_ICDF(9962), AOM_ICDF(10551), AOM_ICDF(22667), AOM_ICDF(23010), - AOM_ICDF(32768), }, - {AOM_ICDF(5773), AOM_ICDF(6093), AOM_ICDF(15402), AOM_ICDF(15748), - AOM_ICDF(32768), }, - {AOM_ICDF(3546), AOM_ICDF(3850), AOM_ICDF(9983), AOM_ICDF(10287), - AOM_ICDF(32768), }, - {AOM_ICDF(2387), AOM_ICDF(2668), AOM_ICDF(5711), AOM_ICDF(5992), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29071), AOM_ICDF(29675), AOM_ICDF(31761), AOM_ICDF(32087), - AOM_ICDF(32768), }, - {AOM_ICDF(18709), AOM_ICDF(19761), AOM_ICDF(29374), AOM_ICDF(29730), - AOM_ICDF(32768), }, - {AOM_ICDF(9336), AOM_ICDF(10048), AOM_ICDF(22625), AOM_ICDF(22988), - AOM_ICDF(32768), }, - {AOM_ICDF(6446), AOM_ICDF(6793), AOM_ICDF(16834), AOM_ICDF(17172), - AOM_ICDF(32768), }, - {AOM_ICDF(4227), AOM_ICDF(4539), AOM_ICDF(11587), AOM_ICDF(11909), - AOM_ICDF(32768), }, - {AOM_ICDF(2624), AOM_ICDF(2929), AOM_ICDF(7139), AOM_ICDF(7444), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(25114), AOM_ICDF(25872), 
AOM_ICDF(29577), AOM_ICDF(31173), - AOM_ICDF(32008), AOM_ICDF(32768), }, - {AOM_ICDF(11286), AOM_ICDF(14376), AOM_ICDF(22156), AOM_ICDF(26266), - AOM_ICDF(29278), AOM_ICDF(32768), }, - {AOM_ICDF(2680), AOM_ICDF(11055), AOM_ICDF(14683), AOM_ICDF(23068), - AOM_ICDF(26651), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(22838), AOM_ICDF(24926), AOM_ICDF(31689), AOM_ICDF(32019), - AOM_ICDF(32768), }, - {AOM_ICDF(19245), AOM_ICDF(24299), AOM_ICDF(31481), AOM_ICDF(31852), - AOM_ICDF(32768), }, - {AOM_ICDF(15429), AOM_ICDF(21159), AOM_ICDF(30176), AOM_ICDF(30732), - AOM_ICDF(32768), }, - {AOM_ICDF(12373), AOM_ICDF(17092), AOM_ICDF(26912), AOM_ICDF(27758), - AOM_ICDF(32768), }, - {AOM_ICDF(10899), AOM_ICDF(13395), AOM_ICDF(23604), AOM_ICDF(24329), - AOM_ICDF(32768), }, - {AOM_ICDF(12767), AOM_ICDF(13096), AOM_ICDF(21644), AOM_ICDF(22083), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24527), AOM_ICDF(26101), AOM_ICDF(31912), AOM_ICDF(32226), - AOM_ICDF(32768), }, - {AOM_ICDF(20576), AOM_ICDF(22265), AOM_ICDF(31439), AOM_ICDF(31762), - AOM_ICDF(32768), }, - {AOM_ICDF(13792), AOM_ICDF(15369), AOM_ICDF(28531), AOM_ICDF(28942), - AOM_ICDF(32768), }, - {AOM_ICDF(9392), AOM_ICDF(11153), AOM_ICDF(23790), AOM_ICDF(24274), - AOM_ICDF(32768), }, - {AOM_ICDF(5587), AOM_ICDF(6191), AOM_ICDF(19027), AOM_ICDF(19480), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24737), AOM_ICDF(25605), AOM_ICDF(31953), AOM_ICDF(32268), - AOM_ICDF(32768), }, - {AOM_ICDF(20933), AOM_ICDF(21817), AOM_ICDF(31546), AOM_ICDF(31861), - AOM_ICDF(32768), }, - {AOM_ICDF(13887), AOM_ICDF(14656), AOM_ICDF(28490), AOM_ICDF(28817), - AOM_ICDF(32768), }, - {AOM_ICDF(10018), AOM_ICDF(11047), AOM_ICDF(23593), AOM_ICDF(23967), - AOM_ICDF(32768), }, - {AOM_ICDF(3855), AOM_ICDF(6746), AOM_ICDF(15420), AOM_ICDF(18312), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25861), AOM_ICDF(26475), AOM_ICDF(32028), AOM_ICDF(32343), - AOM_ICDF(32768), }, - {AOM_ICDF(22221), AOM_ICDF(22755), AOM_ICDF(31735), AOM_ICDF(32050), - AOM_ICDF(32768), }, - {AOM_ICDF(15517), AOM_ICDF(15928), AOM_ICDF(29558), AOM_ICDF(29870), - AOM_ICDF(32768), }, - {AOM_ICDF(7719), AOM_ICDF(8507), AOM_ICDF(20165), AOM_ICDF(20638), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(28675), AOM_ICDF(29326), AOM_ICDF(31767), AOM_ICDF(32092), - AOM_ICDF(32768), }, - {AOM_ICDF(21491), AOM_ICDF(22422), AOM_ICDF(29827), AOM_ICDF(30197), - AOM_ICDF(32768), }, - {AOM_ICDF(10080), AOM_ICDF(11350), AOM_ICDF(23883), AOM_ICDF(24321), - AOM_ICDF(32768), }, - {AOM_ICDF(8383), AOM_ICDF(8793), AOM_ICDF(21382), AOM_ICDF(21739), - AOM_ICDF(32768), }, - {AOM_ICDF(6835), AOM_ICDF(7137), AOM_ICDF(20646), AOM_ICDF(20947), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 32X32 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(15501), AOM_ICDF(16574), AOM_ICDF(17941), AOM_ICDF(20080), - AOM_ICDF(21984), AOM_ICDF(32768), }, - {AOM_ICDF(1676), AOM_ICDF(3221), AOM_ICDF(3952), AOM_ICDF(6916), - AOM_ICDF(7628), AOM_ICDF(32768), }, - {AOM_ICDF(468), AOM_ICDF(1825), AOM_ICDF(2211), AOM_ICDF(4504), - AOM_ICDF(4877), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(5597), AOM_ICDF(9461), AOM_ICDF(16777), AOM_ICDF(17896), - AOM_ICDF(32768), }, - {AOM_ICDF(5231), AOM_ICDF(9185), AOM_ICDF(16569), AOM_ICDF(17688), - AOM_ICDF(32768), }, - {AOM_ICDF(4128), AOM_ICDF(6983), AOM_ICDF(13860), AOM_ICDF(15674), - AOM_ICDF(32768), }, - {AOM_ICDF(2908), AOM_ICDF(4209), AOM_ICDF(9762), AOM_ICDF(11321), - AOM_ICDF(32768), }, - {AOM_ICDF(2269), AOM_ICDF(2797), AOM_ICDF(7063), 
AOM_ICDF(7999), - AOM_ICDF(32768), }, - {AOM_ICDF(1270), AOM_ICDF(1588), AOM_ICDF(3710), AOM_ICDF(4051), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(14862), AOM_ICDF(16903), AOM_ICDF(25712), AOM_ICDF(26189), - AOM_ICDF(32768), }, - {AOM_ICDF(12778), AOM_ICDF(15420), AOM_ICDF(25395), AOM_ICDF(25836), - AOM_ICDF(32768), }, - {AOM_ICDF(10402), AOM_ICDF(12279), AOM_ICDF(22858), AOM_ICDF(23302), - AOM_ICDF(32768), }, - {AOM_ICDF(8026), AOM_ICDF(8897), AOM_ICDF(18866), AOM_ICDF(19290), - AOM_ICDF(32768), }, - {AOM_ICDF(6610), AOM_ICDF(7121), AOM_ICDF(15967), AOM_ICDF(16322), - AOM_ICDF(32768), }, - {AOM_ICDF(3980), AOM_ICDF(4296), AOM_ICDF(10443), AOM_ICDF(10757), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19177), AOM_ICDF(21516), AOM_ICDF(28474), AOM_ICDF(28892), - AOM_ICDF(32768), }, - {AOM_ICDF(14901), AOM_ICDF(17006), AOM_ICDF(27100), AOM_ICDF(27500), - AOM_ICDF(32768), }, - {AOM_ICDF(10655), AOM_ICDF(11487), AOM_ICDF(23288), AOM_ICDF(23664), - AOM_ICDF(32768), }, - {AOM_ICDF(6980), AOM_ICDF(7408), AOM_ICDF(17955), AOM_ICDF(18288), - AOM_ICDF(32768), }, - {AOM_ICDF(3891), AOM_ICDF(4206), AOM_ICDF(11255), AOM_ICDF(11570), - AOM_ICDF(32768), }, - {AOM_ICDF(1532), AOM_ICDF(1844), AOM_ICDF(4593), AOM_ICDF(4905), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24338), AOM_ICDF(25864), AOM_ICDF(30962), AOM_ICDF(31346), - AOM_ICDF(32768), }, - {AOM_ICDF(16430), AOM_ICDF(18166), AOM_ICDF(28700), AOM_ICDF(29068), - AOM_ICDF(32768), }, - {AOM_ICDF(9726), AOM_ICDF(10244), AOM_ICDF(22575), AOM_ICDF(22934), - AOM_ICDF(32768), }, - {AOM_ICDF(5539), AOM_ICDF(5868), AOM_ICDF(15030), AOM_ICDF(15363), - AOM_ICDF(32768), }, - {AOM_ICDF(3305), AOM_ICDF(3620), AOM_ICDF(9405), AOM_ICDF(9720), - AOM_ICDF(32768), }, - {AOM_ICDF(1482), AOM_ICDF(1794), AOM_ICDF(4429), AOM_ICDF(4741), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29843), AOM_ICDF(30312), AOM_ICDF(31922), AOM_ICDF(32242), - AOM_ICDF(32768), }, - {AOM_ICDF(17390), AOM_ICDF(18061), AOM_ICDF(28932), 
AOM_ICDF(29258), - AOM_ICDF(32768), }, - {AOM_ICDF(7968), AOM_ICDF(8308), AOM_ICDF(20128), AOM_ICDF(20447), - AOM_ICDF(32768), }, - {AOM_ICDF(4523), AOM_ICDF(4838), AOM_ICDF(12959), AOM_ICDF(13274), - AOM_ICDF(32768), }, - {AOM_ICDF(2765), AOM_ICDF(3077), AOM_ICDF(8284), AOM_ICDF(8596), - AOM_ICDF(32768), }, - {AOM_ICDF(1422), AOM_ICDF(1733), AOM_ICDF(4244), AOM_ICDF(4556), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(19066), AOM_ICDF(20217), AOM_ICDF(21504), AOM_ICDF(24559), - AOM_ICDF(26831), AOM_ICDF(32768), }, - {AOM_ICDF(5708), AOM_ICDF(7393), AOM_ICDF(8108), AOM_ICDF(11986), - AOM_ICDF(17424), AOM_ICDF(32768), }, - {AOM_ICDF(1144), AOM_ICDF(2709), AOM_ICDF(3111), AOM_ICDF(6009), - AOM_ICDF(10882), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17586), AOM_ICDF(17895), AOM_ICDF(27561), AOM_ICDF(28179), - AOM_ICDF(32768), }, - {AOM_ICDF(16442), AOM_ICDF(19113), AOM_ICDF(27944), AOM_ICDF(28764), - AOM_ICDF(32768), }, - {AOM_ICDF(12438), AOM_ICDF(17724), AOM_ICDF(26435), AOM_ICDF(27714), - AOM_ICDF(32768), }, - {AOM_ICDF(9439), AOM_ICDF(12708), AOM_ICDF(22594), AOM_ICDF(24060), - AOM_ICDF(32768), }, - {AOM_ICDF(7762), AOM_ICDF(9639), AOM_ICDF(19669), AOM_ICDF(20614), - AOM_ICDF(32768), }, - {AOM_ICDF(5324), AOM_ICDF(5894), AOM_ICDF(14504), AOM_ICDF(15100), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23414), AOM_ICDF(25239), AOM_ICDF(31300), AOM_ICDF(31670), - AOM_ICDF(32768), }, - {AOM_ICDF(18089), AOM_ICDF(22136), AOM_ICDF(30318), AOM_ICDF(30720), - AOM_ICDF(32768), }, - {AOM_ICDF(12081), AOM_ICDF(15216), AOM_ICDF(27074), AOM_ICDF(27531), - AOM_ICDF(32768), }, - {AOM_ICDF(9327), AOM_ICDF(10783), AOM_ICDF(22927), AOM_ICDF(23384), - AOM_ICDF(32768), }, - {AOM_ICDF(6381), AOM_ICDF(6914), AOM_ICDF(17070), AOM_ICDF(17506), - AOM_ICDF(32768), }, - {AOM_ICDF(3854), AOM_ICDF(4164), AOM_ICDF(10355), AOM_ICDF(10665), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24366), AOM_ICDF(25993), AOM_ICDF(31678), AOM_ICDF(32001), - 
AOM_ICDF(32768), }, - {AOM_ICDF(18041), AOM_ICDF(21047), AOM_ICDF(30693), AOM_ICDF(31031), - AOM_ICDF(32768), }, - {AOM_ICDF(11271), AOM_ICDF(12970), AOM_ICDF(26794), AOM_ICDF(27180), - AOM_ICDF(32768), }, - {AOM_ICDF(8173), AOM_ICDF(8758), AOM_ICDF(21941), AOM_ICDF(22340), - AOM_ICDF(32768), }, - {AOM_ICDF(5248), AOM_ICDF(5568), AOM_ICDF(15646), AOM_ICDF(15994), - AOM_ICDF(32768), }, - {AOM_ICDF(2689), AOM_ICDF(3193), AOM_ICDF(6722), AOM_ICDF(7226), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27565), AOM_ICDF(28694), AOM_ICDF(31993), AOM_ICDF(32314), - AOM_ICDF(32768), }, - {AOM_ICDF(20591), AOM_ICDF(22532), AOM_ICDF(31143), AOM_ICDF(31473), - AOM_ICDF(32768), }, - {AOM_ICDF(11268), AOM_ICDF(12113), AOM_ICDF(25966), AOM_ICDF(26331), - AOM_ICDF(32768), }, - {AOM_ICDF(7268), AOM_ICDF(7674), AOM_ICDF(19409), AOM_ICDF(19747), - AOM_ICDF(32768), }, - {AOM_ICDF(4404), AOM_ICDF(4686), AOM_ICDF(13213), AOM_ICDF(13495), - AOM_ICDF(32768), }, - {AOM_ICDF(2637), AOM_ICDF(3766), AOM_ICDF(7533), AOM_ICDF(8663), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29847), AOM_ICDF(30306), AOM_ICDF(32081), AOM_ICDF(32397), - AOM_ICDF(32768), }, - {AOM_ICDF(22752), AOM_ICDF(23329), AOM_ICDF(31334), AOM_ICDF(31658), - AOM_ICDF(32768), }, - {AOM_ICDF(10305), AOM_ICDF(10672), AOM_ICDF(24328), AOM_ICDF(24657), - AOM_ICDF(32768), }, - {AOM_ICDF(5712), AOM_ICDF(6031), AOM_ICDF(16694), AOM_ICDF(17018), - AOM_ICDF(32768), }, - {AOM_ICDF(3979), AOM_ICDF(4278), AOM_ICDF(10985), AOM_ICDF(11284), - AOM_ICDF(32768), }, - {AOM_ICDF(2465), AOM_ICDF(2900), AOM_ICDF(6815), AOM_ICDF(7250), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(21609), AOM_ICDF(22111), AOM_ICDF(24624), AOM_ICDF(26045), - AOM_ICDF(27916), AOM_ICDF(32768), }, - {AOM_ICDF(5498), AOM_ICDF(7300), AOM_ICDF(12100), AOM_ICDF(15851), - AOM_ICDF(18370), AOM_ICDF(32768), }, - {AOM_ICDF(1268), AOM_ICDF(3284), AOM_ICDF(6295), AOM_ICDF(10711), - AOM_ICDF(12999), AOM_ICDF(32768), }, - 
}, - { // Band 1 - {AOM_ICDF(9621), AOM_ICDF(16733), AOM_ICDF(26354), AOM_ICDF(27609), - AOM_ICDF(32768), }, - {AOM_ICDF(9619), AOM_ICDF(18339), AOM_ICDF(27578), AOM_ICDF(28547), - AOM_ICDF(32768), }, - {AOM_ICDF(9575), AOM_ICDF(18177), AOM_ICDF(24044), AOM_ICDF(25625), - AOM_ICDF(32768), }, - {AOM_ICDF(5999), AOM_ICDF(11578), AOM_ICDF(20125), AOM_ICDF(22544), - AOM_ICDF(32768), }, - {AOM_ICDF(4842), AOM_ICDF(6220), AOM_ICDF(12898), AOM_ICDF(14944), - AOM_ICDF(32768), }, - {AOM_ICDF(948), AOM_ICDF(1247), AOM_ICDF(3292), AOM_ICDF(3791), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21002), AOM_ICDF(25135), AOM_ICDF(31208), AOM_ICDF(31629), - AOM_ICDF(32768), }, - {AOM_ICDF(18106), AOM_ICDF(22116), AOM_ICDF(29422), AOM_ICDF(30013), - AOM_ICDF(32768), }, - {AOM_ICDF(14580), AOM_ICDF(15855), AOM_ICDF(26171), AOM_ICDF(26535), - AOM_ICDF(32768), }, - {AOM_ICDF(9965), AOM_ICDF(10971), AOM_ICDF(23043), AOM_ICDF(23378), - AOM_ICDF(32768), }, - {AOM_ICDF(7123), AOM_ICDF(7395), AOM_ICDF(16893), AOM_ICDF(17232), - AOM_ICDF(32768), }, - {AOM_ICDF(3187), AOM_ICDF(3432), AOM_ICDF(7600), AOM_ICDF(7845), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26393), AOM_ICDF(27823), AOM_ICDF(31691), AOM_ICDF(32028), - AOM_ICDF(32768), }, - {AOM_ICDF(18449), AOM_ICDF(20915), AOM_ICDF(30092), AOM_ICDF(30531), - AOM_ICDF(32768), }, - {AOM_ICDF(11710), AOM_ICDF(12263), AOM_ICDF(26838), AOM_ICDF(27139), - AOM_ICDF(32768), }, - {AOM_ICDF(7737), AOM_ICDF(8192), AOM_ICDF(21299), AOM_ICDF(21572), - AOM_ICDF(32768), }, - {AOM_ICDF(3572), AOM_ICDF(4038), AOM_ICDF(13822), AOM_ICDF(14287), - AOM_ICDF(32768), }, - {AOM_ICDF(1689), AOM_ICDF(2703), AOM_ICDF(3716), AOM_ICDF(4729), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28371), AOM_ICDF(29507), AOM_ICDF(31986), AOM_ICDF(32314), - AOM_ICDF(32768), }, - {AOM_ICDF(19411), AOM_ICDF(21758), AOM_ICDF(30225), AOM_ICDF(30579), - AOM_ICDF(32768), }, - {AOM_ICDF(11995), AOM_ICDF(12434), AOM_ICDF(26661), AOM_ICDF(27026), - AOM_ICDF(32768), 
}, - {AOM_ICDF(9175), AOM_ICDF(9721), AOM_ICDF(22173), AOM_ICDF(22501), - AOM_ICDF(32768), }, - {AOM_ICDF(9078), AOM_ICDF(9742), AOM_ICDF(13063), AOM_ICDF(13727), - AOM_ICDF(32768), }, - {AOM_ICDF(3192), AOM_ICDF(3830), AOM_ICDF(6809), AOM_ICDF(7447), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(31351), AOM_ICDF(31682), AOM_ICDF(32124), AOM_ICDF(32438), - AOM_ICDF(32768), }, - {AOM_ICDF(20883), AOM_ICDF(22618), AOM_ICDF(30828), AOM_ICDF(31173), - AOM_ICDF(32768), }, - {AOM_ICDF(11388), AOM_ICDF(12381), AOM_ICDF(24266), AOM_ICDF(24700), - AOM_ICDF(32768), }, - {AOM_ICDF(6987), AOM_ICDF(7380), AOM_ICDF(18401), AOM_ICDF(18795), - AOM_ICDF(32768), }, - {AOM_ICDF(2016), AOM_ICDF(2773), AOM_ICDF(7814), AOM_ICDF(8570), - AOM_ICDF(32768), }, - {AOM_ICDF(2849), AOM_ICDF(4986), AOM_ICDF(8548), AOM_ICDF(10685), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(19461), AOM_ICDF(21728), AOM_ICDF(26601), AOM_ICDF(29082), - AOM_ICDF(30105), AOM_ICDF(32768), }, - {AOM_ICDF(2845), AOM_ICDF(10798), AOM_ICDF(14316), AOM_ICDF(23118), - AOM_ICDF(24609), AOM_ICDF(32768), }, - {AOM_ICDF(705), AOM_ICDF(10138), AOM_ICDF(12123), AOM_ICDF(21473), - AOM_ICDF(23327), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(24780), AOM_ICDF(25836), AOM_ICDF(31623), AOM_ICDF(31938), - AOM_ICDF(32768), }, - {AOM_ICDF(22703), AOM_ICDF(24390), AOM_ICDF(31353), AOM_ICDF(31797), - AOM_ICDF(32768), }, - {AOM_ICDF(18218), AOM_ICDF(20834), AOM_ICDF(29429), AOM_ICDF(30327), - AOM_ICDF(32768), }, - {AOM_ICDF(12517), AOM_ICDF(15626), AOM_ICDF(26000), AOM_ICDF(27281), - AOM_ICDF(32768), }, - {AOM_ICDF(9988), AOM_ICDF(12791), AOM_ICDF(24073), AOM_ICDF(25295), - AOM_ICDF(32768), }, - {AOM_ICDF(8529), AOM_ICDF(9202), AOM_ICDF(18853), AOM_ICDF(19751), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(26497), AOM_ICDF(27282), AOM_ICDF(32016), AOM_ICDF(32333), - AOM_ICDF(32768), }, - {AOM_ICDF(22767), AOM_ICDF(24548), AOM_ICDF(31680), AOM_ICDF(32007), - AOM_ICDF(32768), }, - 
{AOM_ICDF(10455), AOM_ICDF(13458), AOM_ICDF(26448), AOM_ICDF(26995), - AOM_ICDF(32768), }, - {AOM_ICDF(3684), AOM_ICDF(4847), AOM_ICDF(20940), AOM_ICDF(21522), - AOM_ICDF(32768), }, - {AOM_ICDF(9063), AOM_ICDF(11155), AOM_ICDF(17430), AOM_ICDF(19521), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(16384), AOM_ICDF(21299), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26212), AOM_ICDF(26755), AOM_ICDF(32090), AOM_ICDF(32400), - AOM_ICDF(32768), }, - {AOM_ICDF(22239), AOM_ICDF(23123), AOM_ICDF(31406), AOM_ICDF(31725), - AOM_ICDF(32768), }, - {AOM_ICDF(7220), AOM_ICDF(7609), AOM_ICDF(22715), AOM_ICDF(22993), - AOM_ICDF(32768), }, - {AOM_ICDF(5554), AOM_ICDF(6387), AOM_ICDF(11941), AOM_ICDF(12774), - AOM_ICDF(32768), }, - {AOM_ICDF(4915), AOM_ICDF(9830), AOM_ICDF(19661), AOM_ICDF(24576), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28796), AOM_ICDF(29237), AOM_ICDF(32134), AOM_ICDF(32446), - AOM_ICDF(32768), }, - {AOM_ICDF(25912), AOM_ICDF(26456), AOM_ICDF(32010), AOM_ICDF(32321), - AOM_ICDF(32768), }, - {AOM_ICDF(14399), AOM_ICDF(14668), AOM_ICDF(26039), AOM_ICDF(26309), - AOM_ICDF(32768), }, - {AOM_ICDF(2341), AOM_ICDF(4096), AOM_ICDF(11703), AOM_ICDF(13458), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30253), AOM_ICDF(30635), AOM_ICDF(32016), AOM_ICDF(32330), - AOM_ICDF(32768), }, - {AOM_ICDF(23066), AOM_ICDF(23485), AOM_ICDF(30571), AOM_ICDF(30897), - AOM_ICDF(32768), }, - {AOM_ICDF(11664), AOM_ICDF(12092), AOM_ICDF(22146), AOM_ICDF(22496), - AOM_ICDF(32768), }, - {AOM_ICDF(5932), AOM_ICDF(6387), AOM_ICDF(17131), AOM_ICDF(17470), - AOM_ICDF(32768), }, - {AOM_ICDF(5501), AOM_ICDF(5846), AOM_ICDF(15763), AOM_ICDF(16097), - AOM_ICDF(32768), }, - 
{AOM_ICDF(4946), AOM_ICDF(6801), AOM_ICDF(14838), AOM_ICDF(16693), - AOM_ICDF(32768), }, - }, - }, - }, - }, -}; +static const aom_cdf_prob + av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] + [CDF_SIZE(2)] = { + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + }; + +static const aom_cdf_prob + av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] + [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, + { AOM_CDF2(5892) }, + { AOM_CDF2(12112) }, + { AOM_CDF2(21935) }, + { AOM_CDF2(20289) }, + { AOM_CDF2(27473) }, + { AOM_CDF2(32487) }, + { AOM_CDF2(7654) }, + { AOM_CDF2(19473) }, + { AOM_CDF2(29984) }, + { AOM_CDF2(9961) }, + { AOM_CDF2(30242) }, + { AOM_CDF2(32117) } }, + { { AOM_CDF2(31548) }, + { AOM_CDF2(1549) }, + { AOM_CDF2(10130) }, + { AOM_CDF2(16656) }, + { AOM_CDF2(18591) }, + { AOM_CDF2(26308) }, + { AOM_CDF2(32537) }, + { AOM_CDF2(5403) }, + { AOM_CDF2(18096) }, + { AOM_CDF2(30003) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(29957) }, + { AOM_CDF2(5391) }, + { AOM_CDF2(18039) }, + { AOM_CDF2(23566) }, + { AOM_CDF2(22431) }, + { AOM_CDF2(25822) }, + { AOM_CDF2(32197) }, + { AOM_CDF2(3778) }, + { AOM_CDF2(15336) }, + { AOM_CDF2(28981) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } 
}, + { { AOM_CDF2(17920) }, + { AOM_CDF2(1818) }, + { AOM_CDF2(7282) }, + { AOM_CDF2(25273) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(31554) }, + { AOM_CDF2(32624) }, + { AOM_CDF2(1366) }, + { AOM_CDF2(15628) }, + { AOM_CDF2(30462) }, + { AOM_CDF2(146) }, + { AOM_CDF2(5132) }, + { AOM_CDF2(31657) } }, + { { AOM_CDF2(6308) }, + { AOM_CDF2(117) }, + { AOM_CDF2(1638) }, + { AOM_CDF2(2161) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(30247) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(30371) }, + { AOM_CDF2(7570) }, + { AOM_CDF2(13155) }, + { AOM_CDF2(20751) }, + { AOM_CDF2(20969) }, + { AOM_CDF2(27067) }, + { AOM_CDF2(32013) }, + { AOM_CDF2(5495) }, + { AOM_CDF2(17942) }, + { AOM_CDF2(28280) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31782) }, + { AOM_CDF2(1836) }, + { AOM_CDF2(10689) }, + { AOM_CDF2(17604) }, + { AOM_CDF2(21622) }, + { AOM_CDF2(27518) }, + { AOM_CDF2(32399) }, + { AOM_CDF2(4419) }, + { AOM_CDF2(16294) }, + { AOM_CDF2(28345) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31901) }, + { AOM_CDF2(10311) }, + { AOM_CDF2(18047) }, + { AOM_CDF2(24806) }, + { AOM_CDF2(23288) }, + { AOM_CDF2(27914) }, + { AOM_CDF2(32296) }, + { AOM_CDF2(4215) }, + { AOM_CDF2(15756) }, + { AOM_CDF2(28341) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(26726) }, + { AOM_CDF2(1045) }, + { AOM_CDF2(11703) }, + { AOM_CDF2(20590) }, + { AOM_CDF2(18554) }, + { AOM_CDF2(25970) }, + { AOM_CDF2(31938) }, + { AOM_CDF2(5583) }, + { AOM_CDF2(21313) }, + { AOM_CDF2(29390) }, + { AOM_CDF2(641) }, + { AOM_CDF2(22265) }, + { AOM_CDF2(31452) } }, + { { AOM_CDF2(26584) }, + { AOM_CDF2(188) }, + { AOM_CDF2(8847) }, + { AOM_CDF2(24519) }, + { AOM_CDF2(22938) }, + { AOM_CDF2(30583) }, + { AOM_CDF2(32608) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(29614) }, + { AOM_CDF2(9068) }, + { AOM_CDF2(12924) }, + { AOM_CDF2(19538) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(24619) }, + { AOM_CDF2(30642) }, + { AOM_CDF2(4119) }, + { AOM_CDF2(16026) }, + { AOM_CDF2(25657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31957) }, + { AOM_CDF2(3230) }, + { AOM_CDF2(11153) }, + { AOM_CDF2(18123) }, + { AOM_CDF2(20143) }, + { AOM_CDF2(26536) }, + { AOM_CDF2(31986) }, + { AOM_CDF2(3050) }, + { AOM_CDF2(14603) }, + { AOM_CDF2(25155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32363) }, + { AOM_CDF2(10692) }, + { AOM_CDF2(19090) }, + { AOM_CDF2(24357) }, + { AOM_CDF2(24442) }, + { AOM_CDF2(28312) }, + { AOM_CDF2(32169) }, + { AOM_CDF2(3648) }, + { AOM_CDF2(15690) }, + { AOM_CDF2(26815) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(30669) }, + { AOM_CDF2(3832) }, + { AOM_CDF2(11663) }, + { AOM_CDF2(18889) }, + { AOM_CDF2(19782) }, + { AOM_CDF2(23313) }, + { AOM_CDF2(31330) }, + { AOM_CDF2(5124) }, + { AOM_CDF2(18719) }, + { AOM_CDF2(28468) }, + { AOM_CDF2(3082) }, + { AOM_CDF2(20982) }, + { AOM_CDF2(29443) } }, + { { AOM_CDF2(28573) }, + { AOM_CDF2(3183) }, + { AOM_CDF2(17802) }, + { AOM_CDF2(25977) }, + { AOM_CDF2(26677) }, + { AOM_CDF2(27832) }, + { AOM_CDF2(32387) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(26887) }, + { AOM_CDF2(6729) }, + { AOM_CDF2(10361) }, + { AOM_CDF2(17442) }, + { AOM_CDF2(15045) }, + { AOM_CDF2(22478) }, + { AOM_CDF2(29072) }, + { AOM_CDF2(2713) }, + { AOM_CDF2(11861) }, + { AOM_CDF2(20773) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31903) }, + { AOM_CDF2(2044) }, + { AOM_CDF2(7528) }, + 
{ AOM_CDF2(14618) }, + { AOM_CDF2(16182) }, + { AOM_CDF2(24168) }, + { AOM_CDF2(31037) }, + { AOM_CDF2(2786) }, + { AOM_CDF2(11194) }, + { AOM_CDF2(20155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32510) }, + { AOM_CDF2(8430) }, + { AOM_CDF2(17318) }, + { AOM_CDF2(24154) }, + { AOM_CDF2(23674) }, + { AOM_CDF2(28789) }, + { AOM_CDF2(32139) }, + { AOM_CDF2(3440) }, + { AOM_CDF2(13117) }, + { AOM_CDF2(22702) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31671) }, + { AOM_CDF2(2056) }, + { AOM_CDF2(11746) }, + { AOM_CDF2(16852) }, + { AOM_CDF2(18635) }, + { AOM_CDF2(24715) }, + { AOM_CDF2(31484) }, + { AOM_CDF2(4656) }, + { AOM_CDF2(16074) }, + { AOM_CDF2(24704) }, + { AOM_CDF2(1806) }, + { AOM_CDF2(14645) }, + { AOM_CDF2(25336) } }, + { { AOM_CDF2(31539) }, + { AOM_CDF2(8433) }, + { AOM_CDF2(20576) }, + { AOM_CDF2(27904) }, + { AOM_CDF2(27852) }, + { AOM_CDF2(30026) }, + { AOM_CDF2(32441) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } } }; + +static const aom_cdf_prob + av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { + { { { + { AOM_CDF2(16961) }, + { AOM_CDF2(17223) }, + { AOM_CDF2(7621) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(19069) }, + { AOM_CDF2(22525) }, + { AOM_CDF2(13377) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20401) }, + { AOM_CDF2(17025) }, + { AOM_CDF2(12845) }, + { AOM_CDF2(12873) }, + { AOM_CDF2(14094) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20681) }, + { AOM_CDF2(20701) }, + { AOM_CDF2(15250) 
}, + { AOM_CDF2(15017) }, + { AOM_CDF2(14928) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23905) }, + { AOM_CDF2(17194) }, + { AOM_CDF2(16170) }, + { AOM_CDF2(17695) }, + { AOM_CDF2(13826) }, + { AOM_CDF2(15810) }, + { AOM_CDF2(12036) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(23959) }, + { AOM_CDF2(20799) }, + { AOM_CDF2(19021) }, + { AOM_CDF2(16203) }, + { AOM_CDF2(17886) }, + { AOM_CDF2(14144) }, + { AOM_CDF2(12010) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(27399) }, + { AOM_CDF2(16327) }, + { AOM_CDF2(18071) }, + { AOM_CDF2(19584) }, + { AOM_CDF2(20721) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(19560) }, + { AOM_CDF2(10150) }, + { AOM_CDF2(8805) }, + }, + { + { AOM_CDF2(24932) }, + { AOM_CDF2(20833) }, + { AOM_CDF2(12027) }, + { AOM_CDF2(16670) }, + { AOM_CDF2(19914) }, + { AOM_CDF2(15106) }, + { AOM_CDF2(17662) }, + { AOM_CDF2(13783) }, + { AOM_CDF2(28756) }, + } }, + { { + { AOM_CDF2(23406) }, + { AOM_CDF2(21845) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(17096) }, + { AOM_CDF2(12561) }, + { AOM_CDF2(17320) }, + { AOM_CDF2(22395) }, + { AOM_CDF2(21370) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(17471) }, + { AOM_CDF2(20223) }, + { AOM_CDF2(11357) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20335) }, + { AOM_CDF2(21667) }, + { AOM_CDF2(14818) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20430) }, + { AOM_CDF2(20662) }, + { AOM_CDF2(15367) }, + { AOM_CDF2(16970) }, + { 
AOM_CDF2(14657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22117) }, + { AOM_CDF2(22028) }, + { AOM_CDF2(18650) }, + { AOM_CDF2(16042) }, + { AOM_CDF2(15885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(22409) }, + { AOM_CDF2(21012) }, + { AOM_CDF2(15650) }, + { AOM_CDF2(17395) }, + { AOM_CDF2(15469) }, + { AOM_CDF2(20205) }, + { AOM_CDF2(19511) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(24220) }, + { AOM_CDF2(22480) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(18916) }, + { AOM_CDF2(19268) }, + { AOM_CDF2(18412) }, + { AOM_CDF2(18844) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(25991) }, + { AOM_CDF2(20314) }, + { AOM_CDF2(17731) }, + { AOM_CDF2(19678) }, + { AOM_CDF2(18649) }, + { AOM_CDF2(17307) }, + { AOM_CDF2(21798) }, + { AOM_CDF2(17549) }, + { AOM_CDF2(15630) }, + }, + { + { AOM_CDF2(26585) }, + { AOM_CDF2(21469) }, + { AOM_CDF2(20432) }, + { AOM_CDF2(17735) }, + { AOM_CDF2(19280) }, + { AOM_CDF2(15235) }, + { AOM_CDF2(20297) }, + { AOM_CDF2(22471) }, + { AOM_CDF2(28997) }, + } }, + { { + { AOM_CDF2(26605) }, + { AOM_CDF2(11304) }, + { AOM_CDF2(16726) }, + { AOM_CDF2(16560) }, + { AOM_CDF2(20866) }, + { AOM_CDF2(23524) }, + { AOM_CDF2(19878) }, + { AOM_CDF2(13469) }, + { AOM_CDF2(23084) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(18983) }, + { AOM_CDF2(20512) }, + { AOM_CDF2(14885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20090) }, + { AOM_CDF2(19444) }, + { AOM_CDF2(17286) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19139) }, + { AOM_CDF2(21487) }, + { AOM_CDF2(18959) }, + { AOM_CDF2(20910) }, + { AOM_CDF2(19089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20536) }, + { AOM_CDF2(20664) }, + { AOM_CDF2(20625) }, + { AOM_CDF2(19123) }, + { AOM_CDF2(14862) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19833) }, + { AOM_CDF2(21502) }, + { AOM_CDF2(17485) }, + { AOM_CDF2(20267) }, + { AOM_CDF2(18353) }, + { AOM_CDF2(23329) }, + { AOM_CDF2(21478) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22041) }, + { AOM_CDF2(23434) }, + { AOM_CDF2(20001) }, + { AOM_CDF2(20554) }, + { AOM_CDF2(20951) }, + { AOM_CDF2(20145) }, + { AOM_CDF2(15562) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23312) }, + { AOM_CDF2(21607) }, + { AOM_CDF2(16526) }, + { AOM_CDF2(18957) }, + { AOM_CDF2(18034) }, + { AOM_CDF2(18934) }, + { AOM_CDF2(24247) }, + { AOM_CDF2(16921) }, + { AOM_CDF2(17080) }, + }, + { + { AOM_CDF2(26579) }, + { AOM_CDF2(24910) }, + { AOM_CDF2(18637) }, + { AOM_CDF2(19800) }, + { AOM_CDF2(20388) }, + { AOM_CDF2(9887) }, + { AOM_CDF2(15642) }, + { AOM_CDF2(30198) }, + { AOM_CDF2(24721) }, + } }, + { { + { AOM_CDF2(26998) }, + { AOM_CDF2(16737) }, + { AOM_CDF2(17838) }, + { AOM_CDF2(18922) }, + { AOM_CDF2(19515) }, + { AOM_CDF2(18636) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(15776) }, + { AOM_CDF2(22658) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(20177) }, + { AOM_CDF2(20789) }, + { AOM_CDF2(20262) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(21416) }, + { AOM_CDF2(20855) }, + { AOM_CDF2(23410) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20238) }, + { AOM_CDF2(21057) }, + { AOM_CDF2(19159) }, + { AOM_CDF2(22337) }, + { AOM_CDF2(20159) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20125) }, + { AOM_CDF2(20559) }, + { AOM_CDF2(21707) }, + { AOM_CDF2(22296) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19941) }, + { AOM_CDF2(20527) }, + { AOM_CDF2(21470) }, + { AOM_CDF2(22487) }, + { AOM_CDF2(19558) }, + { AOM_CDF2(22354) }, + { AOM_CDF2(20331) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22752) }, + { AOM_CDF2(25006) }, + { AOM_CDF2(22075) }, + { AOM_CDF2(21576) }, + { AOM_CDF2(17740) }, + { AOM_CDF2(21690) }, + { AOM_CDF2(19211) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(21442) }, + { AOM_CDF2(22358) }, + { AOM_CDF2(18503) }, + { AOM_CDF2(20291) }, + { AOM_CDF2(19945) }, + { AOM_CDF2(21294) }, + { AOM_CDF2(21178) }, + { AOM_CDF2(19400) }, + { AOM_CDF2(10556) }, + }, + { + { AOM_CDF2(24648) }, + { AOM_CDF2(24949) }, + { AOM_CDF2(20708) }, + { AOM_CDF2(23905) }, + { AOM_CDF2(20501) }, + { AOM_CDF2(9558) }, + { AOM_CDF2(9423) }, + { AOM_CDF2(30365) }, + { AOM_CDF2(19253) }, + } }, + { { + { AOM_CDF2(26064) }, + { AOM_CDF2(22098) }, + { AOM_CDF2(19613) }, + { AOM_CDF2(20525) }, + { AOM_CDF2(17595) }, + { AOM_CDF2(16618) }, + { AOM_CDF2(20497) }, + { AOM_CDF2(18989) }, + { AOM_CDF2(15513) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, 
+ { AOM_CDF2(16384) }, + } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, + { AOM_CDF5(370, 671, 1883, 4471) } }, + { { AOM_CDF5(3247, 4950, 9688, 14563) }, + { AOM_CDF5(1904, 3354, 7763, 14647) } } }, + { { { AOM_CDF5(2125, 2551, 5165, 8946) }, + { AOM_CDF5(513, 765, 1859, 6339) } }, + { { AOM_CDF5(7637, 9498, 14259, 19108) }, + { AOM_CDF5(2497, 4096, 8866, 16993) } } }, + { { { AOM_CDF5(4016, 4897, 8881, 14968) }, + { AOM_CDF5(716, 1105, 2646, 10056) } }, + { { AOM_CDF5(11139, 13270, 18241, 23566) }, + { AOM_CDF5(3192, 5032, 10297, 19755) } } }, + { { { AOM_CDF5(6708, 8958, 14746, 22133) }, + { AOM_CDF5(1222, 2074, 4783, 15410) } }, + { { AOM_CDF5(19575, 21766, 26044, 29709) }, + { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, + { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, + { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, + { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, + { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, + { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, + { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, + { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, + { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, + { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, + { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, + { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, + { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, + { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, + { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, + { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, + { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, + { 
{ AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, + { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, + { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, + { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, + { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, + { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, + { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, + { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, + { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, + { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, + { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, + { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, + { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, + { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 8)] = { + { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, + { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, + { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, + { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, + { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, + { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, + { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } }, + { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, + { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, + { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, + { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } + }; + +static const aom_cdf_prob + 
av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 9)] = { + { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, + { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, + { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, + { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, + { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, + { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, + { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, + { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, + { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, + { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, + { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, + { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, + 32708) } } }, + { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, + { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, + { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, + { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, + 32695) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, + 23412, 26061) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, + 26129, 29140) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, + 24584, 28749) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, + 28396, 31811) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) 
} } }, + { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, + 28410, 31078) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, + 27300, 29219, 32114) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, + 31142, 32060) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, + 30073, 30820, 31956) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } } }; -static const coeff_cdf_model -av1_default_coef_head_cdfs_q1[TX_SIZES][PLANE_TYPES] = { - { // TX 4X4 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(21480), AOM_ICDF(22344), AOM_ICDF(27339), AOM_ICDF(29181), - AOM_ICDF(29765), AOM_ICDF(32768), }, - {AOM_ICDF(9705), AOM_ICDF(12374), AOM_ICDF(20269), AOM_ICDF(24109), - AOM_ICDF(25071), AOM_ICDF(32768), }, - {AOM_ICDF(2883), AOM_ICDF(6716), AOM_ICDF(10461), AOM_ICDF(16169), - AOM_ICDF(17355), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(8632), AOM_ICDF(15472), AOM_ICDF(26027), AOM_ICDF(26596), - AOM_ICDF(32768), }, - {AOM_ICDF(8543), AOM_ICDF(14383), AOM_ICDF(25665), AOM_ICDF(26207), - AOM_ICDF(32768), }, - {AOM_ICDF(8561), AOM_ICDF(12583), AOM_ICDF(22962), AOM_ICDF(23529), - AOM_ICDF(32768), }, - {AOM_ICDF(6538), AOM_ICDF(8023), AOM_ICDF(18106), AOM_ICDF(18672), - AOM_ICDF(32768), }, - {AOM_ICDF(4363), AOM_ICDF(4797), AOM_ICDF(12512), AOM_ICDF(12937), - AOM_ICDF(32768), }, - {AOM_ICDF(2471), AOM_ICDF(2791), AOM_ICDF(7274), AOM_ICDF(7605), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(14783), AOM_ICDF(18891), AOM_ICDF(29122), AOM_ICDF(29700), - AOM_ICDF(32768), }, - {AOM_ICDF(11829), AOM_ICDF(16696), AOM_ICDF(28114), AOM_ICDF(28591), - AOM_ICDF(32768), }, - {AOM_ICDF(8965), AOM_ICDF(11076), AOM_ICDF(23514), AOM_ICDF(24031), - 
AOM_ICDF(32768), }, - {AOM_ICDF(6257), AOM_ICDF(7011), AOM_ICDF(17779), AOM_ICDF(18315), - AOM_ICDF(32768), }, - {AOM_ICDF(4329), AOM_ICDF(4704), AOM_ICDF(12448), AOM_ICDF(12839), - AOM_ICDF(32768), }, - {AOM_ICDF(2542), AOM_ICDF(2860), AOM_ICDF(7886), AOM_ICDF(8207), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19181), AOM_ICDF(22038), AOM_ICDF(30697), AOM_ICDF(31106), - AOM_ICDF(32768), }, - {AOM_ICDF(12174), AOM_ICDF(17208), AOM_ICDF(28897), AOM_ICDF(29328), - AOM_ICDF(32768), }, - {AOM_ICDF(8420), AOM_ICDF(10706), AOM_ICDF(23788), AOM_ICDF(24321), - AOM_ICDF(32768), }, - {AOM_ICDF(6153), AOM_ICDF(6850), AOM_ICDF(17983), AOM_ICDF(18530), - AOM_ICDF(32768), }, - {AOM_ICDF(4168), AOM_ICDF(4524), AOM_ICDF(12547), AOM_ICDF(12983), - AOM_ICDF(32768), }, - {AOM_ICDF(3136), AOM_ICDF(3480), AOM_ICDF(9221), AOM_ICDF(9659), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(18701), AOM_ICDF(23907), AOM_ICDF(31282), AOM_ICDF(31695), - AOM_ICDF(32768), }, - {AOM_ICDF(12655), AOM_ICDF(19258), AOM_ICDF(29824), AOM_ICDF(30279), - AOM_ICDF(32768), }, - {AOM_ICDF(8699), AOM_ICDF(11467), AOM_ICDF(24763), AOM_ICDF(25450), - AOM_ICDF(32768), }, - {AOM_ICDF(6268), AOM_ICDF(7027), AOM_ICDF(18397), AOM_ICDF(19102), - AOM_ICDF(32768), }, - {AOM_ICDF(5613), AOM_ICDF(6020), AOM_ICDF(14084), AOM_ICDF(14637), - AOM_ICDF(32768), }, - {AOM_ICDF(2443), AOM_ICDF(2919), AOM_ICDF(8222), AOM_ICDF(8639), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(6156), AOM_ICDF(23586), AOM_ICDF(30739), AOM_ICDF(31476), - AOM_ICDF(32768), }, - {AOM_ICDF(6056), AOM_ICDF(21852), AOM_ICDF(29323), AOM_ICDF(30442), - AOM_ICDF(32768), }, - {AOM_ICDF(6113), AOM_ICDF(14408), AOM_ICDF(24331), AOM_ICDF(26899), - AOM_ICDF(32768), }, - {AOM_ICDF(5825), AOM_ICDF(9328), AOM_ICDF(18946), AOM_ICDF(22143), - AOM_ICDF(32768), }, - {AOM_ICDF(5023), AOM_ICDF(6340), AOM_ICDF(14812), AOM_ICDF(17429), - AOM_ICDF(32768), }, - {AOM_ICDF(5140), AOM_ICDF(6104), AOM_ICDF(11565), AOM_ICDF(14135), - AOM_ICDF(32768), }, - }, 
- }, - { // Inter - { // Band 0 - {AOM_ICDF(12606), AOM_ICDF(20577), AOM_ICDF(21354), AOM_ICDF(29249), - AOM_ICDF(29714), AOM_ICDF(32768), }, - {AOM_ICDF(8630), AOM_ICDF(17728), AOM_ICDF(19353), AOM_ICDF(27722), - AOM_ICDF(28219), AOM_ICDF(32768), }, - {AOM_ICDF(3040), AOM_ICDF(12616), AOM_ICDF(14286), AOM_ICDF(23918), - AOM_ICDF(24539), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(20824), AOM_ICDF(21610), AOM_ICDF(31110), AOM_ICDF(31445), - AOM_ICDF(32768), }, - {AOM_ICDF(15597), AOM_ICDF(17692), AOM_ICDF(29670), AOM_ICDF(30015), - AOM_ICDF(32768), }, - {AOM_ICDF(8954), AOM_ICDF(10007), AOM_ICDF(23515), AOM_ICDF(23902), - AOM_ICDF(32768), }, - {AOM_ICDF(6693), AOM_ICDF(7282), AOM_ICDF(18144), AOM_ICDF(18537), - AOM_ICDF(32768), }, - {AOM_ICDF(4048), AOM_ICDF(4451), AOM_ICDF(12255), AOM_ICDF(12626), - AOM_ICDF(32768), }, - {AOM_ICDF(2619), AOM_ICDF(2960), AOM_ICDF(7084), AOM_ICDF(7429), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21628), AOM_ICDF(22786), AOM_ICDF(31520), AOM_ICDF(31865), - AOM_ICDF(32768), }, - {AOM_ICDF(15854), AOM_ICDF(17925), AOM_ICDF(29872), AOM_ICDF(30228), - AOM_ICDF(32768), }, - {AOM_ICDF(8120), AOM_ICDF(8815), AOM_ICDF(22575), AOM_ICDF(22964), - AOM_ICDF(32768), }, - {AOM_ICDF(5006), AOM_ICDF(5427), AOM_ICDF(15724), AOM_ICDF(16101), - AOM_ICDF(32768), }, - {AOM_ICDF(2967), AOM_ICDF(3311), AOM_ICDF(9553), AOM_ICDF(9913), - AOM_ICDF(32768), }, - {AOM_ICDF(2878), AOM_ICDF(3188), AOM_ICDF(5418), AOM_ICDF(5825), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(21594), AOM_ICDF(23721), AOM_ICDF(31496), AOM_ICDF(31872), - AOM_ICDF(32768), }, - {AOM_ICDF(15704), AOM_ICDF(18452), AOM_ICDF(30207), AOM_ICDF(30585), - AOM_ICDF(32768), }, - {AOM_ICDF(8637), AOM_ICDF(9546), AOM_ICDF(23803), AOM_ICDF(24254), - AOM_ICDF(32768), }, - {AOM_ICDF(5991), AOM_ICDF(6479), AOM_ICDF(17619), AOM_ICDF(18099), - AOM_ICDF(32768), }, - {AOM_ICDF(3856), AOM_ICDF(4220), AOM_ICDF(11623), AOM_ICDF(12111), - AOM_ICDF(32768), }, - {AOM_ICDF(3501), 
AOM_ICDF(3825), AOM_ICDF(6760), AOM_ICDF(7246), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(19929), AOM_ICDF(23849), AOM_ICDF(31581), AOM_ICDF(31956), - AOM_ICDF(32768), }, - {AOM_ICDF(14239), AOM_ICDF(19461), AOM_ICDF(30323), AOM_ICDF(30761), - AOM_ICDF(32768), }, - {AOM_ICDF(8094), AOM_ICDF(9844), AOM_ICDF(23595), AOM_ICDF(24338), - AOM_ICDF(32768), }, - {AOM_ICDF(5204), AOM_ICDF(5848), AOM_ICDF(16396), AOM_ICDF(17121), - AOM_ICDF(32768), }, - {AOM_ICDF(3568), AOM_ICDF(3961), AOM_ICDF(10658), AOM_ICDF(11301), - AOM_ICDF(32768), }, - {AOM_ICDF(1594), AOM_ICDF(1913), AOM_ICDF(5552), AOM_ICDF(6040), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(13512), AOM_ICDF(24112), AOM_ICDF(31648), AOM_ICDF(32057), - AOM_ICDF(32768), }, - {AOM_ICDF(10595), AOM_ICDF(22378), AOM_ICDF(30592), AOM_ICDF(31236), - AOM_ICDF(32768), }, - {AOM_ICDF(7571), AOM_ICDF(13305), AOM_ICDF(24936), AOM_ICDF(26656), - AOM_ICDF(32768), }, - {AOM_ICDF(6163), AOM_ICDF(8207), AOM_ICDF(18688), AOM_ICDF(20500), - AOM_ICDF(32768), }, - {AOM_ICDF(3185), AOM_ICDF(4449), AOM_ICDF(13298), AOM_ICDF(14707), - AOM_ICDF(32768), }, - {AOM_ICDF(1890), AOM_ICDF(2731), AOM_ICDF(7562), AOM_ICDF(8192), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(26689), AOM_ICDF(27259), AOM_ICDF(30590), AOM_ICDF(31538), - AOM_ICDF(31930), AOM_ICDF(32768), }, - {AOM_ICDF(17843), AOM_ICDF(19709), AOM_ICDF(27299), AOM_ICDF(29813), - AOM_ICDF(30435), AOM_ICDF(32768), }, - {AOM_ICDF(9138), AOM_ICDF(13232), AOM_ICDF(20487), AOM_ICDF(25798), - AOM_ICDF(26794), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(13264), AOM_ICDF(22970), AOM_ICDF(30914), AOM_ICDF(31354), - AOM_ICDF(32768), }, - {AOM_ICDF(11647), AOM_ICDF(20651), AOM_ICDF(30191), AOM_ICDF(30692), - AOM_ICDF(32768), }, - {AOM_ICDF(10449), AOM_ICDF(15871), AOM_ICDF(27240), AOM_ICDF(27909), - AOM_ICDF(32768), }, - {AOM_ICDF(7759), AOM_ICDF(9400), AOM_ICDF(22161), AOM_ICDF(22812), - AOM_ICDF(32768), }, - {AOM_ICDF(4095), 
AOM_ICDF(4544), AOM_ICDF(13856), AOM_ICDF(14309), - AOM_ICDF(32768), }, - {AOM_ICDF(3199), AOM_ICDF(3509), AOM_ICDF(8639), AOM_ICDF(8964), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18180), AOM_ICDF(25717), AOM_ICDF(31446), AOM_ICDF(31899), - AOM_ICDF(32768), }, - {AOM_ICDF(14593), AOM_ICDF(22211), AOM_ICDF(30845), AOM_ICDF(31282), - AOM_ICDF(32768), }, - {AOM_ICDF(10443), AOM_ICDF(13816), AOM_ICDF(27239), AOM_ICDF(27789), - AOM_ICDF(32768), }, - {AOM_ICDF(6760), AOM_ICDF(7698), AOM_ICDF(19648), AOM_ICDF(20234), - AOM_ICDF(32768), }, - {AOM_ICDF(3896), AOM_ICDF(4253), AOM_ICDF(12678), AOM_ICDF(13056), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(6722), AOM_ICDF(13443), AOM_ICDF(14704), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22145), AOM_ICDF(27566), AOM_ICDF(31813), AOM_ICDF(32212), - AOM_ICDF(32768), }, - {AOM_ICDF(15241), AOM_ICDF(23215), AOM_ICDF(31215), AOM_ICDF(31658), - AOM_ICDF(32768), }, - {AOM_ICDF(11148), AOM_ICDF(15527), AOM_ICDF(28336), AOM_ICDF(28891), - AOM_ICDF(32768), }, - {AOM_ICDF(8864), AOM_ICDF(10402), AOM_ICDF(24069), AOM_ICDF(24811), - AOM_ICDF(32768), }, - {AOM_ICDF(6919), AOM_ICDF(7527), AOM_ICDF(19607), AOM_ICDF(20260), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(18971), AOM_ICDF(25869), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(18795), AOM_ICDF(27901), AOM_ICDF(31907), AOM_ICDF(32272), - AOM_ICDF(32768), }, - {AOM_ICDF(13177), AOM_ICDF(24166), AOM_ICDF(31395), AOM_ICDF(31820), - AOM_ICDF(32768), }, - {AOM_ICDF(9217), AOM_ICDF(15410), AOM_ICDF(28101), AOM_ICDF(28868), - AOM_ICDF(32768), }, - {AOM_ICDF(6328), AOM_ICDF(8749), AOM_ICDF(21695), AOM_ICDF(22954), - AOM_ICDF(32768), }, - {AOM_ICDF(15672), AOM_ICDF(17809), AOM_ICDF(22795), AOM_ICDF(24932), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(9431), AOM_ICDF(28094), AOM_ICDF(31965), AOM_ICDF(32338), - AOM_ICDF(32768), }, - 
{AOM_ICDF(8107), AOM_ICDF(26038), AOM_ICDF(31393), AOM_ICDF(32024), - AOM_ICDF(32768), }, - {AOM_ICDF(9347), AOM_ICDF(19880), AOM_ICDF(28342), AOM_ICDF(29759), - AOM_ICDF(32768), }, - {AOM_ICDF(7092), AOM_ICDF(13694), AOM_ICDF(25432), AOM_ICDF(28366), - AOM_ICDF(32768), }, - {AOM_ICDF(7802), AOM_ICDF(12483), AOM_ICDF(21845), AOM_ICDF(26526), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(29212), AOM_ICDF(29998), AOM_ICDF(31256), AOM_ICDF(32035), - AOM_ICDF(32360), AOM_ICDF(32768), }, - {AOM_ICDF(19150), AOM_ICDF(23189), AOM_ICDF(28117), AOM_ICDF(31168), - AOM_ICDF(31611), AOM_ICDF(32768), }, - {AOM_ICDF(9324), AOM_ICDF(18178), AOM_ICDF(23556), AOM_ICDF(29422), - AOM_ICDF(30204), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(20406), AOM_ICDF(26462), AOM_ICDF(31971), AOM_ICDF(32298), - AOM_ICDF(32768), }, - {AOM_ICDF(15834), AOM_ICDF(22647), AOM_ICDF(31547), AOM_ICDF(31902), - AOM_ICDF(32768), }, - {AOM_ICDF(11047), AOM_ICDF(15431), AOM_ICDF(27825), AOM_ICDF(28393), - AOM_ICDF(32768), }, - {AOM_ICDF(8665), AOM_ICDF(11083), AOM_ICDF(22493), AOM_ICDF(23423), - AOM_ICDF(32768), }, - {AOM_ICDF(6191), AOM_ICDF(7733), AOM_ICDF(16624), AOM_ICDF(17708), - AOM_ICDF(32768), }, - {AOM_ICDF(3210), AOM_ICDF(3875), AOM_ICDF(10937), AOM_ICDF(11867), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21520), AOM_ICDF(27152), AOM_ICDF(31994), AOM_ICDF(32324), - AOM_ICDF(32768), }, - {AOM_ICDF(17519), AOM_ICDF(23609), AOM_ICDF(31670), AOM_ICDF(32022), - AOM_ICDF(32768), }, - {AOM_ICDF(10647), AOM_ICDF(14610), AOM_ICDF(28389), AOM_ICDF(28873), - AOM_ICDF(32768), }, - {AOM_ICDF(7660), AOM_ICDF(10704), AOM_ICDF(22849), AOM_ICDF(23680), - AOM_ICDF(32768), }, - {AOM_ICDF(5535), AOM_ICDF(6454), AOM_ICDF(17275), AOM_ICDF(17753), - AOM_ICDF(32768), }, - {AOM_ICDF(4096), AOM_ICDF(6144), AOM_ICDF(13653), AOM_ICDF(15701), - AOM_ICDF(32768), }, - }, - { // Band 3 - 
{AOM_ICDF(22487), AOM_ICDF(27996), AOM_ICDF(32020), AOM_ICDF(32381), - AOM_ICDF(32768), }, - {AOM_ICDF(17371), AOM_ICDF(24453), AOM_ICDF(31777), AOM_ICDF(32152), - AOM_ICDF(32768), }, - {AOM_ICDF(11366), AOM_ICDF(16072), AOM_ICDF(29193), AOM_ICDF(29761), - AOM_ICDF(32768), }, - {AOM_ICDF(12545), AOM_ICDF(13869), AOM_ICDF(24642), AOM_ICDF(25603), - AOM_ICDF(32768), }, - {AOM_ICDF(4119), AOM_ICDF(5056), AOM_ICDF(16103), AOM_ICDF(17601), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(19350), AOM_ICDF(28517), AOM_ICDF(32050), AOM_ICDF(32401), - AOM_ICDF(32768), }, - {AOM_ICDF(14752), AOM_ICDF(25831), AOM_ICDF(31897), AOM_ICDF(32261), - AOM_ICDF(32768), }, - {AOM_ICDF(11157), AOM_ICDF(20816), AOM_ICDF(29821), AOM_ICDF(30635), - AOM_ICDF(32768), }, - {AOM_ICDF(8157), AOM_ICDF(9691), AOM_ICDF(22868), AOM_ICDF(23705), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(10650), AOM_ICDF(17203), AOM_ICDF(19661), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(15557), AOM_ICDF(29043), AOM_ICDF(32047), AOM_ICDF(32424), - AOM_ICDF(32768), }, - {AOM_ICDF(10253), AOM_ICDF(27948), AOM_ICDF(31922), AOM_ICDF(32329), - AOM_ICDF(32768), }, - {AOM_ICDF(7797), AOM_ICDF(18860), AOM_ICDF(28870), AOM_ICDF(30661), - AOM_ICDF(32768), }, - {AOM_ICDF(5617), AOM_ICDF(11235), AOM_ICDF(27151), AOM_ICDF(29959), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 8X8 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(20585), AOM_ICDF(21554), AOM_ICDF(27179), AOM_ICDF(28995), - AOM_ICDF(30170), AOM_ICDF(32768), }, - {AOM_ICDF(6316), AOM_ICDF(8987), AOM_ICDF(15571), AOM_ICDF(19766), - AOM_ICDF(21417), AOM_ICDF(32768), }, - 
{AOM_ICDF(1426), AOM_ICDF(4693), AOM_ICDF(6721), AOM_ICDF(11940), - AOM_ICDF(12874), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(10177), AOM_ICDF(14297), AOM_ICDF(24926), AOM_ICDF(25396), - AOM_ICDF(32768), }, - {AOM_ICDF(8812), AOM_ICDF(13381), AOM_ICDF(24128), AOM_ICDF(24649), - AOM_ICDF(32768), }, - {AOM_ICDF(8090), AOM_ICDF(11314), AOM_ICDF(21329), AOM_ICDF(21906), - AOM_ICDF(32768), }, - {AOM_ICDF(6324), AOM_ICDF(7511), AOM_ICDF(17212), AOM_ICDF(17717), - AOM_ICDF(32768), }, - {AOM_ICDF(4272), AOM_ICDF(4718), AOM_ICDF(12016), AOM_ICDF(12415), - AOM_ICDF(32768), }, - {AOM_ICDF(2129), AOM_ICDF(2445), AOM_ICDF(6433), AOM_ICDF(6755), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(15709), AOM_ICDF(18339), AOM_ICDF(28174), AOM_ICDF(28566), - AOM_ICDF(32768), }, - {AOM_ICDF(12592), AOM_ICDF(15866), AOM_ICDF(27071), AOM_ICDF(27475), - AOM_ICDF(32768), }, - {AOM_ICDF(9361), AOM_ICDF(10768), AOM_ICDF(22752), AOM_ICDF(23166), - AOM_ICDF(32768), }, - {AOM_ICDF(6525), AOM_ICDF(7048), AOM_ICDF(17478), AOM_ICDF(17863), - AOM_ICDF(32768), }, - {AOM_ICDF(4314), AOM_ICDF(4656), AOM_ICDF(12242), AOM_ICDF(12579), - AOM_ICDF(32768), }, - {AOM_ICDF(2419), AOM_ICDF(2735), AOM_ICDF(7387), AOM_ICDF(7707), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(20453), AOM_ICDF(22253), AOM_ICDF(29963), AOM_ICDF(30329), - AOM_ICDF(32768), }, - {AOM_ICDF(14090), AOM_ICDF(16483), AOM_ICDF(27992), AOM_ICDF(28355), - AOM_ICDF(32768), }, - {AOM_ICDF(8737), AOM_ICDF(9396), AOM_ICDF(22134), AOM_ICDF(22499), - AOM_ICDF(32768), }, - {AOM_ICDF(5543), AOM_ICDF(5904), AOM_ICDF(15783), AOM_ICDF(16122), - AOM_ICDF(32768), }, - {AOM_ICDF(3358), AOM_ICDF(3677), AOM_ICDF(10362), AOM_ICDF(10680), - AOM_ICDF(32768), }, - {AOM_ICDF(1875), AOM_ICDF(2187), AOM_ICDF(5982), AOM_ICDF(6294), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(23693), AOM_ICDF(25306), AOM_ICDF(31174), AOM_ICDF(31516), - AOM_ICDF(32768), }, - {AOM_ICDF(14804), AOM_ICDF(16843), AOM_ICDF(28713), AOM_ICDF(29058), - 
AOM_ICDF(32768), }, - {AOM_ICDF(8442), AOM_ICDF(8976), AOM_ICDF(22003), AOM_ICDF(22353), - AOM_ICDF(32768), }, - {AOM_ICDF(5397), AOM_ICDF(5741), AOM_ICDF(15529), AOM_ICDF(15867), - AOM_ICDF(32768), }, - {AOM_ICDF(3322), AOM_ICDF(3639), AOM_ICDF(10248), AOM_ICDF(10570), - AOM_ICDF(32768), }, - {AOM_ICDF(1852), AOM_ICDF(2161), AOM_ICDF(5980), AOM_ICDF(6290), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(24219), AOM_ICDF(26214), AOM_ICDF(31501), AOM_ICDF(31844), - AOM_ICDF(32768), }, - {AOM_ICDF(15202), AOM_ICDF(17709), AOM_ICDF(29450), AOM_ICDF(29807), - AOM_ICDF(32768), }, - {AOM_ICDF(9044), AOM_ICDF(9603), AOM_ICDF(23134), AOM_ICDF(23506), - AOM_ICDF(32768), }, - {AOM_ICDF(5849), AOM_ICDF(6187), AOM_ICDF(16695), AOM_ICDF(17032), - AOM_ICDF(32768), }, - {AOM_ICDF(3734), AOM_ICDF(4050), AOM_ICDF(11408), AOM_ICDF(11727), - AOM_ICDF(32768), }, - {AOM_ICDF(1898), AOM_ICDF(2201), AOM_ICDF(6126), AOM_ICDF(6430), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(10195), AOM_ICDF(21186), AOM_ICDF(23530), AOM_ICDF(29551), - AOM_ICDF(30281), AOM_ICDF(32768), }, - {AOM_ICDF(3950), AOM_ICDF(15607), AOM_ICDF(18726), AOM_ICDF(26764), - AOM_ICDF(27758), AOM_ICDF(32768), }, - {AOM_ICDF(942), AOM_ICDF(11209), AOM_ICDF(12954), AOM_ICDF(22126), - AOM_ICDF(23296), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(24110), AOM_ICDF(24717), AOM_ICDF(31199), AOM_ICDF(31532), - AOM_ICDF(32768), }, - {AOM_ICDF(16869), AOM_ICDF(18762), AOM_ICDF(29600), AOM_ICDF(29951), - AOM_ICDF(32768), }, - {AOM_ICDF(10702), AOM_ICDF(12122), AOM_ICDF(25122), AOM_ICDF(25503), - AOM_ICDF(32768), }, - {AOM_ICDF(8221), AOM_ICDF(9053), AOM_ICDF(20816), AOM_ICDF(21206), - AOM_ICDF(32768), }, - {AOM_ICDF(5635), AOM_ICDF(6244), AOM_ICDF(15801), AOM_ICDF(16186), - AOM_ICDF(32768), }, - {AOM_ICDF(3776), AOM_ICDF(4210), AOM_ICDF(10380), AOM_ICDF(10766), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24719), AOM_ICDF(25439), AOM_ICDF(31522), AOM_ICDF(31849), - AOM_ICDF(32768), }, - 
{AOM_ICDF(16693), AOM_ICDF(18162), AOM_ICDF(29698), AOM_ICDF(30036), - AOM_ICDF(32768), }, - {AOM_ICDF(9340), AOM_ICDF(10024), AOM_ICDF(23513), AOM_ICDF(23867), - AOM_ICDF(32768), }, - {AOM_ICDF(6269), AOM_ICDF(6709), AOM_ICDF(17711), AOM_ICDF(18060), - AOM_ICDF(32768), }, - {AOM_ICDF(3841), AOM_ICDF(4185), AOM_ICDF(11892), AOM_ICDF(12230), - AOM_ICDF(32768), }, - {AOM_ICDF(1944), AOM_ICDF(2259), AOM_ICDF(6437), AOM_ICDF(6776), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25795), AOM_ICDF(26524), AOM_ICDF(31784), AOM_ICDF(32108), - AOM_ICDF(32768), }, - {AOM_ICDF(17514), AOM_ICDF(18812), AOM_ICDF(30221), AOM_ICDF(30557), - AOM_ICDF(32768), }, - {AOM_ICDF(9099), AOM_ICDF(9576), AOM_ICDF(23502), AOM_ICDF(23843), - AOM_ICDF(32768), }, - {AOM_ICDF(5738), AOM_ICDF(6097), AOM_ICDF(16847), AOM_ICDF(17182), - AOM_ICDF(32768), }, - {AOM_ICDF(3411), AOM_ICDF(3730), AOM_ICDF(10729), AOM_ICDF(11057), - AOM_ICDF(32768), }, - {AOM_ICDF(1282), AOM_ICDF(1591), AOM_ICDF(4705), AOM_ICDF(5013), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26360), AOM_ICDF(27205), AOM_ICDF(31918), AOM_ICDF(32240), - AOM_ICDF(32768), }, - {AOM_ICDF(18465), AOM_ICDF(19729), AOM_ICDF(30758), AOM_ICDF(31089), - AOM_ICDF(32768), }, - {AOM_ICDF(9488), AOM_ICDF(9915), AOM_ICDF(24339), AOM_ICDF(24678), - AOM_ICDF(32768), }, - {AOM_ICDF(5812), AOM_ICDF(6156), AOM_ICDF(17325), AOM_ICDF(17661), - AOM_ICDF(32768), }, - {AOM_ICDF(3739), AOM_ICDF(4065), AOM_ICDF(10932), AOM_ICDF(11265), - AOM_ICDF(32768), }, - {AOM_ICDF(1391), AOM_ICDF(1700), AOM_ICDF(4764), AOM_ICDF(5073), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27036), AOM_ICDF(28212), AOM_ICDF(31970), AOM_ICDF(32305), - AOM_ICDF(32768), }, - {AOM_ICDF(18634), AOM_ICDF(21073), AOM_ICDF(31116), AOM_ICDF(31477), - AOM_ICDF(32768), }, - {AOM_ICDF(9822), AOM_ICDF(10441), AOM_ICDF(24990), AOM_ICDF(25437), - AOM_ICDF(32768), }, - {AOM_ICDF(6130), AOM_ICDF(6530), AOM_ICDF(17790), AOM_ICDF(18269), - AOM_ICDF(32768), }, - {AOM_ICDF(3725), 
AOM_ICDF(4044), AOM_ICDF(11127), AOM_ICDF(11602), - AOM_ICDF(32768), }, - {AOM_ICDF(1298), AOM_ICDF(1573), AOM_ICDF(4642), AOM_ICDF(5075), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(23042), AOM_ICDF(23702), AOM_ICDF(30487), AOM_ICDF(31370), - AOM_ICDF(31898), AOM_ICDF(32768), }, - {AOM_ICDF(15512), AOM_ICDF(17357), AOM_ICDF(27018), AOM_ICDF(29404), - AOM_ICDF(30377), AOM_ICDF(32768), }, - {AOM_ICDF(8935), AOM_ICDF(12713), AOM_ICDF(20545), AOM_ICDF(25580), - AOM_ICDF(26931), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(15021), AOM_ICDF(24086), AOM_ICDF(30796), AOM_ICDF(31272), - AOM_ICDF(32768), }, - {AOM_ICDF(13040), AOM_ICDF(21866), AOM_ICDF(30054), AOM_ICDF(30686), - AOM_ICDF(32768), }, - {AOM_ICDF(10915), AOM_ICDF(16852), AOM_ICDF(27467), AOM_ICDF(28235), - AOM_ICDF(32768), }, - {AOM_ICDF(8096), AOM_ICDF(10403), AOM_ICDF(22531), AOM_ICDF(23355), - AOM_ICDF(32768), }, - {AOM_ICDF(4485), AOM_ICDF(5020), AOM_ICDF(13360), AOM_ICDF(13816), - AOM_ICDF(32768), }, - {AOM_ICDF(1728), AOM_ICDF(2067), AOM_ICDF(5998), AOM_ICDF(6337), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20845), AOM_ICDF(25929), AOM_ICDF(31278), AOM_ICDF(31670), - AOM_ICDF(32768), }, - {AOM_ICDF(15553), AOM_ICDF(21602), AOM_ICDF(30338), AOM_ICDF(30745), - AOM_ICDF(32768), }, - {AOM_ICDF(10953), AOM_ICDF(13829), AOM_ICDF(26398), AOM_ICDF(26854), - AOM_ICDF(32768), }, - {AOM_ICDF(7900), AOM_ICDF(8858), AOM_ICDF(20869), AOM_ICDF(21378), - AOM_ICDF(32768), }, - {AOM_ICDF(5225), AOM_ICDF(5579), AOM_ICDF(13764), AOM_ICDF(14087), - AOM_ICDF(32768), }, - {AOM_ICDF(1881), AOM_ICDF(2352), AOM_ICDF(6742), AOM_ICDF(7212), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25402), AOM_ICDF(28169), AOM_ICDF(31825), AOM_ICDF(32169), - AOM_ICDF(32768), }, - {AOM_ICDF(17086), AOM_ICDF(21375), AOM_ICDF(30582), AOM_ICDF(30951), - AOM_ICDF(32768), }, - {AOM_ICDF(11057), AOM_ICDF(12358), AOM_ICDF(25930), AOM_ICDF(26346), - AOM_ICDF(32768), }, - 
{AOM_ICDF(6989), AOM_ICDF(7448), AOM_ICDF(18814), AOM_ICDF(19143), - AOM_ICDF(32768), }, - {AOM_ICDF(4476), AOM_ICDF(4752), AOM_ICDF(16025), AOM_ICDF(16301), - AOM_ICDF(32768), }, - {AOM_ICDF(2185), AOM_ICDF(4369), AOM_ICDF(12379), AOM_ICDF(14564), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26444), AOM_ICDF(28656), AOM_ICDF(31864), AOM_ICDF(32231), - AOM_ICDF(32768), }, - {AOM_ICDF(17642), AOM_ICDF(20848), AOM_ICDF(30615), AOM_ICDF(30967), - AOM_ICDF(32768), }, - {AOM_ICDF(10973), AOM_ICDF(11732), AOM_ICDF(25256), AOM_ICDF(25612), - AOM_ICDF(32768), }, - {AOM_ICDF(8325), AOM_ICDF(8726), AOM_ICDF(19826), AOM_ICDF(20146), - AOM_ICDF(32768), }, - {AOM_ICDF(5294), AOM_ICDF(5568), AOM_ICDF(14056), AOM_ICDF(14330), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(18204), AOM_ICDF(23666), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27760), AOM_ICDF(29748), AOM_ICDF(31934), AOM_ICDF(32299), - AOM_ICDF(32768), }, - {AOM_ICDF(17133), AOM_ICDF(21599), AOM_ICDF(30800), AOM_ICDF(31243), - AOM_ICDF(32768), }, - {AOM_ICDF(12224), AOM_ICDF(13907), AOM_ICDF(26992), AOM_ICDF(27546), - AOM_ICDF(32768), }, - {AOM_ICDF(9221), AOM_ICDF(9617), AOM_ICDF(21845), AOM_ICDF(22162), - AOM_ICDF(32768), }, - {AOM_ICDF(5401), AOM_ICDF(6482), AOM_ICDF(18004), AOM_ICDF(19085), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(29286), AOM_ICDF(29932), AOM_ICDF(31576), AOM_ICDF(32075), - AOM_ICDF(32408), AOM_ICDF(32768), }, - {AOM_ICDF(17969), AOM_ICDF(21693), AOM_ICDF(28937), AOM_ICDF(30945), - AOM_ICDF(31682), AOM_ICDF(32768), }, - {AOM_ICDF(6607), AOM_ICDF(16160), AOM_ICDF(23280), AOM_ICDF(27595), - AOM_ICDF(30027), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(24724), AOM_ICDF(28333), AOM_ICDF(32022), AOM_ICDF(32346), - AOM_ICDF(32768), }, - {AOM_ICDF(18803), AOM_ICDF(24728), AOM_ICDF(31661), AOM_ICDF(32022), - AOM_ICDF(32768), }, - 
{AOM_ICDF(14179), AOM_ICDF(20757), AOM_ICDF(30098), AOM_ICDF(30633), - AOM_ICDF(32768), }, - {AOM_ICDF(12564), AOM_ICDF(17179), AOM_ICDF(27133), AOM_ICDF(28080), - AOM_ICDF(32768), }, - {AOM_ICDF(10543), AOM_ICDF(13479), AOM_ICDF(23725), AOM_ICDF(25031), - AOM_ICDF(32768), }, - {AOM_ICDF(11377), AOM_ICDF(12741), AOM_ICDF(21923), AOM_ICDF(22888), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(26071), AOM_ICDF(28609), AOM_ICDF(32053), AOM_ICDF(32374), - AOM_ICDF(32768), }, - {AOM_ICDF(20389), AOM_ICDF(24820), AOM_ICDF(31690), AOM_ICDF(32027), - AOM_ICDF(32768), }, - {AOM_ICDF(12977), AOM_ICDF(16892), AOM_ICDF(29053), AOM_ICDF(29445), - AOM_ICDF(32768), }, - {AOM_ICDF(8745), AOM_ICDF(12303), AOM_ICDF(24164), AOM_ICDF(25209), - AOM_ICDF(32768), }, - {AOM_ICDF(4042), AOM_ICDF(5052), AOM_ICDF(18333), AOM_ICDF(18910), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(9557), AOM_ICDF(13653), AOM_ICDF(17749), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(27936), AOM_ICDF(29582), AOM_ICDF(32107), AOM_ICDF(32422), - AOM_ICDF(32768), }, - {AOM_ICDF(22472), AOM_ICDF(25761), AOM_ICDF(31858), AOM_ICDF(32177), - AOM_ICDF(32768), }, - {AOM_ICDF(14107), AOM_ICDF(16587), AOM_ICDF(29250), AOM_ICDF(29692), - AOM_ICDF(32768), }, - {AOM_ICDF(10726), AOM_ICDF(11739), AOM_ICDF(23985), AOM_ICDF(24576), - AOM_ICDF(32768), }, - {AOM_ICDF(5825), AOM_ICDF(8010), AOM_ICDF(18204), AOM_ICDF(20389), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27066), AOM_ICDF(29025), AOM_ICDF(31972), AOM_ICDF(32338), - AOM_ICDF(32768), }, - {AOM_ICDF(20639), AOM_ICDF(23330), AOM_ICDF(31616), AOM_ICDF(31985), - AOM_ICDF(32768), }, - {AOM_ICDF(13468), AOM_ICDF(15091), AOM_ICDF(29902), AOM_ICDF(30243), - AOM_ICDF(32768), }, - {AOM_ICDF(14473), AOM_ICDF(15019), AOM_ICDF(24030), AOM_ICDF(24439), - AOM_ICDF(32768), }, - {AOM_ICDF(7864), AOM_ICDF(11796), AOM_ICDF(19661), AOM_ICDF(23593), - AOM_ICDF(32768), }, 
- {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(28741), AOM_ICDF(30503), AOM_ICDF(32039), AOM_ICDF(32388), - AOM_ICDF(32768), }, - {AOM_ICDF(19712), AOM_ICDF(25328), AOM_ICDF(31621), AOM_ICDF(32049), - AOM_ICDF(32768), }, - {AOM_ICDF(13461), AOM_ICDF(17167), AOM_ICDF(29712), AOM_ICDF(30308), - AOM_ICDF(32768), }, - {AOM_ICDF(10285), AOM_ICDF(11242), AOM_ICDF(27267), AOM_ICDF(28224), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 16X16 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(4353), AOM_ICDF(7056), AOM_ICDF(15884), AOM_ICDF(20594), - AOM_ICDF(24026), AOM_ICDF(32768), }, - {AOM_ICDF(2397), AOM_ICDF(5417), AOM_ICDF(9610), AOM_ICDF(14451), - AOM_ICDF(16689), AOM_ICDF(32768), }, - {AOM_ICDF(841), AOM_ICDF(3543), AOM_ICDF(4598), AOM_ICDF(9149), - AOM_ICDF(9950), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(8763), AOM_ICDF(11845), AOM_ICDF(22684), AOM_ICDF(23211), - AOM_ICDF(32768), }, - {AOM_ICDF(8074), AOM_ICDF(12129), AOM_ICDF(22232), AOM_ICDF(22924), - AOM_ICDF(32768), }, - {AOM_ICDF(7453), AOM_ICDF(10017), AOM_ICDF(19822), AOM_ICDF(20662), - AOM_ICDF(32768), }, - {AOM_ICDF(5825), AOM_ICDF(6998), AOM_ICDF(16346), AOM_ICDF(16952), - AOM_ICDF(32768), }, - {AOM_ICDF(4059), AOM_ICDF(4481), AOM_ICDF(11444), AOM_ICDF(11852), - AOM_ICDF(32768), }, - {AOM_ICDF(1973), AOM_ICDF(2289), AOM_ICDF(5827), AOM_ICDF(6149), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(15272), AOM_ICDF(17017), AOM_ICDF(26959), AOM_ICDF(27346), - AOM_ICDF(32768), }, - {AOM_ICDF(12476), AOM_ICDF(14916), AOM_ICDF(26163), AOM_ICDF(26575), - AOM_ICDF(32768), }, - {AOM_ICDF(9485), AOM_ICDF(10720), AOM_ICDF(22557), AOM_ICDF(22973), - AOM_ICDF(32768), }, - {AOM_ICDF(6821), AOM_ICDF(7342), AOM_ICDF(17484), AOM_ICDF(17858), - 
AOM_ICDF(32768), }, - {AOM_ICDF(4370), AOM_ICDF(4714), AOM_ICDF(12030), AOM_ICDF(12366), - AOM_ICDF(32768), }, - {AOM_ICDF(2375), AOM_ICDF(2688), AOM_ICDF(6850), AOM_ICDF(7162), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19929), AOM_ICDF(21244), AOM_ICDF(29489), AOM_ICDF(29829), - AOM_ICDF(32768), }, - {AOM_ICDF(14005), AOM_ICDF(16066), AOM_ICDF(27595), AOM_ICDF(27947), - AOM_ICDF(32768), }, - {AOM_ICDF(8918), AOM_ICDF(9550), AOM_ICDF(22126), AOM_ICDF(22488), - AOM_ICDF(32768), }, - {AOM_ICDF(5741), AOM_ICDF(6095), AOM_ICDF(16004), AOM_ICDF(16340), - AOM_ICDF(32768), }, - {AOM_ICDF(3558), AOM_ICDF(3873), AOM_ICDF(10340), AOM_ICDF(10657), - AOM_ICDF(32768), }, - {AOM_ICDF(1822), AOM_ICDF(2134), AOM_ICDF(5530), AOM_ICDF(5843), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(23568), AOM_ICDF(24663), AOM_ICDF(30915), AOM_ICDF(31245), - AOM_ICDF(32768), }, - {AOM_ICDF(15139), AOM_ICDF(16577), AOM_ICDF(28661), AOM_ICDF(28997), - AOM_ICDF(32768), }, - {AOM_ICDF(8850), AOM_ICDF(9259), AOM_ICDF(22366), AOM_ICDF(22700), - AOM_ICDF(32768), }, - {AOM_ICDF(5454), AOM_ICDF(5781), AOM_ICDF(15617), AOM_ICDF(15937), - AOM_ICDF(32768), }, - {AOM_ICDF(3315), AOM_ICDF(3629), AOM_ICDF(10044), AOM_ICDF(10359), - AOM_ICDF(32768), }, - {AOM_ICDF(1736), AOM_ICDF(2047), AOM_ICDF(5698), AOM_ICDF(6009), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27011), AOM_ICDF(27875), AOM_ICDF(31721), AOM_ICDF(32046), - AOM_ICDF(32768), }, - {AOM_ICDF(16855), AOM_ICDF(18018), AOM_ICDF(29676), AOM_ICDF(30005), - AOM_ICDF(32768), }, - {AOM_ICDF(8916), AOM_ICDF(9282), AOM_ICDF(22431), AOM_ICDF(22760), - AOM_ICDF(32768), }, - {AOM_ICDF(5391), AOM_ICDF(5710), AOM_ICDF(15343), AOM_ICDF(15662), - AOM_ICDF(32768), }, - {AOM_ICDF(3316), AOM_ICDF(3629), AOM_ICDF(10223), AOM_ICDF(10537), - AOM_ICDF(32768), }, - {AOM_ICDF(1891), AOM_ICDF(2202), AOM_ICDF(6076), AOM_ICDF(6387), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(5744), AOM_ICDF(15508), AOM_ICDF(23294), 
AOM_ICDF(28653), - AOM_ICDF(30781), AOM_ICDF(32768), }, - {AOM_ICDF(2130), AOM_ICDF(11786), AOM_ICDF(17337), AOM_ICDF(24444), - AOM_ICDF(27499), AOM_ICDF(32768), }, - {AOM_ICDF(615), AOM_ICDF(8230), AOM_ICDF(10191), AOM_ICDF(18291), - AOM_ICDF(21029), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(25149), AOM_ICDF(25880), AOM_ICDF(31110), AOM_ICDF(31453), - AOM_ICDF(32768), }, - {AOM_ICDF(17454), AOM_ICDF(20460), AOM_ICDF(29560), AOM_ICDF(29929), - AOM_ICDF(32768), }, - {AOM_ICDF(11724), AOM_ICDF(14294), AOM_ICDF(25947), AOM_ICDF(26377), - AOM_ICDF(32768), }, - {AOM_ICDF(9198), AOM_ICDF(10981), AOM_ICDF(22357), AOM_ICDF(22857), - AOM_ICDF(32768), }, - {AOM_ICDF(7164), AOM_ICDF(8069), AOM_ICDF(18345), AOM_ICDF(18857), - AOM_ICDF(32768), }, - {AOM_ICDF(5833), AOM_ICDF(6316), AOM_ICDF(14661), AOM_ICDF(15073), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(26117), AOM_ICDF(26928), AOM_ICDF(31526), AOM_ICDF(31850), - AOM_ICDF(32768), }, - {AOM_ICDF(16540), AOM_ICDF(18394), AOM_ICDF(29402), AOM_ICDF(29740), - AOM_ICDF(32768), }, - {AOM_ICDF(9908), AOM_ICDF(10886), AOM_ICDF(23865), AOM_ICDF(24223), - AOM_ICDF(32768), }, - {AOM_ICDF(6805), AOM_ICDF(7383), AOM_ICDF(18402), AOM_ICDF(18777), - AOM_ICDF(32768), }, - {AOM_ICDF(4259), AOM_ICDF(4638), AOM_ICDF(12791), AOM_ICDF(13136), - AOM_ICDF(32768), }, - {AOM_ICDF(2274), AOM_ICDF(2584), AOM_ICDF(7391), AOM_ICDF(7713), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(27129), AOM_ICDF(27797), AOM_ICDF(31745), AOM_ICDF(32063), - AOM_ICDF(32768), }, - {AOM_ICDF(17255), AOM_ICDF(18663), AOM_ICDF(29815), AOM_ICDF(30145), - AOM_ICDF(32768), }, - {AOM_ICDF(9538), AOM_ICDF(10091), AOM_ICDF(23590), AOM_ICDF(23931), - AOM_ICDF(32768), }, - {AOM_ICDF(6366), AOM_ICDF(6732), AOM_ICDF(17467), AOM_ICDF(17800), - AOM_ICDF(32768), }, - {AOM_ICDF(3701), AOM_ICDF(4018), AOM_ICDF(11326), AOM_ICDF(11652), - AOM_ICDF(32768), }, - {AOM_ICDF(1976), AOM_ICDF(2284), AOM_ICDF(6325), AOM_ICDF(6633), - AOM_ICDF(32768), }, - }, - { // Band 4 
- {AOM_ICDF(27944), AOM_ICDF(28479), AOM_ICDF(31894), AOM_ICDF(32211), - AOM_ICDF(32768), }, - {AOM_ICDF(18032), AOM_ICDF(18997), AOM_ICDF(30130), AOM_ICDF(30452), - AOM_ICDF(32768), }, - {AOM_ICDF(9467), AOM_ICDF(9842), AOM_ICDF(23729), AOM_ICDF(24051), - AOM_ICDF(32768), }, - {AOM_ICDF(5900), AOM_ICDF(6226), AOM_ICDF(16797), AOM_ICDF(17116), - AOM_ICDF(32768), }, - {AOM_ICDF(3282), AOM_ICDF(3595), AOM_ICDF(10418), AOM_ICDF(10730), - AOM_ICDF(32768), }, - {AOM_ICDF(2289), AOM_ICDF(2601), AOM_ICDF(6048), AOM_ICDF(6360), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29278), AOM_ICDF(29837), AOM_ICDF(32038), AOM_ICDF(32360), - AOM_ICDF(32768), }, - {AOM_ICDF(19805), AOM_ICDF(20846), AOM_ICDF(31007), AOM_ICDF(31343), - AOM_ICDF(32768), }, - {AOM_ICDF(9976), AOM_ICDF(10433), AOM_ICDF(24483), AOM_ICDF(24848), - AOM_ICDF(32768), }, - {AOM_ICDF(5971), AOM_ICDF(6354), AOM_ICDF(17184), AOM_ICDF(17539), - AOM_ICDF(32768), }, - {AOM_ICDF(3497), AOM_ICDF(4693), AOM_ICDF(11940), AOM_ICDF(12291), - AOM_ICDF(32768), }, - {AOM_ICDF(1776), AOM_ICDF(2357), AOM_ICDF(6260), AOM_ICDF(6918), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(23166), AOM_ICDF(23821), AOM_ICDF(30269), AOM_ICDF(31075), - AOM_ICDF(31847), AOM_ICDF(32768), }, - {AOM_ICDF(14510), AOM_ICDF(16494), AOM_ICDF(25635), AOM_ICDF(28335), - AOM_ICDF(29759), AOM_ICDF(32768), }, - {AOM_ICDF(7730), AOM_ICDF(12354), AOM_ICDF(18089), AOM_ICDF(24005), - AOM_ICDF(25442), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17908), AOM_ICDF(24824), AOM_ICDF(30533), AOM_ICDF(31042), - AOM_ICDF(32768), }, - {AOM_ICDF(13950), AOM_ICDF(22899), AOM_ICDF(29969), AOM_ICDF(30646), - AOM_ICDF(32768), }, - {AOM_ICDF(11728), AOM_ICDF(17834), AOM_ICDF(27214), AOM_ICDF(28218), - AOM_ICDF(32768), }, - {AOM_ICDF(9581), AOM_ICDF(12074), AOM_ICDF(23689), AOM_ICDF(24616), - AOM_ICDF(32768), }, - {AOM_ICDF(6193), AOM_ICDF(6855), AOM_ICDF(16430), AOM_ICDF(16955), - AOM_ICDF(32768), }, - 
{AOM_ICDF(3393), AOM_ICDF(3712), AOM_ICDF(8802), AOM_ICDF(9226), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23368), AOM_ICDF(26826), AOM_ICDF(31183), AOM_ICDF(31579), - AOM_ICDF(32768), }, - {AOM_ICDF(16523), AOM_ICDF(21603), AOM_ICDF(30044), AOM_ICDF(30503), - AOM_ICDF(32768), }, - {AOM_ICDF(11171), AOM_ICDF(14152), AOM_ICDF(27009), AOM_ICDF(27644), - AOM_ICDF(32768), }, - {AOM_ICDF(8523), AOM_ICDF(9348), AOM_ICDF(21021), AOM_ICDF(21595), - AOM_ICDF(32768), }, - {AOM_ICDF(4780), AOM_ICDF(5196), AOM_ICDF(13440), AOM_ICDF(13786), - AOM_ICDF(32768), }, - {AOM_ICDF(4328), AOM_ICDF(5255), AOM_ICDF(10820), AOM_ICDF(11747), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(27020), AOM_ICDF(28644), AOM_ICDF(31643), AOM_ICDF(31990), - AOM_ICDF(32768), }, - {AOM_ICDF(18016), AOM_ICDF(21678), AOM_ICDF(30346), AOM_ICDF(30712), - AOM_ICDF(32768), }, - {AOM_ICDF(10497), AOM_ICDF(11555), AOM_ICDF(24827), AOM_ICDF(25156), - AOM_ICDF(32768), }, - {AOM_ICDF(6370), AOM_ICDF(6703), AOM_ICDF(18612), AOM_ICDF(18903), - AOM_ICDF(32768), }, - {AOM_ICDF(5355), AOM_ICDF(5738), AOM_ICDF(14790), AOM_ICDF(15173), - AOM_ICDF(32768), }, - {AOM_ICDF(3486), AOM_ICDF(5578), AOM_ICDF(11155), AOM_ICDF(13247), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28933), AOM_ICDF(29746), AOM_ICDF(31882), AOM_ICDF(32203), - AOM_ICDF(32768), }, - {AOM_ICDF(18171), AOM_ICDF(20286), AOM_ICDF(29713), AOM_ICDF(30052), - AOM_ICDF(32768), }, - {AOM_ICDF(9732), AOM_ICDF(10163), AOM_ICDF(23952), AOM_ICDF(24275), - AOM_ICDF(32768), }, - {AOM_ICDF(6084), AOM_ICDF(6480), AOM_ICDF(17459), AOM_ICDF(17771), - AOM_ICDF(32768), }, - {AOM_ICDF(3250), AOM_ICDF(3656), AOM_ICDF(10291), AOM_ICDF(10697), - AOM_ICDF(32768), }, - {AOM_ICDF(4681), AOM_ICDF(8192), AOM_ICDF(15214), AOM_ICDF(18725), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29940), AOM_ICDF(30510), AOM_ICDF(31933), AOM_ICDF(32260), - AOM_ICDF(32768), }, - {AOM_ICDF(17688), AOM_ICDF(19258), AOM_ICDF(29757), AOM_ICDF(30125), - 
AOM_ICDF(32768), }, - {AOM_ICDF(9668), AOM_ICDF(10798), AOM_ICDF(24231), AOM_ICDF(24605), - AOM_ICDF(32768), }, - {AOM_ICDF(7580), AOM_ICDF(7942), AOM_ICDF(19364), AOM_ICDF(19692), - AOM_ICDF(32768), }, - {AOM_ICDF(6043), AOM_ICDF(6446), AOM_ICDF(15578), AOM_ICDF(15981), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(28553), AOM_ICDF(29151), AOM_ICDF(31521), AOM_ICDF(32038), - AOM_ICDF(32413), AOM_ICDF(32768), }, - {AOM_ICDF(15138), AOM_ICDF(19554), AOM_ICDF(27559), AOM_ICDF(29750), - AOM_ICDF(31321), AOM_ICDF(32768), }, - {AOM_ICDF(3406), AOM_ICDF(18680), AOM_ICDF(23310), AOM_ICDF(27259), - AOM_ICDF(30430), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(29000), AOM_ICDF(30219), AOM_ICDF(32098), AOM_ICDF(32414), - AOM_ICDF(32768), }, - {AOM_ICDF(21324), AOM_ICDF(25278), AOM_ICDF(31789), AOM_ICDF(32126), - AOM_ICDF(32768), }, - {AOM_ICDF(14011), AOM_ICDF(21190), AOM_ICDF(30288), AOM_ICDF(30900), - AOM_ICDF(32768), }, - {AOM_ICDF(12762), AOM_ICDF(18476), AOM_ICDF(27140), AOM_ICDF(28461), - AOM_ICDF(32768), }, - {AOM_ICDF(11498), AOM_ICDF(14867), AOM_ICDF(24806), AOM_ICDF(25613), - AOM_ICDF(32768), }, - {AOM_ICDF(15872), AOM_ICDF(16512), AOM_ICDF(24192), AOM_ICDF(25088), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(29308), AOM_ICDF(30286), AOM_ICDF(32095), AOM_ICDF(32410), - AOM_ICDF(32768), }, - {AOM_ICDF(21819), AOM_ICDF(24215), AOM_ICDF(31771), AOM_ICDF(32103), - AOM_ICDF(32768), }, - {AOM_ICDF(14853), AOM_ICDF(18028), AOM_ICDF(29729), AOM_ICDF(30160), - AOM_ICDF(32768), }, - {AOM_ICDF(10598), AOM_ICDF(13400), AOM_ICDF(26555), AOM_ICDF(27043), - AOM_ICDF(32768), }, - {AOM_ICDF(10426), AOM_ICDF(12660), AOM_ICDF(21597), AOM_ICDF(23831), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(29866), AOM_ICDF(30588), AOM_ICDF(32131), AOM_ICDF(32445), - 
AOM_ICDF(32768), }, - {AOM_ICDF(23473), AOM_ICDF(25323), AOM_ICDF(31960), AOM_ICDF(32280), - AOM_ICDF(32768), }, - {AOM_ICDF(17529), AOM_ICDF(19173), AOM_ICDF(30278), AOM_ICDF(30577), - AOM_ICDF(32768), }, - {AOM_ICDF(9830), AOM_ICDF(11469), AOM_ICDF(23484), AOM_ICDF(25122), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(30405), AOM_ICDF(31032), AOM_ICDF(32139), AOM_ICDF(32451), - AOM_ICDF(32768), }, - {AOM_ICDF(25453), AOM_ICDF(27199), AOM_ICDF(32040), AOM_ICDF(32361), - AOM_ICDF(32768), }, - {AOM_ICDF(15663), AOM_ICDF(16432), AOM_ICDF(30654), AOM_ICDF(31038), - AOM_ICDF(32768), }, - {AOM_ICDF(6780), AOM_ICDF(10169), AOM_ICDF(18079), AOM_ICDF(21469), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29785), AOM_ICDF(30368), AOM_ICDF(31904), AOM_ICDF(32245), - AOM_ICDF(32768), }, - {AOM_ICDF(18173), AOM_ICDF(21111), AOM_ICDF(30105), AOM_ICDF(30575), - AOM_ICDF(32768), }, - {AOM_ICDF(8476), AOM_ICDF(13666), AOM_ICDF(28420), AOM_ICDF(28896), - AOM_ICDF(32768), }, - {AOM_ICDF(11427), AOM_ICDF(12066), AOM_ICDF(26197), AOM_ICDF(26691), - AOM_ICDF(32768), }, - {AOM_ICDF(6827), AOM_ICDF(10923), AOM_ICDF(21845), AOM_ICDF(25941), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 32X32 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(7848), AOM_ICDF(9841), AOM_ICDF(13623), AOM_ICDF(19351), - AOM_ICDF(23196), AOM_ICDF(32768), }, - {AOM_ICDF(3229), AOM_ICDF(5641), AOM_ICDF(7103), AOM_ICDF(13195), - AOM_ICDF(15046), AOM_ICDF(32768), }, - {AOM_ICDF(810), AOM_ICDF(3129), AOM_ICDF(3687), AOM_ICDF(8373), - 
AOM_ICDF(8971), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(8165), AOM_ICDF(12626), AOM_ICDF(22213), AOM_ICDF(23403), - AOM_ICDF(32768), }, - {AOM_ICDF(7602), AOM_ICDF(15378), AOM_ICDF(23248), AOM_ICDF(24331), - AOM_ICDF(32768), }, - {AOM_ICDF(5607), AOM_ICDF(10197), AOM_ICDF(18657), AOM_ICDF(20616), - AOM_ICDF(32768), }, - {AOM_ICDF(4498), AOM_ICDF(6539), AOM_ICDF(14461), AOM_ICDF(16104), - AOM_ICDF(32768), }, - {AOM_ICDF(3387), AOM_ICDF(4098), AOM_ICDF(10245), AOM_ICDF(11322), - AOM_ICDF(32768), }, - {AOM_ICDF(1793), AOM_ICDF(2111), AOM_ICDF(5262), AOM_ICDF(5646), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(16815), AOM_ICDF(19141), AOM_ICDF(27640), AOM_ICDF(28110), - AOM_ICDF(32768), }, - {AOM_ICDF(13156), AOM_ICDF(15592), AOM_ICDF(26089), AOM_ICDF(26592), - AOM_ICDF(32768), }, - {AOM_ICDF(9841), AOM_ICDF(11588), AOM_ICDF(22858), AOM_ICDF(23403), - AOM_ICDF(32768), }, - {AOM_ICDF(7765), AOM_ICDF(8871), AOM_ICDF(19127), AOM_ICDF(19526), - AOM_ICDF(32768), }, - {AOM_ICDF(5550), AOM_ICDF(6013), AOM_ICDF(14338), AOM_ICDF(14677), - AOM_ICDF(32768), }, - {AOM_ICDF(2658), AOM_ICDF(2969), AOM_ICDF(7230), AOM_ICDF(7541), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22765), AOM_ICDF(24278), AOM_ICDF(30194), AOM_ICDF(30535), - AOM_ICDF(32768), }, - {AOM_ICDF(15310), AOM_ICDF(17292), AOM_ICDF(27870), AOM_ICDF(28248), - AOM_ICDF(32768), }, - {AOM_ICDF(10047), AOM_ICDF(10839), AOM_ICDF(23345), AOM_ICDF(23710), - AOM_ICDF(32768), }, - {AOM_ICDF(6594), AOM_ICDF(6959), AOM_ICDF(17456), AOM_ICDF(17796), - AOM_ICDF(32768), }, - {AOM_ICDF(3784), AOM_ICDF(4109), AOM_ICDF(10984), AOM_ICDF(11297), - AOM_ICDF(32768), }, - {AOM_ICDF(1569), AOM_ICDF(1875), AOM_ICDF(4586), AOM_ICDF(4892), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25747), AOM_ICDF(26817), AOM_ICDF(31236), AOM_ICDF(31577), - AOM_ICDF(32768), }, - {AOM_ICDF(16018), AOM_ICDF(17720), AOM_ICDF(28833), AOM_ICDF(29219), - AOM_ICDF(32768), }, - {AOM_ICDF(9348), AOM_ICDF(10015), AOM_ICDF(22943), 
AOM_ICDF(23323), - AOM_ICDF(32768), }, - {AOM_ICDF(5841), AOM_ICDF(6167), AOM_ICDF(15774), AOM_ICDF(16107), - AOM_ICDF(32768), }, - {AOM_ICDF(3385), AOM_ICDF(3703), AOM_ICDF(9664), AOM_ICDF(9975), - AOM_ICDF(32768), }, - {AOM_ICDF(1460), AOM_ICDF(1768), AOM_ICDF(4704), AOM_ICDF(5011), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29634), AOM_ICDF(30134), AOM_ICDF(31898), AOM_ICDF(32218), - AOM_ICDF(32768), }, - {AOM_ICDF(16976), AOM_ICDF(17856), AOM_ICDF(29258), AOM_ICDF(29584), - AOM_ICDF(32768), }, - {AOM_ICDF(8521), AOM_ICDF(8858), AOM_ICDF(21252), AOM_ICDF(21574), - AOM_ICDF(32768), }, - {AOM_ICDF(4894), AOM_ICDF(5208), AOM_ICDF(13957), AOM_ICDF(14271), - AOM_ICDF(32768), }, - {AOM_ICDF(3140), AOM_ICDF(3452), AOM_ICDF(9099), AOM_ICDF(9411), - AOM_ICDF(32768), }, - {AOM_ICDF(1770), AOM_ICDF(2080), AOM_ICDF(5241), AOM_ICDF(5551), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(22253), AOM_ICDF(23279), AOM_ICDF(24319), AOM_ICDF(27691), - AOM_ICDF(30884), AOM_ICDF(32768), }, - {AOM_ICDF(6281), AOM_ICDF(8348), AOM_ICDF(9473), AOM_ICDF(15740), - AOM_ICDF(24879), AOM_ICDF(32768), }, - {AOM_ICDF(1265), AOM_ICDF(3893), AOM_ICDF(4482), AOM_ICDF(9694), - AOM_ICDF(18376), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17243), AOM_ICDF(18993), AOM_ICDF(28515), AOM_ICDF(29242), - AOM_ICDF(32768), }, - {AOM_ICDF(15645), AOM_ICDF(23632), AOM_ICDF(29905), AOM_ICDF(30416), - AOM_ICDF(32768), }, - {AOM_ICDF(11203), AOM_ICDF(18441), AOM_ICDF(27037), AOM_ICDF(27930), - AOM_ICDF(32768), }, - {AOM_ICDF(9306), AOM_ICDF(13788), AOM_ICDF(23647), AOM_ICDF(24669), - AOM_ICDF(32768), }, - {AOM_ICDF(8076), AOM_ICDF(10237), AOM_ICDF(20500), AOM_ICDF(21437), - AOM_ICDF(32768), }, - {AOM_ICDF(7214), AOM_ICDF(8133), AOM_ICDF(17608), AOM_ICDF(18202), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23555), AOM_ICDF(26147), AOM_ICDF(31229), AOM_ICDF(31581), - AOM_ICDF(32768), }, - {AOM_ICDF(16046), AOM_ICDF(20455), AOM_ICDF(29711), AOM_ICDF(30107), - 
AOM_ICDF(32768), }, - {AOM_ICDF(10810), AOM_ICDF(14014), AOM_ICDF(25967), AOM_ICDF(26499), - AOM_ICDF(32768), }, - {AOM_ICDF(8267), AOM_ICDF(9930), AOM_ICDF(21704), AOM_ICDF(22244), - AOM_ICDF(32768), }, - {AOM_ICDF(5637), AOM_ICDF(6282), AOM_ICDF(15954), AOM_ICDF(16508), - AOM_ICDF(32768), }, - {AOM_ICDF(4090), AOM_ICDF(4363), AOM_ICDF(11771), AOM_ICDF(12044), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26146), AOM_ICDF(27425), AOM_ICDF(31658), AOM_ICDF(31983), - AOM_ICDF(32768), }, - {AOM_ICDF(17486), AOM_ICDF(20295), AOM_ICDF(30279), AOM_ICDF(30621), - AOM_ICDF(32768), }, - {AOM_ICDF(10812), AOM_ICDF(12230), AOM_ICDF(26095), AOM_ICDF(26460), - AOM_ICDF(32768), }, - {AOM_ICDF(7510), AOM_ICDF(8042), AOM_ICDF(21058), AOM_ICDF(21425), - AOM_ICDF(32768), }, - {AOM_ICDF(4566), AOM_ICDF(4916), AOM_ICDF(13594), AOM_ICDF(13891), - AOM_ICDF(32768), }, - {AOM_ICDF(1956), AOM_ICDF(2445), AOM_ICDF(5380), AOM_ICDF(5869), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28423), AOM_ICDF(29253), AOM_ICDF(31959), AOM_ICDF(32277), - AOM_ICDF(32768), }, - {AOM_ICDF(18711), AOM_ICDF(20638), AOM_ICDF(30445), AOM_ICDF(30777), - AOM_ICDF(32768), }, - {AOM_ICDF(10301), AOM_ICDF(10903), AOM_ICDF(24702), AOM_ICDF(25060), - AOM_ICDF(32768), }, - {AOM_ICDF(6531), AOM_ICDF(6885), AOM_ICDF(18215), AOM_ICDF(18535), - AOM_ICDF(32768), }, - {AOM_ICDF(3965), AOM_ICDF(4265), AOM_ICDF(11701), AOM_ICDF(12023), - AOM_ICDF(32768), }, - {AOM_ICDF(3255), AOM_ICDF(3906), AOM_ICDF(8897), AOM_ICDF(9548), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29905), AOM_ICDF(30382), AOM_ICDF(32053), AOM_ICDF(32369), - AOM_ICDF(32768), }, - {AOM_ICDF(19724), AOM_ICDF(20376), AOM_ICDF(30778), AOM_ICDF(31101), - AOM_ICDF(32768), }, - {AOM_ICDF(10430), AOM_ICDF(10786), AOM_ICDF(24620), AOM_ICDF(24943), - AOM_ICDF(32768), }, - {AOM_ICDF(6151), AOM_ICDF(6475), AOM_ICDF(17188), AOM_ICDF(17504), - AOM_ICDF(32768), }, - {AOM_ICDF(3728), AOM_ICDF(4034), AOM_ICDF(11352), AOM_ICDF(11658), - 
AOM_ICDF(32768), }, - {AOM_ICDF(1456), AOM_ICDF(1748), AOM_ICDF(5024), AOM_ICDF(5316), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(24883), AOM_ICDF(25616), AOM_ICDF(27995), AOM_ICDF(29251), - AOM_ICDF(31055), AOM_ICDF(32768), }, - {AOM_ICDF(9802), AOM_ICDF(11841), AOM_ICDF(18691), AOM_ICDF(22179), - AOM_ICDF(26383), AOM_ICDF(32768), }, - {AOM_ICDF(4096), AOM_ICDF(7928), AOM_ICDF(14072), AOM_ICDF(21042), - AOM_ICDF(23453), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(10363), AOM_ICDF(20924), AOM_ICDF(29116), AOM_ICDF(29906), - AOM_ICDF(32768), }, - {AOM_ICDF(10682), AOM_ICDF(22326), AOM_ICDF(29093), AOM_ICDF(29642), - AOM_ICDF(32768), }, - {AOM_ICDF(10304), AOM_ICDF(21073), AOM_ICDF(26843), AOM_ICDF(28904), - AOM_ICDF(32768), }, - {AOM_ICDF(6138), AOM_ICDF(13221), AOM_ICDF(22475), AOM_ICDF(25119), - AOM_ICDF(32768), }, - {AOM_ICDF(3788), AOM_ICDF(4356), AOM_ICDF(10607), AOM_ICDF(12690), - AOM_ICDF(32768), }, - {AOM_ICDF(1950), AOM_ICDF(4291), AOM_ICDF(10923), AOM_ICDF(12873), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21958), AOM_ICDF(27093), AOM_ICDF(30741), AOM_ICDF(31349), - AOM_ICDF(32768), }, - {AOM_ICDF(18725), AOM_ICDF(23406), AOM_ICDF(30541), AOM_ICDF(31268), - AOM_ICDF(32768), }, - {AOM_ICDF(15634), AOM_ICDF(17134), AOM_ICDF(26450), AOM_ICDF(27092), - AOM_ICDF(32768), }, - {AOM_ICDF(10012), AOM_ICDF(11287), AOM_ICDF(24758), AOM_ICDF(25304), - AOM_ICDF(32768), }, - {AOM_ICDF(6242), AOM_ICDF(7802), AOM_ICDF(19895), AOM_ICDF(21065), - AOM_ICDF(32768), }, - {AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(20480), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26587), AOM_ICDF(27934), AOM_ICDF(31817), AOM_ICDF(32094), - AOM_ICDF(32768), }, - {AOM_ICDF(20234), AOM_ICDF(22651), AOM_ICDF(30576), AOM_ICDF(30857), - AOM_ICDF(32768), }, - {AOM_ICDF(13405), AOM_ICDF(14708), AOM_ICDF(26624), AOM_ICDF(27183), - AOM_ICDF(32768), }, - {AOM_ICDF(9132), AOM_ICDF(11281), AOM_ICDF(19876), 
AOM_ICDF(21487), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(15522), AOM_ICDF(20696), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28277), AOM_ICDF(29312), AOM_ICDF(32101), AOM_ICDF(32400), - AOM_ICDF(32768), }, - {AOM_ICDF(18946), AOM_ICDF(23037), AOM_ICDF(31186), AOM_ICDF(31565), - AOM_ICDF(32768), }, - {AOM_ICDF(14043), AOM_ICDF(14980), AOM_ICDF(29491), AOM_ICDF(30193), - AOM_ICDF(32768), }, - {AOM_ICDF(9638), AOM_ICDF(12529), AOM_ICDF(21203), AOM_ICDF(24094), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(18022), AOM_ICDF(22938), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(31039), AOM_ICDF(31404), AOM_ICDF(32048), AOM_ICDF(32372), - AOM_ICDF(32768), }, - {AOM_ICDF(20567), AOM_ICDF(21869), AOM_ICDF(28724), AOM_ICDF(29256), - AOM_ICDF(32768), }, - {AOM_ICDF(10000), AOM_ICDF(11250), AOM_ICDF(22768), AOM_ICDF(23393), - AOM_ICDF(32768), }, - {AOM_ICDF(6291), AOM_ICDF(7078), AOM_ICDF(20447), AOM_ICDF(21234), - AOM_ICDF(32768), }, - {AOM_ICDF(3072), AOM_ICDF(6144), AOM_ICDF(18432), AOM_ICDF(21504), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(23448), AOM_ICDF(25882), AOM_ICDF(29692), AOM_ICDF(31272), - AOM_ICDF(32065), AOM_ICDF(32768), }, - {AOM_ICDF(4276), AOM_ICDF(17832), AOM_ICDF(22156), AOM_ICDF(28463), - AOM_ICDF(30374), AOM_ICDF(32768), }, - {AOM_ICDF(842), AOM_ICDF(20937), AOM_ICDF(22447), AOM_ICDF(28559), - AOM_ICDF(30333), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(30469), AOM_ICDF(30991), AOM_ICDF(32114), AOM_ICDF(32435), - AOM_ICDF(32768), }, - {AOM_ICDF(27295), AOM_ICDF(29153), AOM_ICDF(31917), AOM_ICDF(32269), - AOM_ICDF(32768), }, - {AOM_ICDF(16309), AOM_ICDF(22060), AOM_ICDF(29937), 
AOM_ICDF(30686), - AOM_ICDF(32768), }, - {AOM_ICDF(11440), AOM_ICDF(16853), AOM_ICDF(26633), AOM_ICDF(27427), - AOM_ICDF(32768), }, - {AOM_ICDF(13069), AOM_ICDF(15405), AOM_ICDF(27401), AOM_ICDF(28033), - AOM_ICDF(32768), }, - {AOM_ICDF(9084), AOM_ICDF(10058), AOM_ICDF(23197), AOM_ICDF(23684), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(30728), AOM_ICDF(31202), AOM_ICDF(32138), AOM_ICDF(32450), - AOM_ICDF(32768), }, - {AOM_ICDF(23421), AOM_ICDF(26186), AOM_ICDF(31939), AOM_ICDF(32278), - AOM_ICDF(32768), }, - {AOM_ICDF(12249), AOM_ICDF(15027), AOM_ICDF(28348), AOM_ICDF(28854), - AOM_ICDF(32768), }, - {AOM_ICDF(5667), AOM_ICDF(6899), AOM_ICDF(22174), AOM_ICDF(23652), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(10650), AOM_ICDF(17203), AOM_ICDF(20480), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(30721), AOM_ICDF(31093), AOM_ICDF(32141), AOM_ICDF(32453), - AOM_ICDF(32768), }, - {AOM_ICDF(24052), AOM_ICDF(25175), AOM_ICDF(31923), AOM_ICDF(32231), - AOM_ICDF(32768), }, - {AOM_ICDF(8145), AOM_ICDF(9281), AOM_ICDF(27654), AOM_ICDF(28412), - AOM_ICDF(32768), }, - {AOM_ICDF(7373), AOM_ICDF(9830), AOM_ICDF(21299), AOM_ICDF(23757), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(31284), AOM_ICDF(31621), AOM_ICDF(32143), AOM_ICDF(32455), - AOM_ICDF(32768), }, - {AOM_ICDF(27783), AOM_ICDF(28563), AOM_ICDF(32045), AOM_ICDF(32361), - AOM_ICDF(32768), }, - {AOM_ICDF(10149), AOM_ICDF(12179), AOM_ICDF(28128), AOM_ICDF(28998), - AOM_ICDF(32768), }, - {AOM_ICDF(5650), AOM_ICDF(9039), AOM_ICDF(19209), AOM_ICDF(22599), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), 
AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(31038), AOM_ICDF(31383), AOM_ICDF(32035), AOM_ICDF(32357), - AOM_ICDF(32768), }, - {AOM_ICDF(20689), AOM_ICDF(22001), AOM_ICDF(28880), AOM_ICDF(29479), - AOM_ICDF(32768), }, - {AOM_ICDF(7827), AOM_ICDF(10613), AOM_ICDF(24141), AOM_ICDF(24735), - AOM_ICDF(32768), }, - {AOM_ICDF(8021), AOM_ICDF(8585), AOM_ICDF(22014), AOM_ICDF(22383), - AOM_ICDF(32768), }, - {AOM_ICDF(6047), AOM_ICDF(6350), AOM_ICDF(19918), AOM_ICDF(20220), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, -}; +static const aom_cdf_prob + av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, + 22571, 25830) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, + 27255, 28546, 31784) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, + 21856, 25692, 28034) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, + 28027, 28377, 30876) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, + 26682, 29229, 31045) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, + 25483, 25843, 32056) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, + 29326, 31082, 32050) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { 
AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, + 29486, 29724, 29807, 32570) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } } }; -static const coeff_cdf_model -av1_default_coef_head_cdfs_q2[TX_SIZES][PLANE_TYPES] = { - { // TX 4X4 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(23035), AOM_ICDF(23799), AOM_ICDF(27745), AOM_ICDF(29607), - AOM_ICDF(30130), AOM_ICDF(32768), }, - {AOM_ICDF(12409), AOM_ICDF(14763), AOM_ICDF(22883), AOM_ICDF(26775), - AOM_ICDF(27649), AOM_ICDF(32768), }, - {AOM_ICDF(5237), AOM_ICDF(9433), AOM_ICDF(15597), AOM_ICDF(21779), - AOM_ICDF(23224), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(10424), AOM_ICDF(17678), AOM_ICDF(28850), AOM_ICDF(29349), - AOM_ICDF(32768), }, - {AOM_ICDF(10376), AOM_ICDF(16902), AOM_ICDF(28779), AOM_ICDF(29265), - AOM_ICDF(32768), }, - {AOM_ICDF(10166), AOM_ICDF(14387), AOM_ICDF(26253), AOM_ICDF(26807), - AOM_ICDF(32768), }, - {AOM_ICDF(8474), AOM_ICDF(9927), AOM_ICDF(22092), AOM_ICDF(22697), - AOM_ICDF(32768), }, - {AOM_ICDF(6415), AOM_ICDF(6911), AOM_ICDF(17155), AOM_ICDF(17579), - AOM_ICDF(32768), }, - {AOM_ICDF(4611), AOM_ICDF(4928), AOM_ICDF(12174), AOM_ICDF(12497), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(16984), AOM_ICDF(21802), AOM_ICDF(30901), AOM_ICDF(31373), - AOM_ICDF(32768), }, - {AOM_ICDF(14003), AOM_ICDF(19369), AOM_ICDF(30193), AOM_ICDF(30615), - AOM_ICDF(32768), }, - {AOM_ICDF(10729), AOM_ICDF(13233), AOM_ICDF(26938), AOM_ICDF(27455), - AOM_ICDF(32768), }, - {AOM_ICDF(8604), AOM_ICDF(9526), AOM_ICDF(22436), AOM_ICDF(22989), - AOM_ICDF(32768), }, - {AOM_ICDF(6828), AOM_ICDF(7236), AOM_ICDF(18056), AOM_ICDF(18456), - AOM_ICDF(32768), }, - {AOM_ICDF(4302), AOM_ICDF(4555), AOM_ICDF(12209), AOM_ICDF(12462), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(20261), AOM_ICDF(24381), AOM_ICDF(31612), AOM_ICDF(31989), - AOM_ICDF(32768), }, - {AOM_ICDF(13775), AOM_ICDF(20449), AOM_ICDF(30685), AOM_ICDF(31111), - AOM_ICDF(32768), }, - 
{AOM_ICDF(10459), AOM_ICDF(13768), AOM_ICDF(27504), AOM_ICDF(28114), - AOM_ICDF(32768), }, - {AOM_ICDF(7994), AOM_ICDF(8989), AOM_ICDF(22906), AOM_ICDF(23636), - AOM_ICDF(32768), }, - {AOM_ICDF(5928), AOM_ICDF(6460), AOM_ICDF(16884), AOM_ICDF(17720), - AOM_ICDF(32768), }, - {AOM_ICDF(4520), AOM_ICDF(7910), AOM_ICDF(12429), AOM_ICDF(16949), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(17822), AOM_ICDF(26021), AOM_ICDF(31751), AOM_ICDF(32150), - AOM_ICDF(32768), }, - {AOM_ICDF(13484), AOM_ICDF(23372), AOM_ICDF(31305), AOM_ICDF(31747), - AOM_ICDF(32768), }, - {AOM_ICDF(11009), AOM_ICDF(15469), AOM_ICDF(28452), AOM_ICDF(29132), - AOM_ICDF(32768), }, - {AOM_ICDF(8358), AOM_ICDF(9357), AOM_ICDF(22412), AOM_ICDF(23385), - AOM_ICDF(32768), }, - {AOM_ICDF(9392), AOM_ICDF(10018), AOM_ICDF(18158), AOM_ICDF(19202), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(5236), AOM_ICDF(26529), AOM_ICDF(31709), AOM_ICDF(32201), - AOM_ICDF(32768), }, - {AOM_ICDF(5710), AOM_ICDF(25925), AOM_ICDF(31254), AOM_ICDF(31967), - AOM_ICDF(32768), }, - {AOM_ICDF(7645), AOM_ICDF(19427), AOM_ICDF(28170), AOM_ICDF(29920), - AOM_ICDF(32768), }, - {AOM_ICDF(7427), AOM_ICDF(13350), AOM_ICDF(23253), AOM_ICDF(25438), - AOM_ICDF(32768), }, - {AOM_ICDF(4681), AOM_ICDF(6687), AOM_ICDF(15381), AOM_ICDF(18725), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(11176), AOM_ICDF(18297), AOM_ICDF(19062), AOM_ICDF(28984), - AOM_ICDF(29496), AOM_ICDF(32768), }, - {AOM_ICDF(9778), AOM_ICDF(17798), AOM_ICDF(19934), AOM_ICDF(28434), - AOM_ICDF(28921), AOM_ICDF(32768), }, - {AOM_ICDF(4806), AOM_ICDF(14260), AOM_ICDF(17259), AOM_ICDF(26368), - AOM_ICDF(26942), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(21802), AOM_ICDF(22916), AOM_ICDF(31657), AOM_ICDF(31989), - AOM_ICDF(32768), }, - 
{AOM_ICDF(16874), AOM_ICDF(20345), AOM_ICDF(31048), AOM_ICDF(31389), - AOM_ICDF(32768), }, - {AOM_ICDF(10717), AOM_ICDF(12576), AOM_ICDF(26899), AOM_ICDF(27294), - AOM_ICDF(32768), }, - {AOM_ICDF(8468), AOM_ICDF(9404), AOM_ICDF(21928), AOM_ICDF(22358), - AOM_ICDF(32768), }, - {AOM_ICDF(5992), AOM_ICDF(6521), AOM_ICDF(16309), AOM_ICDF(16729), - AOM_ICDF(32768), }, - {AOM_ICDF(5134), AOM_ICDF(5452), AOM_ICDF(11491), AOM_ICDF(11865), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22003), AOM_ICDF(24147), AOM_ICDF(31841), AOM_ICDF(32177), - AOM_ICDF(32768), }, - {AOM_ICDF(17179), AOM_ICDF(20593), AOM_ICDF(31041), AOM_ICDF(31394), - AOM_ICDF(32768), }, - {AOM_ICDF(9282), AOM_ICDF(10544), AOM_ICDF(25698), AOM_ICDF(26133), - AOM_ICDF(32768), }, - {AOM_ICDF(6301), AOM_ICDF(7013), AOM_ICDF(19066), AOM_ICDF(19557), - AOM_ICDF(32768), }, - {AOM_ICDF(3845), AOM_ICDF(4316), AOM_ICDF(12209), AOM_ICDF(12812), - AOM_ICDF(32768), }, - {AOM_ICDF(4819), AOM_ICDF(6746), AOM_ICDF(11565), AOM_ICDF(13011), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22820), AOM_ICDF(26023), AOM_ICDF(31888), AOM_ICDF(32236), - AOM_ICDF(32768), }, - {AOM_ICDF(17130), AOM_ICDF(21510), AOM_ICDF(31268), AOM_ICDF(31632), - AOM_ICDF(32768), }, - {AOM_ICDF(10062), AOM_ICDF(11898), AOM_ICDF(26787), AOM_ICDF(27281), - AOM_ICDF(32768), }, - {AOM_ICDF(7681), AOM_ICDF(8590), AOM_ICDF(21264), AOM_ICDF(22034), - AOM_ICDF(32768), }, - {AOM_ICDF(4413), AOM_ICDF(5143), AOM_ICDF(13605), AOM_ICDF(14712), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(21845), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(20237), AOM_ICDF(25695), AOM_ICDF(31868), AOM_ICDF(32222), - AOM_ICDF(32768), }, - {AOM_ICDF(15551), AOM_ICDF(22658), AOM_ICDF(31236), AOM_ICDF(31659), - AOM_ICDF(32768), }, - {AOM_ICDF(9584), AOM_ICDF(12389), AOM_ICDF(26347), AOM_ICDF(27242), - AOM_ICDF(32768), }, - {AOM_ICDF(6067), AOM_ICDF(7231), AOM_ICDF(19625), AOM_ICDF(20707), - AOM_ICDF(32768), }, - 
{AOM_ICDF(3724), AOM_ICDF(4312), AOM_ICDF(11269), AOM_ICDF(12425), - AOM_ICDF(32768), }, - {AOM_ICDF(4096), AOM_ICDF(6554), AOM_ICDF(9830), AOM_ICDF(12288), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(11726), AOM_ICDF(26639), AOM_ICDF(31977), AOM_ICDF(32340), - AOM_ICDF(32768), }, - {AOM_ICDF(10754), AOM_ICDF(25823), AOM_ICDF(31568), AOM_ICDF(32060), - AOM_ICDF(32768), }, - {AOM_ICDF(8761), AOM_ICDF(16650), AOM_ICDF(27884), AOM_ICDF(29394), - AOM_ICDF(32768), }, - {AOM_ICDF(7387), AOM_ICDF(9941), AOM_ICDF(21377), AOM_ICDF(23333), - AOM_ICDF(32768), }, - {AOM_ICDF(2374), AOM_ICDF(3799), AOM_ICDF(16147), AOM_ICDF(19471), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(29271), AOM_ICDF(29645), AOM_ICDF(31447), AOM_ICDF(31951), - AOM_ICDF(32313), AOM_ICDF(32768), }, - {AOM_ICDF(22174), AOM_ICDF(23288), AOM_ICDF(29633), AOM_ICDF(31096), - AOM_ICDF(31701), AOM_ICDF(32768), }, - {AOM_ICDF(13601), AOM_ICDF(16603), AOM_ICDF(25296), AOM_ICDF(28966), - AOM_ICDF(30043), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(13850), AOM_ICDF(26266), AOM_ICDF(31653), AOM_ICDF(32083), - AOM_ICDF(32768), }, - {AOM_ICDF(11979), AOM_ICDF(24610), AOM_ICDF(31369), AOM_ICDF(31810), - AOM_ICDF(32768), }, - {AOM_ICDF(11325), AOM_ICDF(18989), AOM_ICDF(29109), AOM_ICDF(29770), - AOM_ICDF(32768), }, - {AOM_ICDF(9338), AOM_ICDF(11892), AOM_ICDF(25324), AOM_ICDF(26115), - AOM_ICDF(32768), }, - {AOM_ICDF(5725), AOM_ICDF(6243), AOM_ICDF(18483), AOM_ICDF(18919), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(9830), AOM_ICDF(16384), AOM_ICDF(19661), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18097), AOM_ICDF(27765), AOM_ICDF(31891), AOM_ICDF(32286), - AOM_ICDF(32768), }, - {AOM_ICDF(14735), AOM_ICDF(24632), AOM_ICDF(31577), AOM_ICDF(31970), - AOM_ICDF(32768), }, - {AOM_ICDF(11031), AOM_ICDF(15675), AOM_ICDF(29109), AOM_ICDF(29716), - 
AOM_ICDF(32768), }, - {AOM_ICDF(8859), AOM_ICDF(9891), AOM_ICDF(23909), AOM_ICDF(24940), - AOM_ICDF(32768), }, - {AOM_ICDF(7864), AOM_ICDF(11796), AOM_ICDF(20972), AOM_ICDF(24904), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(21057), AOM_ICDF(29116), AOM_ICDF(32033), AOM_ICDF(32367), - AOM_ICDF(32768), }, - {AOM_ICDF(15287), AOM_ICDF(25704), AOM_ICDF(31791), AOM_ICDF(32151), - AOM_ICDF(32768), }, - {AOM_ICDF(12927), AOM_ICDF(18993), AOM_ICDF(30815), AOM_ICDF(31329), - AOM_ICDF(32768), }, - {AOM_ICDF(13227), AOM_ICDF(16234), AOM_ICDF(27657), AOM_ICDF(28860), - AOM_ICDF(32768), }, - {AOM_ICDF(6899), AOM_ICDF(12072), AOM_ICDF(18971), AOM_ICDF(25869), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(17688), AOM_ICDF(28768), AOM_ICDF(32140), AOM_ICDF(32435), - AOM_ICDF(32768), }, - {AOM_ICDF(13473), AOM_ICDF(26360), AOM_ICDF(31944), AOM_ICDF(32307), - AOM_ICDF(32768), }, - {AOM_ICDF(12653), AOM_ICDF(18817), AOM_ICDF(28875), AOM_ICDF(30497), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(20025), AOM_ICDF(25486), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(6820), AOM_ICDF(28765), AOM_ICDF(31878), AOM_ICDF(32323), - AOM_ICDF(32768), }, - {AOM_ICDF(7737), AOM_ICDF(28672), AOM_ICDF(31972), AOM_ICDF(32313), - AOM_ICDF(32768), }, - {AOM_ICDF(11796), AOM_ICDF(18350), AOM_ICDF(24904), AOM_ICDF(28836), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(30079), AOM_ICDF(30525), AOM_ICDF(31559), AOM_ICDF(32085), - AOM_ICDF(32407), AOM_ICDF(32768), }, - {AOM_ICDF(22148), AOM_ICDF(24035), AOM_ICDF(29557), AOM_ICDF(31423), - AOM_ICDF(31881), AOM_ICDF(32768), }, - {AOM_ICDF(13266), AOM_ICDF(17717), AOM_ICDF(26069), AOM_ICDF(29825), - AOM_ICDF(30780), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(18219), AOM_ICDF(27530), AOM_ICDF(32048), AOM_ICDF(32373), - AOM_ICDF(32768), }, - {AOM_ICDF(14664), AOM_ICDF(25532), AOM_ICDF(31886), AOM_ICDF(32244), - AOM_ICDF(32768), }, - {AOM_ICDF(11683), AOM_ICDF(19554), AOM_ICDF(30330), AOM_ICDF(30870), - AOM_ICDF(32768), }, - {AOM_ICDF(9410), AOM_ICDF(14238), AOM_ICDF(25794), AOM_ICDF(27268), - AOM_ICDF(32768), }, - {AOM_ICDF(6629), AOM_ICDF(9580), AOM_ICDF(20186), AOM_ICDF(22187), - AOM_ICDF(32768), }, - {AOM_ICDF(2891), AOM_ICDF(4337), AOM_ICDF(11083), AOM_ICDF(13493), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20016), AOM_ICDF(28471), AOM_ICDF(32074), AOM_ICDF(32401), - AOM_ICDF(32768), }, - {AOM_ICDF(16915), AOM_ICDF(26047), AOM_ICDF(31965), AOM_ICDF(32300), - AOM_ICDF(32768), }, - {AOM_ICDF(10725), AOM_ICDF(18206), AOM_ICDF(30056), AOM_ICDF(30606), - AOM_ICDF(32768), }, - {AOM_ICDF(6883), AOM_ICDF(13990), AOM_ICDF(26334), AOM_ICDF(27531), - AOM_ICDF(32768), }, - {AOM_ICDF(11529), AOM_ICDF(15170), AOM_ICDF(22452), AOM_ICDF(24879), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(23488), AOM_ICDF(29744), AOM_ICDF(32117), AOM_ICDF(32442), - AOM_ICDF(32768), }, - {AOM_ICDF(17520), AOM_ICDF(27259), AOM_ICDF(32056), AOM_ICDF(32389), - AOM_ICDF(32768), }, - {AOM_ICDF(13107), AOM_ICDF(20597), AOM_ICDF(31416), AOM_ICDF(32092), - AOM_ICDF(32768), }, - {AOM_ICDF(20165), AOM_ICDF(22686), AOM_ICDF(26887), AOM_ICDF(29407), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(17711), AOM_ICDF(29963), AOM_ICDF(32137), AOM_ICDF(32452), - AOM_ICDF(32768), }, - {AOM_ICDF(14078), AOM_ICDF(28336), AOM_ICDF(32026), AOM_ICDF(32391), - AOM_ICDF(32768), }, - {AOM_ICDF(11129), AOM_ICDF(28749), AOM_ICDF(30295), AOM_ICDF(31222), - AOM_ICDF(32768), }, - {AOM_ICDF(7447), AOM_ICDF(13405), AOM_ICDF(22342), AOM_ICDF(26810), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(14413), AOM_ICDF(30309), AOM_ICDF(32090), AOM_ICDF(32471), - AOM_ICDF(32768), }, - {AOM_ICDF(11814), AOM_ICDF(30354), AOM_ICDF(32251), AOM_ICDF(32509), - AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(21845), AOM_ICDF(27307), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 8X8 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(16945), AOM_ICDF(18241), AOM_ICDF(25718), AOM_ICDF(28152), - AOM_ICDF(29383), AOM_ICDF(32768), }, - {AOM_ICDF(7095), AOM_ICDF(10051), AOM_ICDF(18830), AOM_ICDF(23174), - AOM_ICDF(24906), AOM_ICDF(32768), }, - {AOM_ICDF(2585), AOM_ICDF(6677), AOM_ICDF(10951), AOM_ICDF(17411), - AOM_ICDF(18916), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(12894), AOM_ICDF(17897), AOM_ICDF(28218), AOM_ICDF(28651), - AOM_ICDF(32768), }, - {AOM_ICDF(11333), AOM_ICDF(16802), AOM_ICDF(27676), AOM_ICDF(28153), - AOM_ICDF(32768), }, - {AOM_ICDF(10166), AOM_ICDF(13829), AOM_ICDF(25072), AOM_ICDF(25646), - AOM_ICDF(32768), }, - {AOM_ICDF(8356), AOM_ICDF(9772), 
AOM_ICDF(21358), AOM_ICDF(21912), - AOM_ICDF(32768), }, - {AOM_ICDF(5988), AOM_ICDF(6506), AOM_ICDF(16203), AOM_ICDF(16647), - AOM_ICDF(32768), }, - {AOM_ICDF(3684), AOM_ICDF(4012), AOM_ICDF(10039), AOM_ICDF(10367), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18192), AOM_ICDF(21044), AOM_ICDF(30229), AOM_ICDF(30597), - AOM_ICDF(32768), }, - {AOM_ICDF(14976), AOM_ICDF(18218), AOM_ICDF(29191), AOM_ICDF(29564), - AOM_ICDF(32768), }, - {AOM_ICDF(10914), AOM_ICDF(12508), AOM_ICDF(25451), AOM_ICDF(25857), - AOM_ICDF(32768), }, - {AOM_ICDF(7970), AOM_ICDF(8605), AOM_ICDF(20619), AOM_ICDF(21011), - AOM_ICDF(32768), }, - {AOM_ICDF(5555), AOM_ICDF(5926), AOM_ICDF(15730), AOM_ICDF(16091), - AOM_ICDF(32768), }, - {AOM_ICDF(3522), AOM_ICDF(3847), AOM_ICDF(10567), AOM_ICDF(10892), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(21896), AOM_ICDF(23866), AOM_ICDF(31136), AOM_ICDF(31486), - AOM_ICDF(32768), }, - {AOM_ICDF(15913), AOM_ICDF(18331), AOM_ICDF(29670), AOM_ICDF(30019), - AOM_ICDF(32768), }, - {AOM_ICDF(10158), AOM_ICDF(10878), AOM_ICDF(24664), AOM_ICDF(25024), - AOM_ICDF(32768), }, - {AOM_ICDF(6692), AOM_ICDF(7070), AOM_ICDF(18934), AOM_ICDF(19267), - AOM_ICDF(32768), }, - {AOM_ICDF(4603), AOM_ICDF(4914), AOM_ICDF(13724), AOM_ICDF(14041), - AOM_ICDF(32768), }, - {AOM_ICDF(2378), AOM_ICDF(3171), AOM_ICDF(7663), AOM_ICDF(8456), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24113), AOM_ICDF(25740), AOM_ICDF(31668), AOM_ICDF(32000), - AOM_ICDF(32768), }, - {AOM_ICDF(16618), AOM_ICDF(18583), AOM_ICDF(30173), AOM_ICDF(30511), - AOM_ICDF(32768), }, - {AOM_ICDF(10122), AOM_ICDF(10666), AOM_ICDF(24877), AOM_ICDF(25222), - AOM_ICDF(32768), }, - {AOM_ICDF(6721), AOM_ICDF(7062), AOM_ICDF(19250), AOM_ICDF(19588), - AOM_ICDF(32768), }, - {AOM_ICDF(4641), AOM_ICDF(4957), AOM_ICDF(13698), AOM_ICDF(14021), - AOM_ICDF(32768), }, - {AOM_ICDF(3324), AOM_ICDF(4749), AOM_ICDF(9498), AOM_ICDF(10923), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(24933), 
AOM_ICDF(27294), AOM_ICDF(31876), AOM_ICDF(32207), - AOM_ICDF(32768), }, - {AOM_ICDF(17505), AOM_ICDF(20214), AOM_ICDF(30842), AOM_ICDF(31189), - AOM_ICDF(32768), }, - {AOM_ICDF(10756), AOM_ICDF(11345), AOM_ICDF(25989), AOM_ICDF(26362), - AOM_ICDF(32768), }, - {AOM_ICDF(7374), AOM_ICDF(7763), AOM_ICDF(19820), AOM_ICDF(20160), - AOM_ICDF(32768), }, - {AOM_ICDF(5003), AOM_ICDF(5328), AOM_ICDF(15420), AOM_ICDF(15723), - AOM_ICDF(32768), }, - {AOM_ICDF(4915), AOM_ICDF(9830), AOM_ICDF(18022), AOM_ICDF(22938), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(7874), AOM_ICDF(17174), AOM_ICDF(19119), AOM_ICDF(28514), - AOM_ICDF(29361), AOM_ICDF(32768), }, - {AOM_ICDF(3407), AOM_ICDF(13628), AOM_ICDF(16836), AOM_ICDF(26723), - AOM_ICDF(27681), AOM_ICDF(32768), }, - {AOM_ICDF(1062), AOM_ICDF(11514), AOM_ICDF(14002), AOM_ICDF(24081), - AOM_ICDF(25232), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23614), AOM_ICDF(24717), AOM_ICDF(31593), AOM_ICDF(31927), - AOM_ICDF(32768), }, - {AOM_ICDF(18177), AOM_ICDF(21581), AOM_ICDF(30890), AOM_ICDF(31234), - AOM_ICDF(32768), }, - {AOM_ICDF(12535), AOM_ICDF(14549), AOM_ICDF(27749), AOM_ICDF(28134), - AOM_ICDF(32768), }, - {AOM_ICDF(9687), AOM_ICDF(10712), AOM_ICDF(23848), AOM_ICDF(24271), - AOM_ICDF(32768), }, - {AOM_ICDF(6461), AOM_ICDF(7119), AOM_ICDF(17940), AOM_ICDF(18368), - AOM_ICDF(32768), }, - {AOM_ICDF(3863), AOM_ICDF(4245), AOM_ICDF(10904), AOM_ICDF(11278), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24334), AOM_ICDF(25912), AOM_ICDF(31795), AOM_ICDF(32120), - AOM_ICDF(32768), }, - {AOM_ICDF(17964), AOM_ICDF(20229), AOM_ICDF(30726), AOM_ICDF(31064), - AOM_ICDF(32768), }, - {AOM_ICDF(10463), AOM_ICDF(11527), AOM_ICDF(25898), AOM_ICDF(26256), - AOM_ICDF(32768), }, - {AOM_ICDF(7431), AOM_ICDF(8071), AOM_ICDF(20542), AOM_ICDF(20928), - AOM_ICDF(32768), }, - {AOM_ICDF(4561), AOM_ICDF(4995), AOM_ICDF(13977), AOM_ICDF(14347), - AOM_ICDF(32768), }, - {AOM_ICDF(2427), AOM_ICDF(2687), AOM_ICDF(8149), 
AOM_ICDF(8409), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25888), AOM_ICDF(27308), AOM_ICDF(31957), AOM_ICDF(32279), - AOM_ICDF(32768), }, - {AOM_ICDF(18868), AOM_ICDF(20992), AOM_ICDF(31092), AOM_ICDF(31424), - AOM_ICDF(32768), }, - {AOM_ICDF(10480), AOM_ICDF(11191), AOM_ICDF(25801), AOM_ICDF(26149), - AOM_ICDF(32768), }, - {AOM_ICDF(6878), AOM_ICDF(7326), AOM_ICDF(19397), AOM_ICDF(19762), - AOM_ICDF(32768), }, - {AOM_ICDF(4235), AOM_ICDF(4601), AOM_ICDF(13182), AOM_ICDF(13587), - AOM_ICDF(32768), }, - {AOM_ICDF(3584), AOM_ICDF(5120), AOM_ICDF(11264), AOM_ICDF(13312), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26802), AOM_ICDF(28181), AOM_ICDF(32031), AOM_ICDF(32349), - AOM_ICDF(32768), }, - {AOM_ICDF(19661), AOM_ICDF(21746), AOM_ICDF(31360), AOM_ICDF(31688), - AOM_ICDF(32768), }, - {AOM_ICDF(10680), AOM_ICDF(11361), AOM_ICDF(26261), AOM_ICDF(26610), - AOM_ICDF(32768), }, - {AOM_ICDF(6811), AOM_ICDF(7274), AOM_ICDF(19689), AOM_ICDF(20075), - AOM_ICDF(32768), }, - {AOM_ICDF(4881), AOM_ICDF(5230), AOM_ICDF(11882), AOM_ICDF(12324), - AOM_ICDF(32768), }, - {AOM_ICDF(4096), AOM_ICDF(6144), AOM_ICDF(9557), AOM_ICDF(11605), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27511), AOM_ICDF(29045), AOM_ICDF(32051), AOM_ICDF(32376), - AOM_ICDF(32768), }, - {AOM_ICDF(19712), AOM_ICDF(22596), AOM_ICDF(31464), AOM_ICDF(31813), - AOM_ICDF(32768), }, - {AOM_ICDF(11035), AOM_ICDF(11852), AOM_ICDF(26626), AOM_ICDF(27082), - AOM_ICDF(32768), }, - {AOM_ICDF(7190), AOM_ICDF(7674), AOM_ICDF(20245), AOM_ICDF(20794), - AOM_ICDF(32768), }, - {AOM_ICDF(5114), AOM_ICDF(5407), AOM_ICDF(12895), AOM_ICDF(13443), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(15522), AOM_ICDF(20696), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(26201), AOM_ICDF(26641), AOM_ICDF(31158), AOM_ICDF(31755), - AOM_ICDF(32200), AOM_ICDF(32768), }, - {AOM_ICDF(19651), AOM_ICDF(20883), AOM_ICDF(28935), AOM_ICDF(30581), - 
AOM_ICDF(31426), AOM_ICDF(32768), }, - {AOM_ICDF(12456), AOM_ICDF(15868), AOM_ICDF(23727), AOM_ICDF(27839), - AOM_ICDF(29216), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(16708), AOM_ICDF(25600), AOM_ICDF(31550), AOM_ICDF(31927), - AOM_ICDF(32768), }, - {AOM_ICDF(14533), AOM_ICDF(24134), AOM_ICDF(31151), AOM_ICDF(31670), - AOM_ICDF(32768), }, - {AOM_ICDF(12771), AOM_ICDF(19041), AOM_ICDF(29256), AOM_ICDF(29926), - AOM_ICDF(32768), }, - {AOM_ICDF(9497), AOM_ICDF(12011), AOM_ICDF(24856), AOM_ICDF(25648), - AOM_ICDF(32768), }, - {AOM_ICDF(6059), AOM_ICDF(6512), AOM_ICDF(17765), AOM_ICDF(18218), - AOM_ICDF(32768), }, - {AOM_ICDF(4498), AOM_ICDF(6425), AOM_ICDF(13493), AOM_ICDF(15420), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(21314), AOM_ICDF(26763), AOM_ICDF(31645), AOM_ICDF(32043), - AOM_ICDF(32768), }, - {AOM_ICDF(16898), AOM_ICDF(23241), AOM_ICDF(31276), AOM_ICDF(31667), - AOM_ICDF(32768), }, - {AOM_ICDF(12339), AOM_ICDF(16091), AOM_ICDF(28493), AOM_ICDF(28851), - AOM_ICDF(32768), }, - {AOM_ICDF(8583), AOM_ICDF(10033), AOM_ICDF(23721), AOM_ICDF(24359), - AOM_ICDF(32768), }, - {AOM_ICDF(6801), AOM_ICDF(7728), AOM_ICDF(18857), AOM_ICDF(19784), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25155), AOM_ICDF(28551), AOM_ICDF(31936), AOM_ICDF(32273), - AOM_ICDF(32768), }, - {AOM_ICDF(18054), AOM_ICDF(22818), AOM_ICDF(31343), AOM_ICDF(31736), - AOM_ICDF(32768), }, - {AOM_ICDF(12381), AOM_ICDF(14088), AOM_ICDF(27865), AOM_ICDF(28300), - AOM_ICDF(32768), }, - {AOM_ICDF(7853), AOM_ICDF(8666), AOM_ICDF(21665), AOM_ICDF(22477), - AOM_ICDF(32768), }, - {AOM_ICDF(6242), AOM_ICDF(10923), AOM_ICDF(15604), AOM_ICDF(20285), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26649), AOM_ICDF(29334), AOM_ICDF(32001), AOM_ICDF(32345), - AOM_ICDF(32768), }, - {AOM_ICDF(18410), 
AOM_ICDF(22788), AOM_ICDF(31465), AOM_ICDF(31842), - AOM_ICDF(32768), }, - {AOM_ICDF(12504), AOM_ICDF(13480), AOM_ICDF(28600), AOM_ICDF(28955), - AOM_ICDF(32768), }, - {AOM_ICDF(9175), AOM_ICDF(10486), AOM_ICDF(21845), AOM_ICDF(23156), - AOM_ICDF(32768), }, - {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27622), AOM_ICDF(30399), AOM_ICDF(32070), AOM_ICDF(32399), - AOM_ICDF(32768), }, - {AOM_ICDF(18214), AOM_ICDF(24797), AOM_ICDF(31688), AOM_ICDF(32070), - AOM_ICDF(32768), }, - {AOM_ICDF(14564), AOM_ICDF(16894), AOM_ICDF(28981), AOM_ICDF(29564), - AOM_ICDF(32768), }, - {AOM_ICDF(7802), AOM_ICDF(12483), AOM_ICDF(17164), AOM_ICDF(21845), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(30040), AOM_ICDF(30464), AOM_ICDF(31682), AOM_ICDF(32091), - AOM_ICDF(32421), AOM_ICDF(32768), }, - {AOM_ICDF(20770), AOM_ICDF(22635), AOM_ICDF(29889), AOM_ICDF(31156), - AOM_ICDF(31909), AOM_ICDF(32768), }, - {AOM_ICDF(9112), AOM_ICDF(13841), AOM_ICDF(23864), AOM_ICDF(27288), - AOM_ICDF(30322), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23477), AOM_ICDF(28240), AOM_ICDF(32035), AOM_ICDF(32360), - AOM_ICDF(32768), }, - {AOM_ICDF(18183), AOM_ICDF(26268), AOM_ICDF(31861), AOM_ICDF(32205), - AOM_ICDF(32768), }, - {AOM_ICDF(14392), AOM_ICDF(23052), AOM_ICDF(30811), AOM_ICDF(31315), - AOM_ICDF(32768), }, - {AOM_ICDF(12579), AOM_ICDF(20081), AOM_ICDF(28411), AOM_ICDF(29467), - AOM_ICDF(32768), }, - {AOM_ICDF(9882), AOM_ICDF(14796), AOM_ICDF(25492), AOM_ICDF(27040), - AOM_ICDF(32768), }, - {AOM_ICDF(11141), AOM_ICDF(13107), AOM_ICDF(21627), AOM_ICDF(23593), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24700), 
AOM_ICDF(28735), AOM_ICDF(32055), AOM_ICDF(32379), - AOM_ICDF(32768), }, - {AOM_ICDF(19703), AOM_ICDF(25203), AOM_ICDF(31809), AOM_ICDF(32142), - AOM_ICDF(32768), }, - {AOM_ICDF(12756), AOM_ICDF(18882), AOM_ICDF(30716), AOM_ICDF(31103), - AOM_ICDF(32768), }, - {AOM_ICDF(9508), AOM_ICDF(13922), AOM_ICDF(25977), AOM_ICDF(26826), - AOM_ICDF(32768), }, - {AOM_ICDF(5243), AOM_ICDF(9175), AOM_ICDF(19661), AOM_ICDF(23593), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26792), AOM_ICDF(29367), AOM_ICDF(32090), AOM_ICDF(32407), - AOM_ICDF(32768), }, - {AOM_ICDF(21899), AOM_ICDF(25640), AOM_ICDF(31870), AOM_ICDF(32192), - AOM_ICDF(32768), }, - {AOM_ICDF(14205), AOM_ICDF(16907), AOM_ICDF(30415), AOM_ICDF(30764), - AOM_ICDF(32768), }, - {AOM_ICDF(10570), AOM_ICDF(13741), AOM_ICDF(23255), AOM_ICDF(26426), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27743), AOM_ICDF(29950), AOM_ICDF(32116), AOM_ICDF(32430), - AOM_ICDF(32768), }, - {AOM_ICDF(21595), AOM_ICDF(24944), AOM_ICDF(31927), AOM_ICDF(32259), - AOM_ICDF(32768), }, - {AOM_ICDF(15227), AOM_ICDF(16673), AOM_ICDF(30744), AOM_ICDF(31130), - AOM_ICDF(32768), }, - {AOM_ICDF(13797), AOM_ICDF(16384), AOM_ICDF(25007), AOM_ICDF(27594), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(28888), AOM_ICDF(30883), AOM_ICDF(32127), AOM_ICDF(32447), - AOM_ICDF(32768), }, - {AOM_ICDF(20978), AOM_ICDF(26121), AOM_ICDF(32090), AOM_ICDF(32406), - AOM_ICDF(32768), }, - {AOM_ICDF(16644), AOM_ICDF(18725), AOM_ICDF(30427), AOM_ICDF(31468), - AOM_ICDF(32768), }, - 
{AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(22938), AOM_ICDF(27853), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 16X16 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(2791), AOM_ICDF(5929), AOM_ICDF(15783), AOM_ICDF(21305), - AOM_ICDF(24756), AOM_ICDF(32768), }, - {AOM_ICDF(2492), AOM_ICDF(5974), AOM_ICDF(11999), AOM_ICDF(17892), - AOM_ICDF(20328), AOM_ICDF(32768), }, - {AOM_ICDF(1232), AOM_ICDF(4784), AOM_ICDF(7266), AOM_ICDF(13409), - AOM_ICDF(14638), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(10984), AOM_ICDF(15590), AOM_ICDF(26386), AOM_ICDF(26860), - AOM_ICDF(32768), }, - {AOM_ICDF(10300), AOM_ICDF(15555), AOM_ICDF(26075), AOM_ICDF(26661), - AOM_ICDF(32768), }, - {AOM_ICDF(9016), AOM_ICDF(12368), AOM_ICDF(23292), AOM_ICDF(24037), - AOM_ICDF(32768), }, - {AOM_ICDF(7432), AOM_ICDF(9010), AOM_ICDF(19640), AOM_ICDF(20245), - AOM_ICDF(32768), }, - {AOM_ICDF(5340), AOM_ICDF(5830), AOM_ICDF(14605), AOM_ICDF(15017), - AOM_ICDF(32768), }, - {AOM_ICDF(3041), AOM_ICDF(3357), AOM_ICDF(8664), AOM_ICDF(8983), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(17487), AOM_ICDF(19944), AOM_ICDF(29422), AOM_ICDF(29785), - AOM_ICDF(32768), }, - {AOM_ICDF(14365), AOM_ICDF(17572), AOM_ICDF(28369), AOM_ICDF(28763), - AOM_ICDF(32768), }, - {AOM_ICDF(10944), AOM_ICDF(12562), AOM_ICDF(24945), AOM_ICDF(25372), - AOM_ICDF(32768), }, - {AOM_ICDF(8061), AOM_ICDF(8670), AOM_ICDF(20179), AOM_ICDF(20570), - AOM_ICDF(32768), }, - {AOM_ICDF(5386), AOM_ICDF(5759), AOM_ICDF(14881), AOM_ICDF(15238), - AOM_ICDF(32768), }, - {AOM_ICDF(3124), AOM_ICDF(3450), AOM_ICDF(9578), AOM_ICDF(9895), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(21610), AOM_ICDF(23212), AOM_ICDF(30674), AOM_ICDF(31007), - AOM_ICDF(32768), }, - {AOM_ICDF(15516), AOM_ICDF(17922), AOM_ICDF(29225), AOM_ICDF(29573), - 
AOM_ICDF(32768), }, - {AOM_ICDF(10431), AOM_ICDF(11308), AOM_ICDF(24594), AOM_ICDF(24955), - AOM_ICDF(32768), }, - {AOM_ICDF(6949), AOM_ICDF(7331), AOM_ICDF(18758), AOM_ICDF(19089), - AOM_ICDF(32768), }, - {AOM_ICDF(4564), AOM_ICDF(4898), AOM_ICDF(12730), AOM_ICDF(13048), - AOM_ICDF(32768), }, - {AOM_ICDF(2435), AOM_ICDF(2739), AOM_ICDF(7406), AOM_ICDF(7710), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24469), AOM_ICDF(25838), AOM_ICDF(31499), AOM_ICDF(31824), - AOM_ICDF(32768), }, - {AOM_ICDF(17238), AOM_ICDF(18899), AOM_ICDF(30066), AOM_ICDF(30395), - AOM_ICDF(32768), }, - {AOM_ICDF(10423), AOM_ICDF(10890), AOM_ICDF(24655), AOM_ICDF(24992), - AOM_ICDF(32768), }, - {AOM_ICDF(6612), AOM_ICDF(6939), AOM_ICDF(18149), AOM_ICDF(18467), - AOM_ICDF(32768), }, - {AOM_ICDF(4122), AOM_ICDF(4431), AOM_ICDF(12556), AOM_ICDF(12874), - AOM_ICDF(32768), }, - {AOM_ICDF(1910), AOM_ICDF(2211), AOM_ICDF(7840), AOM_ICDF(8142), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27205), AOM_ICDF(28145), AOM_ICDF(31900), AOM_ICDF(32218), - AOM_ICDF(32768), }, - {AOM_ICDF(18503), AOM_ICDF(19729), AOM_ICDF(30590), AOM_ICDF(30916), - AOM_ICDF(32768), }, - {AOM_ICDF(10343), AOM_ICDF(10734), AOM_ICDF(24636), AOM_ICDF(24963), - AOM_ICDF(32768), }, - {AOM_ICDF(6629), AOM_ICDF(6955), AOM_ICDF(18492), AOM_ICDF(18810), - AOM_ICDF(32768), }, - {AOM_ICDF(4131), AOM_ICDF(4437), AOM_ICDF(13086), AOM_ICDF(13392), - AOM_ICDF(32768), }, - {AOM_ICDF(4005), AOM_ICDF(5097), AOM_ICDF(9102), AOM_ICDF(10194), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(1286), AOM_ICDF(10273), AOM_ICDF(21021), AOM_ICDF(28617), - AOM_ICDF(29729), AOM_ICDF(32768), }, - {AOM_ICDF(941), AOM_ICDF(10009), AOM_ICDF(17718), AOM_ICDF(25847), - AOM_ICDF(27712), AOM_ICDF(32768), }, - {AOM_ICDF(508), AOM_ICDF(9488), AOM_ICDF(12907), AOM_ICDF(21634), - AOM_ICDF(23969), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23900), AOM_ICDF(25135), AOM_ICDF(31528), AOM_ICDF(31861), - AOM_ICDF(32768), }, - 
{AOM_ICDF(18613), AOM_ICDF(22015), AOM_ICDF(30774), AOM_ICDF(31124), - AOM_ICDF(32768), }, - {AOM_ICDF(13064), AOM_ICDF(16135), AOM_ICDF(28060), AOM_ICDF(28484), - AOM_ICDF(32768), }, - {AOM_ICDF(10563), AOM_ICDF(12428), AOM_ICDF(24847), AOM_ICDF(25281), - AOM_ICDF(32768), }, - {AOM_ICDF(7960), AOM_ICDF(9069), AOM_ICDF(20548), AOM_ICDF(21017), - AOM_ICDF(32768), }, - {AOM_ICDF(6944), AOM_ICDF(7491), AOM_ICDF(16595), AOM_ICDF(17007), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24972), AOM_ICDF(26434), AOM_ICDF(31771), AOM_ICDF(32097), - AOM_ICDF(32768), }, - {AOM_ICDF(18362), AOM_ICDF(20757), AOM_ICDF(30733), AOM_ICDF(31070), - AOM_ICDF(32768), }, - {AOM_ICDF(11226), AOM_ICDF(12487), AOM_ICDF(26292), AOM_ICDF(26651), - AOM_ICDF(32768), }, - {AOM_ICDF(7823), AOM_ICDF(8448), AOM_ICDF(20940), AOM_ICDF(21314), - AOM_ICDF(32768), }, - {AOM_ICDF(4964), AOM_ICDF(5365), AOM_ICDF(14104), AOM_ICDF(14457), - AOM_ICDF(32768), }, - {AOM_ICDF(2435), AOM_ICDF(2712), AOM_ICDF(8247), AOM_ICDF(8524), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26551), AOM_ICDF(27694), AOM_ICDF(31943), AOM_ICDF(32261), - AOM_ICDF(32768), }, - {AOM_ICDF(19519), AOM_ICDF(21452), AOM_ICDF(31120), AOM_ICDF(31446), - AOM_ICDF(32768), }, - {AOM_ICDF(11272), AOM_ICDF(11965), AOM_ICDF(26389), AOM_ICDF(26736), - AOM_ICDF(32768), }, - {AOM_ICDF(7109), AOM_ICDF(7485), AOM_ICDF(19585), AOM_ICDF(19920), - AOM_ICDF(32768), }, - {AOM_ICDF(4033), AOM_ICDF(4370), AOM_ICDF(12546), AOM_ICDF(12865), - AOM_ICDF(32768), }, - {AOM_ICDF(1570), AOM_ICDF(2158), AOM_ICDF(7456), AOM_ICDF(8045), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27654), AOM_ICDF(28637), AOM_ICDF(32030), AOM_ICDF(32345), - AOM_ICDF(32768), }, - {AOM_ICDF(20795), AOM_ICDF(22232), AOM_ICDF(31351), AOM_ICDF(31672), - AOM_ICDF(32768), }, - {AOM_ICDF(10841), AOM_ICDF(11329), AOM_ICDF(25676), AOM_ICDF(26002), - AOM_ICDF(32768), }, - {AOM_ICDF(6589), AOM_ICDF(6943), AOM_ICDF(18084), AOM_ICDF(18412), - AOM_ICDF(32768), }, - 
{AOM_ICDF(3970), AOM_ICDF(4279), AOM_ICDF(12009), AOM_ICDF(12318), - AOM_ICDF(32768), }, - {AOM_ICDF(3449), AOM_ICDF(3967), AOM_ICDF(7761), AOM_ICDF(8278), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29545), AOM_ICDF(30314), AOM_ICDF(32084), AOM_ICDF(32404), - AOM_ICDF(32768), }, - {AOM_ICDF(21229), AOM_ICDF(22783), AOM_ICDF(31470), AOM_ICDF(31800), - AOM_ICDF(32768), }, - {AOM_ICDF(10409), AOM_ICDF(11031), AOM_ICDF(25267), AOM_ICDF(25669), - AOM_ICDF(32768), }, - {AOM_ICDF(6456), AOM_ICDF(6909), AOM_ICDF(18270), AOM_ICDF(18674), - AOM_ICDF(32768), }, - {AOM_ICDF(4253), AOM_ICDF(5017), AOM_ICDF(13288), AOM_ICDF(13706), - AOM_ICDF(32768), }, - {AOM_ICDF(1627), AOM_ICDF(2324), AOM_ICDF(8831), AOM_ICDF(9528), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(24627), AOM_ICDF(25102), AOM_ICDF(30943), AOM_ICDF(31607), - AOM_ICDF(32215), AOM_ICDF(32768), }, - {AOM_ICDF(17408), AOM_ICDF(18757), AOM_ICDF(28256), AOM_ICDF(30111), - AOM_ICDF(31225), AOM_ICDF(32768), }, - {AOM_ICDF(10984), AOM_ICDF(14293), AOM_ICDF(22894), AOM_ICDF(27503), - AOM_ICDF(28853), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(16390), AOM_ICDF(25826), AOM_ICDF(31293), AOM_ICDF(31726), - AOM_ICDF(32768), }, - {AOM_ICDF(14074), AOM_ICDF(25147), AOM_ICDF(31045), AOM_ICDF(31638), - AOM_ICDF(32768), }, - {AOM_ICDF(13598), AOM_ICDF(20524), AOM_ICDF(28818), AOM_ICDF(29894), - AOM_ICDF(32768), }, - {AOM_ICDF(10035), AOM_ICDF(13322), AOM_ICDF(25086), AOM_ICDF(26332), - AOM_ICDF(32768), }, - {AOM_ICDF(7156), AOM_ICDF(8035), AOM_ICDF(18456), AOM_ICDF(19334), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(10923), AOM_ICDF(19115), AOM_ICDF(21845), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22787), AOM_ICDF(27489), AOM_ICDF(31676), AOM_ICDF(32026), - AOM_ICDF(32768), }, - {AOM_ICDF(17518), AOM_ICDF(23800), AOM_ICDF(31204), AOM_ICDF(31578), - AOM_ICDF(32768), }, - {AOM_ICDF(10686), AOM_ICDF(15226), AOM_ICDF(28087), AOM_ICDF(28560), - 
AOM_ICDF(32768), }, - {AOM_ICDF(9612), AOM_ICDF(11942), AOM_ICDF(22574), AOM_ICDF(23010), - AOM_ICDF(32768), }, - {AOM_ICDF(6437), AOM_ICDF(8192), AOM_ICDF(18139), AOM_ICDF(19895), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26773), AOM_ICDF(28429), AOM_ICDF(31782), AOM_ICDF(32120), - AOM_ICDF(32768), }, - {AOM_ICDF(18449), AOM_ICDF(22329), AOM_ICDF(30991), AOM_ICDF(31329), - AOM_ICDF(32768), }, - {AOM_ICDF(12861), AOM_ICDF(14182), AOM_ICDF(27130), AOM_ICDF(27395), - AOM_ICDF(32768), }, - {AOM_ICDF(4681), AOM_ICDF(6554), AOM_ICDF(22469), AOM_ICDF(23874), - AOM_ICDF(32768), }, - {AOM_ICDF(8623), AOM_ICDF(13797), AOM_ICDF(22420), AOM_ICDF(27594), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28378), AOM_ICDF(29466), AOM_ICDF(31934), AOM_ICDF(32245), - AOM_ICDF(32768), }, - {AOM_ICDF(19880), AOM_ICDF(21733), AOM_ICDF(31206), AOM_ICDF(31550), - AOM_ICDF(32768), }, - {AOM_ICDF(12173), AOM_ICDF(13245), AOM_ICDF(27638), AOM_ICDF(27945), - AOM_ICDF(32768), }, - {AOM_ICDF(6215), AOM_ICDF(7910), AOM_ICDF(19774), AOM_ICDF(21469), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30333), AOM_ICDF(31015), AOM_ICDF(32078), AOM_ICDF(32401), - AOM_ICDF(32768), }, - {AOM_ICDF(19277), AOM_ICDF(21376), AOM_ICDF(31072), AOM_ICDF(31407), - AOM_ICDF(32768), }, - {AOM_ICDF(12978), AOM_ICDF(13724), AOM_ICDF(28144), AOM_ICDF(28442), - AOM_ICDF(32768), }, - {AOM_ICDF(10031), AOM_ICDF(12037), AOM_ICDF(25412), AOM_ICDF(27418), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(29777), AOM_ICDF(30229), AOM_ICDF(31726), AOM_ICDF(32104), - AOM_ICDF(32440), AOM_ICDF(32768), }, - {AOM_ICDF(18551), AOM_ICDF(20755), AOM_ICDF(29778), AOM_ICDF(30685), - AOM_ICDF(31935), AOM_ICDF(32768), }, - {AOM_ICDF(6236), AOM_ICDF(13170), AOM_ICDF(24037), AOM_ICDF(25823), - AOM_ICDF(30798), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(28890), AOM_ICDF(30863), AOM_ICDF(32128), AOM_ICDF(32440), - AOM_ICDF(32768), }, - {AOM_ICDF(17311), AOM_ICDF(27082), AOM_ICDF(31871), AOM_ICDF(32209), - AOM_ICDF(32768), }, - {AOM_ICDF(13447), AOM_ICDF(25217), AOM_ICDF(31158), AOM_ICDF(31793), - AOM_ICDF(32768), }, - {AOM_ICDF(11906), AOM_ICDF(20177), AOM_ICDF(29976), AOM_ICDF(30713), - AOM_ICDF(32768), }, - {AOM_ICDF(14883), AOM_ICDF(17134), AOM_ICDF(27140), AOM_ICDF(28266), - AOM_ICDF(32768), }, - {AOM_ICDF(14959), AOM_ICDF(17096), AOM_ICDF(22795), AOM_ICDF(25645), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(29494), AOM_ICDF(30807), AOM_ICDF(32086), AOM_ICDF(32404), - AOM_ICDF(32768), }, - {AOM_ICDF(19860), AOM_ICDF(25179), AOM_ICDF(31857), AOM_ICDF(32190), - AOM_ICDF(32768), }, - {AOM_ICDF(13936), AOM_ICDF(19209), AOM_ICDF(30508), AOM_ICDF(31073), - AOM_ICDF(32768), }, - {AOM_ICDF(7168), AOM_ICDF(10240), AOM_ICDF(24576), AOM_ICDF(27648), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(30496), AOM_ICDF(31243), AOM_ICDF(32121), AOM_ICDF(32433), - AOM_ICDF(32768), }, - {AOM_ICDF(21369), AOM_ICDF(24262), AOM_ICDF(31827), AOM_ICDF(32158), - AOM_ICDF(32768), }, - {AOM_ICDF(18971), AOM_ICDF(21127), AOM_ICDF(29319), AOM_ICDF(30612), - AOM_ICDF(32768), }, - {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(30922), AOM_ICDF(31459), AOM_ICDF(32136), AOM_ICDF(32449), - AOM_ICDF(32768), }, - {AOM_ICDF(22640), AOM_ICDF(24782), AOM_ICDF(31768), AOM_ICDF(32076), - AOM_ICDF(32768), }, - {AOM_ICDF(12955), AOM_ICDF(14860), AOM_ICDF(28958), AOM_ICDF(30101), - AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(21845), AOM_ICDF(27307), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30469), AOM_ICDF(31279), AOM_ICDF(32115), AOM_ICDF(32446), - AOM_ICDF(32768), }, - {AOM_ICDF(19748), AOM_ICDF(24367), AOM_ICDF(31900), AOM_ICDF(32257), - AOM_ICDF(32768), }, - {AOM_ICDF(12684), AOM_ICDF(16120), AOM_ICDF(30125), AOM_ICDF(30918), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 32X32 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(8402), AOM_ICDF(9860), AOM_ICDF(23425), AOM_ICDF(26798), - AOM_ICDF(28753), AOM_ICDF(32768), }, - {AOM_ICDF(4503), AOM_ICDF(7478), AOM_ICDF(14541), AOM_ICDF(19455), - AOM_ICDF(21058), AOM_ICDF(32768), }, - {AOM_ICDF(1404), AOM_ICDF(4914), AOM_ICDF(7456), AOM_ICDF(13239), - AOM_ICDF(14005), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(11786), AOM_ICDF(17804), AOM_ICDF(26686), AOM_ICDF(27285), - AOM_ICDF(32768), }, - {AOM_ICDF(10456), AOM_ICDF(16685), AOM_ICDF(26272), AOM_ICDF(27135), - AOM_ICDF(32768), }, - {AOM_ICDF(8297), AOM_ICDF(12591), AOM_ICDF(23088), AOM_ICDF(24288), - AOM_ICDF(32768), }, - {AOM_ICDF(6320), AOM_ICDF(8297), 
AOM_ICDF(18902), AOM_ICDF(20112), - AOM_ICDF(32768), }, - {AOM_ICDF(4385), AOM_ICDF(4892), AOM_ICDF(12779), AOM_ICDF(13476), - AOM_ICDF(32768), }, - {AOM_ICDF(2151), AOM_ICDF(2470), AOM_ICDF(6432), AOM_ICDF(6758), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(17988), AOM_ICDF(21025), AOM_ICDF(29658), AOM_ICDF(30075), - AOM_ICDF(32768), }, - {AOM_ICDF(14641), AOM_ICDF(18188), AOM_ICDF(28759), AOM_ICDF(29202), - AOM_ICDF(32768), }, - {AOM_ICDF(10951), AOM_ICDF(12924), AOM_ICDF(25087), AOM_ICDF(25515), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(9165), AOM_ICDF(20302), AOM_ICDF(20696), - AOM_ICDF(32768), }, - {AOM_ICDF(5213), AOM_ICDF(5567), AOM_ICDF(14740), AOM_ICDF(15114), - AOM_ICDF(32768), }, - {AOM_ICDF(2785), AOM_ICDF(3096), AOM_ICDF(8153), AOM_ICDF(8465), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22839), AOM_ICDF(24625), AOM_ICDF(31013), AOM_ICDF(31343), - AOM_ICDF(32768), }, - {AOM_ICDF(16111), AOM_ICDF(18689), AOM_ICDF(29552), AOM_ICDF(29896), - AOM_ICDF(32768), }, - {AOM_ICDF(10736), AOM_ICDF(11502), AOM_ICDF(24493), AOM_ICDF(24827), - AOM_ICDF(32768), }, - {AOM_ICDF(7153), AOM_ICDF(7570), AOM_ICDF(18744), AOM_ICDF(19067), - AOM_ICDF(32768), }, - {AOM_ICDF(4285), AOM_ICDF(4591), AOM_ICDF(11651), AOM_ICDF(11957), - AOM_ICDF(32768), }, - {AOM_ICDF(2064), AOM_ICDF(2322), AOM_ICDF(6321), AOM_ICDF(6579), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24955), AOM_ICDF(26499), AOM_ICDF(31625), AOM_ICDF(31948), - AOM_ICDF(32768), }, - {AOM_ICDF(17242), AOM_ICDF(19354), AOM_ICDF(30096), AOM_ICDF(30432), - AOM_ICDF(32768), }, - {AOM_ICDF(10470), AOM_ICDF(11049), AOM_ICDF(24405), AOM_ICDF(24742), - AOM_ICDF(32768), }, - {AOM_ICDF(6717), AOM_ICDF(7038), AOM_ICDF(17553), AOM_ICDF(17870), - AOM_ICDF(32768), }, - {AOM_ICDF(4030), AOM_ICDF(4342), AOM_ICDF(11280), AOM_ICDF(11592), - AOM_ICDF(32768), }, - {AOM_ICDF(2060), AOM_ICDF(2355), AOM_ICDF(6966), AOM_ICDF(7260), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29697), AOM_ICDF(30286), 
AOM_ICDF(32009), AOM_ICDF(32325), - AOM_ICDF(32768), }, - {AOM_ICDF(18629), AOM_ICDF(19720), AOM_ICDF(30251), AOM_ICDF(30574), - AOM_ICDF(32768), }, - {AOM_ICDF(9459), AOM_ICDF(9826), AOM_ICDF(22948), AOM_ICDF(23264), - AOM_ICDF(32768), }, - {AOM_ICDF(5742), AOM_ICDF(6057), AOM_ICDF(16269), AOM_ICDF(16580), - AOM_ICDF(32768), }, - {AOM_ICDF(3696), AOM_ICDF(4006), AOM_ICDF(11276), AOM_ICDF(11586), - AOM_ICDF(32768), }, - {AOM_ICDF(2359), AOM_ICDF(2614), AOM_ICDF(5801), AOM_ICDF(6056), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(14224), AOM_ICDF(15827), AOM_ICDF(27984), AOM_ICDF(30263), - AOM_ICDF(31458), AOM_ICDF(32768), }, - {AOM_ICDF(4253), AOM_ICDF(7150), AOM_ICDF(20729), AOM_ICDF(24629), - AOM_ICDF(28621), AOM_ICDF(32768), }, - {AOM_ICDF(1405), AOM_ICDF(5159), AOM_ICDF(12422), AOM_ICDF(17006), - AOM_ICDF(24088), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(20029), AOM_ICDF(23525), AOM_ICDF(30941), AOM_ICDF(31369), - AOM_ICDF(32768), }, - {AOM_ICDF(15691), AOM_ICDF(22792), AOM_ICDF(30520), AOM_ICDF(30960), - AOM_ICDF(32768), }, - {AOM_ICDF(12036), AOM_ICDF(18829), AOM_ICDF(28256), AOM_ICDF(29025), - AOM_ICDF(32768), }, - {AOM_ICDF(10881), AOM_ICDF(14586), AOM_ICDF(25416), AOM_ICDF(26318), - AOM_ICDF(32768), }, - {AOM_ICDF(11249), AOM_ICDF(13311), AOM_ICDF(23713), AOM_ICDF(24498), - AOM_ICDF(32768), }, - {AOM_ICDF(9444), AOM_ICDF(10609), AOM_ICDF(20170), AOM_ICDF(21025), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23805), AOM_ICDF(26370), AOM_ICDF(31579), AOM_ICDF(31927), - AOM_ICDF(32768), }, - {AOM_ICDF(16685), AOM_ICDF(21243), AOM_ICDF(30526), AOM_ICDF(30890), - AOM_ICDF(32768), }, - {AOM_ICDF(11661), AOM_ICDF(14143), AOM_ICDF(26804), AOM_ICDF(27193), - AOM_ICDF(32768), }, - {AOM_ICDF(8321), AOM_ICDF(9593), AOM_ICDF(21814), AOM_ICDF(22228), - AOM_ICDF(32768), }, - {AOM_ICDF(6243), AOM_ICDF(6820), AOM_ICDF(16151), AOM_ICDF(16506), - AOM_ICDF(32768), }, - {AOM_ICDF(3612), AOM_ICDF(4386), AOM_ICDF(9547), AOM_ICDF(10321), - 
AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26022), AOM_ICDF(27534), AOM_ICDF(31845), AOM_ICDF(32167), - AOM_ICDF(32768), }, - {AOM_ICDF(18692), AOM_ICDF(21351), AOM_ICDF(30871), AOM_ICDF(31203), - AOM_ICDF(32768), }, - {AOM_ICDF(11493), AOM_ICDF(12410), AOM_ICDF(26280), AOM_ICDF(26619), - AOM_ICDF(32768), }, - {AOM_ICDF(7099), AOM_ICDF(7581), AOM_ICDF(19315), AOM_ICDF(19619), - AOM_ICDF(32768), }, - {AOM_ICDF(3329), AOM_ICDF(3623), AOM_ICDF(10868), AOM_ICDF(11162), - AOM_ICDF(32768), }, - {AOM_ICDF(3104), AOM_ICDF(4139), AOM_ICDF(10003), AOM_ICDF(11038), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28126), AOM_ICDF(29216), AOM_ICDF(32027), AOM_ICDF(32345), - AOM_ICDF(32768), }, - {AOM_ICDF(19828), AOM_ICDF(22063), AOM_ICDF(31140), AOM_ICDF(31465), - AOM_ICDF(32768), }, - {AOM_ICDF(11206), AOM_ICDF(11832), AOM_ICDF(25718), AOM_ICDF(26041), - AOM_ICDF(32768), }, - {AOM_ICDF(6496), AOM_ICDF(6825), AOM_ICDF(18069), AOM_ICDF(18408), - AOM_ICDF(32768), }, - {AOM_ICDF(4600), AOM_ICDF(4904), AOM_ICDF(12431), AOM_ICDF(12735), - AOM_ICDF(32768), }, - {AOM_ICDF(2016), AOM_ICDF(3529), AOM_ICDF(8066), AOM_ICDF(9578), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30246), AOM_ICDF(30814), AOM_ICDF(32096), AOM_ICDF(32411), - AOM_ICDF(32768), }, - {AOM_ICDF(21165), AOM_ICDF(22238), AOM_ICDF(31122), AOM_ICDF(31445), - AOM_ICDF(32768), }, - {AOM_ICDF(10123), AOM_ICDF(10519), AOM_ICDF(24102), AOM_ICDF(24419), - AOM_ICDF(32768), }, - {AOM_ICDF(5968), AOM_ICDF(6277), AOM_ICDF(17606), AOM_ICDF(17924), - AOM_ICDF(32768), }, - {AOM_ICDF(4312), AOM_ICDF(4620), AOM_ICDF(12131), AOM_ICDF(12439), - AOM_ICDF(32768), }, - {AOM_ICDF(4608), AOM_ICDF(6144), AOM_ICDF(9216), AOM_ICDF(10752), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(22808), AOM_ICDF(23508), AOM_ICDF(29956), AOM_ICDF(30649), - AOM_ICDF(31698), AOM_ICDF(32768), }, - {AOM_ICDF(11001), AOM_ICDF(12792), AOM_ICDF(25018), AOM_ICDF(27680), - AOM_ICDF(29623), 
AOM_ICDF(32768), }, - {AOM_ICDF(6919), AOM_ICDF(10026), AOM_ICDF(19635), AOM_ICDF(24728), - AOM_ICDF(26490), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(12861), AOM_ICDF(25068), AOM_ICDF(30802), AOM_ICDF(31375), - AOM_ICDF(32768), }, - {AOM_ICDF(11298), AOM_ICDF(21545), AOM_ICDF(29953), AOM_ICDF(30816), - AOM_ICDF(32768), }, - {AOM_ICDF(13053), AOM_ICDF(24270), AOM_ICDF(28485), AOM_ICDF(29845), - AOM_ICDF(32768), }, - {AOM_ICDF(7710), AOM_ICDF(15059), AOM_ICDF(26383), AOM_ICDF(28431), - AOM_ICDF(32768), }, - {AOM_ICDF(8856), AOM_ICDF(10332), AOM_ICDF(18008), AOM_ICDF(19779), - AOM_ICDF(32768), }, - {AOM_ICDF(3855), AOM_ICDF(7710), AOM_ICDF(19275), AOM_ICDF(22167), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(19458), AOM_ICDF(25796), AOM_ICDF(31754), AOM_ICDF(32007), - AOM_ICDF(32768), }, - {AOM_ICDF(16458), AOM_ICDF(23827), AOM_ICDF(31294), AOM_ICDF(31638), - AOM_ICDF(32768), }, - {AOM_ICDF(16274), AOM_ICDF(18913), AOM_ICDF(28150), AOM_ICDF(29029), - AOM_ICDF(32768), }, - {AOM_ICDF(12429), AOM_ICDF(15254), AOM_ICDF(24858), AOM_ICDF(26553), - AOM_ICDF(32768), }, - {AOM_ICDF(7399), AOM_ICDF(11627), AOM_ICDF(21141), AOM_ICDF(24312), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(17348), AOM_ICDF(23130), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25493), AOM_ICDF(28975), AOM_ICDF(31960), AOM_ICDF(32271), - AOM_ICDF(32768), }, - {AOM_ICDF(16904), AOM_ICDF(21759), AOM_ICDF(31381), AOM_ICDF(31728), - AOM_ICDF(32768), }, - {AOM_ICDF(9709), AOM_ICDF(11529), AOM_ICDF(24879), AOM_ICDF(26700), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(13107), AOM_ICDF(22938), AOM_ICDF(27853), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(20025), AOM_ICDF(25486), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26127), AOM_ICDF(28926), AOM_ICDF(31725), AOM_ICDF(32274), - AOM_ICDF(32768), }, - {AOM_ICDF(17673), AOM_ICDF(25036), 
AOM_ICDF(31940), AOM_ICDF(32216), - AOM_ICDF(32768), }, - {AOM_ICDF(14824), AOM_ICDF(17164), AOM_ICDF(26526), AOM_ICDF(28867), - AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(16384), AOM_ICDF(21845), AOM_ICDF(27307), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30683), AOM_ICDF(31149), AOM_ICDF(32155), AOM_ICDF(32449), - AOM_ICDF(32768), }, - {AOM_ICDF(17896), AOM_ICDF(22055), AOM_ICDF(31508), AOM_ICDF(31886), - AOM_ICDF(32768), }, - {AOM_ICDF(8548), AOM_ICDF(12822), AOM_ICDF(24220), AOM_ICDF(28494), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(27393), AOM_ICDF(28900), AOM_ICDF(31555), AOM_ICDF(31971), - AOM_ICDF(32368), AOM_ICDF(32768), }, - {AOM_ICDF(8379), AOM_ICDF(19364), AOM_ICDF(27675), AOM_ICDF(28688), - AOM_ICDF(31114), AOM_ICDF(32768), }, - {AOM_ICDF(1955), AOM_ICDF(19256), AOM_ICDF(24580), AOM_ICDF(25370), - AOM_ICDF(30257), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(31085), AOM_ICDF(31718), AOM_ICDF(32129), AOM_ICDF(32443), - AOM_ICDF(32768), }, - {AOM_ICDF(14336), AOM_ICDF(26852), AOM_ICDF(31370), AOM_ICDF(31760), - AOM_ICDF(32768), }, - {AOM_ICDF(11751), AOM_ICDF(23544), AOM_ICDF(28851), AOM_ICDF(29567), - AOM_ICDF(32768), }, - {AOM_ICDF(14670), AOM_ICDF(21251), AOM_ICDF(28381), AOM_ICDF(29752), - AOM_ICDF(32768), }, - {AOM_ICDF(14832), AOM_ICDF(19316), AOM_ICDF(27134), AOM_ICDF(28974), - AOM_ICDF(32768), }, - {AOM_ICDF(13312), AOM_ICDF(15360), AOM_ICDF(25600), AOM_ICDF(27648), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(31302), AOM_ICDF(31746), 
AOM_ICDF(32144), AOM_ICDF(32455), - AOM_ICDF(32768), }, - {AOM_ICDF(18343), AOM_ICDF(26723), AOM_ICDF(32018), AOM_ICDF(32434), - AOM_ICDF(32768), }, - {AOM_ICDF(10570), AOM_ICDF(16913), AOM_ICDF(29068), AOM_ICDF(30125), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(13797), AOM_ICDF(24145), AOM_ICDF(26732), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(31420), AOM_ICDF(31795), AOM_ICDF(32144), AOM_ICDF(32455), - AOM_ICDF(32768), }, - {AOM_ICDF(21510), AOM_ICDF(28245), AOM_ICDF(32064), AOM_ICDF(32366), - AOM_ICDF(32768), }, - {AOM_ICDF(6342), AOM_ICDF(11627), AOM_ICDF(25369), AOM_ICDF(28540), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(31470), AOM_ICDF(31806), AOM_ICDF(32143), AOM_ICDF(32455), - AOM_ICDF(32768), }, - {AOM_ICDF(19571), AOM_ICDF(25722), AOM_ICDF(31538), AOM_ICDF(31985), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(8738), AOM_ICDF(25122), AOM_ICDF(28399), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(31292), AOM_ICDF(31637), AOM_ICDF(32104), AOM_ICDF(32431), - AOM_ICDF(32768), }, - {AOM_ICDF(12774), AOM_ICDF(16652), AOM_ICDF(30002), AOM_ICDF(30986), - AOM_ICDF(32768), }, - {AOM_ICDF(4652), AOM_ICDF(11442), AOM_ICDF(30231), AOM_ICDF(30593), - AOM_ICDF(32768), }, - {AOM_ICDF(7022), AOM_ICDF(10031), 
AOM_ICDF(28087), AOM_ICDF(29090), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, -}; +static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)] = { + { { { { AOM_CDF4(14298, 20718, 24174) }, + { AOM_CDF4(12536, 19601, 23789) }, + { AOM_CDF4(8712, 15051, 19503) }, + { AOM_CDF4(6170, 11327, 15434) }, + { AOM_CDF4(4742, 8926, 12538) }, + { AOM_CDF4(3803, 7317, 10546) }, + { AOM_CDF4(1696, 3317, 4871) }, + { AOM_CDF4(14392, 19951, 22756) }, + { AOM_CDF4(15978, 23218, 26818) }, + { AOM_CDF4(12187, 19474, 23889) }, + { AOM_CDF4(9176, 15640, 20259) }, + { AOM_CDF4(7068, 12655, 17028) }, + { AOM_CDF4(5656, 10442, 14472) }, + { AOM_CDF4(2580, 4992, 7244) }, + { AOM_CDF4(12136, 18049, 21426) }, + { AOM_CDF4(13784, 20721, 24481) }, + { AOM_CDF4(10836, 17621, 21900) }, + { AOM_CDF4(8372, 14444, 18847) }, + { AOM_CDF4(6523, 11779, 16000) }, + { AOM_CDF4(5337, 9898, 13760) }, + { AOM_CDF4(3034, 5860, 8462) } }, + { { AOM_CDF4(15967, 22905, 26286) }, + { AOM_CDF4(13534, 20654, 24579) }, + { AOM_CDF4(9504, 16092, 20535) }, + { AOM_CDF4(6975, 12568, 16903) }, + { AOM_CDF4(5364, 10091, 14020) }, + { AOM_CDF4(4357, 8370, 11857) }, + { AOM_CDF4(2506, 4934, 7218) }, + { AOM_CDF4(23032, 28815, 30936) }, + { AOM_CDF4(19540, 26704, 29719) }, + { AOM_CDF4(15158, 22969, 27097) }, + { AOM_CDF4(11408, 18865, 23650) }, + { AOM_CDF4(8885, 15448, 20250) }, + { AOM_CDF4(7108, 12853, 17416) }, + { AOM_CDF4(4231, 8041, 11480) }, + { AOM_CDF4(19823, 26490, 29156) }, + { AOM_CDF4(18890, 25929, 28932) }, + { AOM_CDF4(15660, 23491, 27433) }, + { AOM_CDF4(12147, 19776, 24488) }, + { AOM_CDF4(9728, 16774, 21649) }, + { AOM_CDF4(7919, 14277, 19066) }, + { AOM_CDF4(5440, 10170, 14185) } } }, + { { { AOM_CDF4(14406, 20862, 24414) }, + { 
AOM_CDF4(11824, 18907, 23109) }, + { AOM_CDF4(8257, 14393, 18803) }, + { AOM_CDF4(5860, 10747, 14778) }, + { AOM_CDF4(4475, 8486, 11984) }, + { AOM_CDF4(3606, 6954, 10043) }, + { AOM_CDF4(1736, 3410, 5048) }, + { AOM_CDF4(14430, 20046, 22882) }, + { AOM_CDF4(15593, 22899, 26709) }, + { AOM_CDF4(12102, 19368, 23811) }, + { AOM_CDF4(9059, 15584, 20262) }, + { AOM_CDF4(6999, 12603, 17048) }, + { AOM_CDF4(5684, 10497, 14553) }, + { AOM_CDF4(2822, 5438, 7862) }, + { AOM_CDF4(15785, 21585, 24359) }, + { AOM_CDF4(18347, 25229, 28266) }, + { AOM_CDF4(14974, 22487, 26389) }, + { AOM_CDF4(11423, 18681, 23271) }, + { AOM_CDF4(8863, 15350, 20008) }, + { AOM_CDF4(7153, 12852, 17278) }, + { AOM_CDF4(3707, 7036, 9982) } }, + { { AOM_CDF4(15460, 21696, 25469) }, + { AOM_CDF4(12170, 19249, 23191) }, + { AOM_CDF4(8723, 15027, 19332) }, + { AOM_CDF4(6428, 11704, 15874) }, + { AOM_CDF4(4922, 9292, 13052) }, + { AOM_CDF4(4139, 7695, 11010) }, + { AOM_CDF4(2291, 4508, 6598) }, + { AOM_CDF4(19856, 26920, 29828) }, + { AOM_CDF4(17923, 25289, 28792) }, + { AOM_CDF4(14278, 21968, 26297) }, + { AOM_CDF4(10910, 18136, 22950) }, + { AOM_CDF4(8423, 14815, 19627) }, + { AOM_CDF4(6771, 12283, 16774) }, + { AOM_CDF4(4074, 7750, 11081) }, + { AOM_CDF4(19852, 26074, 28672) }, + { AOM_CDF4(19371, 26110, 28989) }, + { AOM_CDF4(16265, 23873, 27663) }, + { AOM_CDF4(12758, 20378, 24952) }, + { AOM_CDF4(10095, 17098, 21961) }, + { AOM_CDF4(8250, 14628, 19451) }, + { AOM_CDF4(5205, 9745, 13622) } } }, + { { { AOM_CDF4(10563, 16233, 19763) }, + { AOM_CDF4(9794, 16022, 19804) }, + { AOM_CDF4(6750, 11945, 15759) }, + { AOM_CDF4(4963, 9186, 12752) }, + { AOM_CDF4(3845, 7435, 10627) }, + { AOM_CDF4(3051, 6085, 8834) }, + { AOM_CDF4(1311, 2596, 3830) }, + { AOM_CDF4(11246, 16404, 19689) }, + { AOM_CDF4(12315, 18911, 22731) }, + { AOM_CDF4(10557, 17095, 21289) }, + { AOM_CDF4(8136, 14006, 18249) }, + { AOM_CDF4(6348, 11474, 15565) }, + { AOM_CDF4(5196, 9655, 13400) }, + { AOM_CDF4(2349, 4526, 6587) }, + { 
AOM_CDF4(13337, 18730, 21569) }, + { AOM_CDF4(19306, 26071, 28882) }, + { AOM_CDF4(15952, 23540, 27254) }, + { AOM_CDF4(12409, 19934, 24430) }, + { AOM_CDF4(9760, 16706, 21389) }, + { AOM_CDF4(8004, 14220, 18818) }, + { AOM_CDF4(4138, 7794, 10961) } }, + { { AOM_CDF4(10870, 16684, 20949) }, + { AOM_CDF4(9664, 15230, 18680) }, + { AOM_CDF4(6886, 12109, 15408) }, + { AOM_CDF4(4825, 8900, 12305) }, + { AOM_CDF4(3630, 7162, 10314) }, + { AOM_CDF4(3036, 6429, 9387) }, + { AOM_CDF4(1671, 3296, 4940) }, + { AOM_CDF4(13819, 19159, 23026) }, + { AOM_CDF4(11984, 19108, 23120) }, + { AOM_CDF4(10690, 17210, 21663) }, + { AOM_CDF4(7984, 14154, 18333) }, + { AOM_CDF4(6868, 12294, 16124) }, + { AOM_CDF4(5274, 8994, 12868) }, + { AOM_CDF4(2988, 5771, 8424) }, + { AOM_CDF4(19736, 26647, 29141) }, + { AOM_CDF4(18933, 26070, 28984) }, + { AOM_CDF4(15779, 23048, 27200) }, + { AOM_CDF4(12638, 20061, 24532) }, + { AOM_CDF4(10692, 17545, 22220) }, + { AOM_CDF4(9217, 15251, 20054) }, + { AOM_CDF4(5078, 9284, 12594) } } }, + { { { AOM_CDF4(2331, 3662, 5244) }, + { AOM_CDF4(2891, 4771, 6145) }, + { AOM_CDF4(4598, 7623, 9729) }, + { AOM_CDF4(3520, 6845, 9199) }, + { AOM_CDF4(3417, 6119, 9324) }, + { AOM_CDF4(2601, 5412, 7385) }, + { AOM_CDF4(600, 1173, 1744) }, + { AOM_CDF4(7672, 13286, 17469) }, + { AOM_CDF4(4232, 7792, 10793) }, + { AOM_CDF4(2915, 5317, 7397) }, + { AOM_CDF4(2318, 4356, 6152) }, + { AOM_CDF4(2127, 4000, 5554) }, + { AOM_CDF4(1850, 3478, 5275) }, + { AOM_CDF4(977, 1933, 2843) }, + { AOM_CDF4(18280, 24387, 27989) }, + { AOM_CDF4(15852, 22671, 26185) }, + { AOM_CDF4(13845, 20951, 24789) }, + { AOM_CDF4(11055, 17966, 22129) }, + { AOM_CDF4(9138, 15422, 19801) }, + { AOM_CDF4(7454, 13145, 17456) }, + { AOM_CDF4(3370, 6393, 9013) } }, + { { AOM_CDF4(5842, 9229, 10838) }, + { AOM_CDF4(2313, 3491, 4276) }, + { AOM_CDF4(2998, 6104, 7496) }, + { AOM_CDF4(2420, 7447, 9868) }, + { AOM_CDF4(3034, 8495, 10923) }, + { AOM_CDF4(4076, 8937, 10975) }, + { AOM_CDF4(1086, 2370, 3299) }, + { 
AOM_CDF4(9714, 17254, 20444) }, + { AOM_CDF4(8543, 13698, 17123) }, + { AOM_CDF4(4918, 9007, 11910) }, + { AOM_CDF4(4129, 7532, 10553) }, + { AOM_CDF4(2364, 5533, 8058) }, + { AOM_CDF4(1834, 3546, 5563) }, + { AOM_CDF4(1473, 2908, 4133) }, + { AOM_CDF4(15405, 21193, 25619) }, + { AOM_CDF4(15691, 21952, 26561) }, + { AOM_CDF4(12962, 19194, 24165) }, + { AOM_CDF4(10272, 17855, 22129) }, + { AOM_CDF4(8588, 15270, 20718) }, + { AOM_CDF4(8682, 14669, 19500) }, + { AOM_CDF4(4870, 9636, 13205) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { 
AOM_CDF4(14995, 21341, 24749) }, + { AOM_CDF4(13158, 20289, 24601) }, + { AOM_CDF4(8941, 15326, 19876) }, + { AOM_CDF4(6297, 11541, 15807) }, + { AOM_CDF4(4817, 9029, 12776) }, + { AOM_CDF4(3731, 7273, 10627) }, + { AOM_CDF4(1847, 3617, 5354) }, + { AOM_CDF4(14472, 19659, 22343) }, + { AOM_CDF4(16806, 24162, 27533) }, + { AOM_CDF4(12900, 20404, 24713) }, + { AOM_CDF4(9411, 16112, 20797) }, + { AOM_CDF4(7056, 12697, 17148) }, + { AOM_CDF4(5544, 10339, 14460) }, + { AOM_CDF4(2954, 5704, 8319) }, + { AOM_CDF4(12464, 18071, 21354) }, + { AOM_CDF4(15482, 22528, 26034) }, + { AOM_CDF4(12070, 19269, 23624) }, + { AOM_CDF4(8953, 15406, 20106) }, + { AOM_CDF4(7027, 12730, 17220) }, + { AOM_CDF4(5887, 10913, 15140) }, + { AOM_CDF4(3793, 7278, 10447) } }, + { { AOM_CDF4(15571, 22232, 25749) }, + { AOM_CDF4(14506, 21575, 25374) }, + { AOM_CDF4(10189, 17089, 21569) }, + { AOM_CDF4(7316, 13301, 17915) }, + { AOM_CDF4(5783, 10912, 15190) }, + { AOM_CDF4(4760, 9155, 13088) }, + { AOM_CDF4(2993, 5966, 8774) }, + { AOM_CDF4(23424, 28903, 30778) }, + { AOM_CDF4(20775, 27666, 30290) }, + { AOM_CDF4(16474, 24410, 28299) }, + { AOM_CDF4(12471, 20180, 24987) }, + { AOM_CDF4(9410, 16487, 21439) }, + { AOM_CDF4(7536, 13614, 18529) }, + { AOM_CDF4(5048, 9586, 13549) }, + { AOM_CDF4(21090, 27290, 29756) }, + { AOM_CDF4(20796, 27402, 30026) }, + { AOM_CDF4(17819, 25485, 28969) }, + { AOM_CDF4(13860, 21909, 26462) }, + { AOM_CDF4(11002, 18494, 23529) }, + { AOM_CDF4(8953, 15929, 20897) }, + { AOM_CDF4(6448, 11918, 16454) } } }, + { { { AOM_CDF4(15999, 22208, 25449) }, + { AOM_CDF4(13050, 19988, 24122) }, + { AOM_CDF4(8594, 14864, 19378) }, + { AOM_CDF4(6033, 11079, 15238) }, + { AOM_CDF4(4554, 8683, 12347) }, + { AOM_CDF4(3672, 7139, 10337) }, + { AOM_CDF4(1900, 3771, 5576) }, + { AOM_CDF4(15788, 21340, 23949) }, + { AOM_CDF4(16825, 24235, 27758) }, + { AOM_CDF4(12873, 20402, 24810) }, + { AOM_CDF4(9590, 16363, 21094) }, + { AOM_CDF4(7352, 13209, 17733) }, + { AOM_CDF4(5960, 10989, 15184) }, + 
{ AOM_CDF4(3232, 6234, 9007) }, + { AOM_CDF4(15761, 20716, 23224) }, + { AOM_CDF4(19318, 25989, 28759) }, + { AOM_CDF4(15529, 23094, 26929) }, + { AOM_CDF4(11662, 18989, 23641) }, + { AOM_CDF4(8955, 15568, 20366) }, + { AOM_CDF4(7281, 13106, 17708) }, + { AOM_CDF4(4248, 8059, 11440) } }, + { { AOM_CDF4(14899, 21217, 24503) }, + { AOM_CDF4(13519, 20283, 24047) }, + { AOM_CDF4(9429, 15966, 20365) }, + { AOM_CDF4(6700, 12355, 16652) }, + { AOM_CDF4(5088, 9704, 13716) }, + { AOM_CDF4(4243, 8154, 11731) }, + { AOM_CDF4(2702, 5364, 7861) }, + { AOM_CDF4(22745, 28388, 30454) }, + { AOM_CDF4(20235, 27146, 29922) }, + { AOM_CDF4(15896, 23715, 27637) }, + { AOM_CDF4(11840, 19350, 24131) }, + { AOM_CDF4(9122, 15932, 20880) }, + { AOM_CDF4(7488, 13581, 18362) }, + { AOM_CDF4(5114, 9568, 13370) }, + { AOM_CDF4(20845, 26553, 28932) }, + { AOM_CDF4(20981, 27372, 29884) }, + { AOM_CDF4(17781, 25335, 28785) }, + { AOM_CDF4(13760, 21708, 26297) }, + { AOM_CDF4(10975, 18415, 23365) }, + { AOM_CDF4(9045, 15789, 20686) }, + { AOM_CDF4(6130, 11199, 15423) } } }, + { { { AOM_CDF4(13549, 19724, 23158) }, + { AOM_CDF4(11844, 18382, 22246) }, + { AOM_CDF4(7919, 13619, 17773) }, + { AOM_CDF4(5486, 10143, 13946) }, + { AOM_CDF4(4166, 7983, 11324) }, + { AOM_CDF4(3364, 6506, 9427) }, + { AOM_CDF4(1598, 3160, 4674) }, + { AOM_CDF4(15281, 20979, 23781) }, + { AOM_CDF4(14939, 22119, 25952) }, + { AOM_CDF4(11363, 18407, 22812) }, + { AOM_CDF4(8609, 14857, 19370) }, + { AOM_CDF4(6737, 12184, 16480) }, + { AOM_CDF4(5506, 10263, 14262) }, + { AOM_CDF4(2990, 5786, 8380) }, + { AOM_CDF4(20249, 25253, 27417) }, + { AOM_CDF4(21070, 27518, 30001) }, + { AOM_CDF4(16854, 24469, 28074) }, + { AOM_CDF4(12864, 20486, 25000) }, + { AOM_CDF4(9962, 16978, 21778) }, + { AOM_CDF4(8074, 14338, 19048) }, + { AOM_CDF4(4494, 8479, 11906) } }, + { { AOM_CDF4(13960, 19617, 22829) }, + { AOM_CDF4(11150, 17341, 21228) }, + { AOM_CDF4(7150, 12964, 17190) }, + { AOM_CDF4(5331, 10002, 13867) }, + { AOM_CDF4(4167, 7744, 11057) 
}, + { AOM_CDF4(3480, 6629, 9646) }, + { AOM_CDF4(1883, 3784, 5686) }, + { AOM_CDF4(18752, 25660, 28912) }, + { AOM_CDF4(16968, 24586, 28030) }, + { AOM_CDF4(13520, 21055, 25313) }, + { AOM_CDF4(10453, 17626, 22280) }, + { AOM_CDF4(8386, 14505, 19116) }, + { AOM_CDF4(6742, 12595, 17008) }, + { AOM_CDF4(4273, 8140, 11499) }, + { AOM_CDF4(22120, 27827, 30233) }, + { AOM_CDF4(20563, 27358, 29895) }, + { AOM_CDF4(17076, 24644, 28153) }, + { AOM_CDF4(13362, 20942, 25309) }, + { AOM_CDF4(10794, 17965, 22695) }, + { AOM_CDF4(9014, 15652, 20319) }, + { AOM_CDF4(5708, 10512, 14497) } } }, + { { { AOM_CDF4(5705, 10930, 15725) }, + { AOM_CDF4(7946, 12765, 16115) }, + { AOM_CDF4(6801, 12123, 16226) }, + { AOM_CDF4(5462, 10135, 14200) }, + { AOM_CDF4(4189, 8011, 11507) }, + { AOM_CDF4(3191, 6229, 9408) }, + { AOM_CDF4(1057, 2137, 3212) }, + { AOM_CDF4(10018, 17067, 21491) }, + { AOM_CDF4(7380, 12582, 16453) }, + { AOM_CDF4(6068, 10845, 14339) }, + { AOM_CDF4(5098, 9198, 12555) }, + { AOM_CDF4(4312, 8010, 11119) }, + { AOM_CDF4(3700, 6966, 9781) }, + { AOM_CDF4(1693, 3326, 4887) }, + { AOM_CDF4(18757, 24930, 27774) }, + { AOM_CDF4(17648, 24596, 27817) }, + { AOM_CDF4(14707, 22052, 26026) }, + { AOM_CDF4(11720, 18852, 23292) }, + { AOM_CDF4(9357, 15952, 20525) }, + { AOM_CDF4(7810, 13753, 18210) }, + { AOM_CDF4(3879, 7333, 10328) } }, + { { AOM_CDF4(8278, 13242, 15922) }, + { AOM_CDF4(10547, 15867, 18919) }, + { AOM_CDF4(9106, 15842, 20609) }, + { AOM_CDF4(6833, 13007, 17218) }, + { AOM_CDF4(4811, 9712, 13923) }, + { AOM_CDF4(3985, 7352, 11128) }, + { AOM_CDF4(1688, 3458, 5262) }, + { AOM_CDF4(12951, 21861, 26510) }, + { AOM_CDF4(9788, 16044, 20276) }, + { AOM_CDF4(6309, 11244, 14870) }, + { AOM_CDF4(5183, 9349, 12566) }, + { AOM_CDF4(4389, 8229, 11492) }, + { AOM_CDF4(3633, 6945, 10620) }, + { AOM_CDF4(3600, 6847, 9907) }, + { AOM_CDF4(21748, 28137, 30255) }, + { AOM_CDF4(19436, 26581, 29560) }, + { AOM_CDF4(16359, 24201, 27953) }, + { AOM_CDF4(13961, 21693, 25871) }, + { 
AOM_CDF4(11544, 18686, 23322) }, + { AOM_CDF4(9372, 16462, 20952) }, + { AOM_CDF4(6138, 11210, 15390) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(16138, 22223, 25509) }, + { AOM_CDF4(15347, 22430, 26332) }, + { AOM_CDF4(9614, 16736, 21332) }, + { AOM_CDF4(6600, 12275, 16907) }, + { AOM_CDF4(4811, 9424, 13547) }, + { AOM_CDF4(3748, 7809, 11420) }, + { AOM_CDF4(2254, 4587, 6890) }, + { AOM_CDF4(15196, 20284, 23177) }, + { AOM_CDF4(18317, 25469, 28451) }, + { AOM_CDF4(13918, 21651, 25842) }, + { AOM_CDF4(10052, 17150, 21995) }, 
+ { AOM_CDF4(7499, 13630, 18587) }, + { AOM_CDF4(6158, 11417, 16003) }, + { AOM_CDF4(4014, 7785, 11252) }, + { AOM_CDF4(15048, 21067, 24384) }, + { AOM_CDF4(18202, 25346, 28553) }, + { AOM_CDF4(14302, 22019, 26356) }, + { AOM_CDF4(10839, 18139, 23166) }, + { AOM_CDF4(8715, 15744, 20806) }, + { AOM_CDF4(7536, 13576, 18544) }, + { AOM_CDF4(5413, 10335, 14498) } }, + { { AOM_CDF4(17394, 24501, 27895) }, + { AOM_CDF4(15889, 23420, 27185) }, + { AOM_CDF4(11561, 19133, 23870) }, + { AOM_CDF4(8285, 14812, 19844) }, + { AOM_CDF4(6496, 12043, 16550) }, + { AOM_CDF4(4771, 9574, 13677) }, + { AOM_CDF4(3603, 6830, 10144) }, + { AOM_CDF4(21656, 27704, 30200) }, + { AOM_CDF4(21324, 27915, 30511) }, + { AOM_CDF4(17327, 25336, 28997) }, + { AOM_CDF4(13417, 21381, 26033) }, + { AOM_CDF4(10132, 17425, 22338) }, + { AOM_CDF4(8580, 15016, 19633) }, + { AOM_CDF4(5694, 11477, 16411) }, + { AOM_CDF4(24116, 29780, 31450) }, + { AOM_CDF4(23853, 29695, 31591) }, + { AOM_CDF4(20085, 27614, 30428) }, + { AOM_CDF4(15326, 24335, 28575) }, + { AOM_CDF4(11814, 19472, 24810) }, + { AOM_CDF4(10221, 18611, 24767) }, + { AOM_CDF4(7689, 14558, 20321) } } }, + { { { AOM_CDF4(16214, 22380, 25770) }, + { AOM_CDF4(14213, 21304, 25295) }, + { AOM_CDF4(9213, 15823, 20455) }, + { AOM_CDF4(6395, 11758, 16139) }, + { AOM_CDF4(4779, 9187, 13066) }, + { AOM_CDF4(3821, 7501, 10953) }, + { AOM_CDF4(2293, 4567, 6795) }, + { AOM_CDF4(15859, 21283, 23820) }, + { AOM_CDF4(18404, 25602, 28726) }, + { AOM_CDF4(14325, 21980, 26206) }, + { AOM_CDF4(10669, 17937, 22720) }, + { AOM_CDF4(8297, 14642, 19447) }, + { AOM_CDF4(6746, 12389, 16893) }, + { AOM_CDF4(4324, 8251, 11770) }, + { AOM_CDF4(16532, 21631, 24475) }, + { AOM_CDF4(20667, 27150, 29668) }, + { AOM_CDF4(16728, 24510, 28175) }, + { AOM_CDF4(12861, 20645, 25332) }, + { AOM_CDF4(10076, 17361, 22417) }, + { AOM_CDF4(8395, 14940, 19963) }, + { AOM_CDF4(5731, 10683, 14912) } }, + { { AOM_CDF4(14433, 21155, 24938) }, + { AOM_CDF4(14658, 21716, 25545) }, + { 
AOM_CDF4(9923, 16824, 21557) }, + { AOM_CDF4(6982, 13052, 17721) }, + { AOM_CDF4(5419, 10503, 15050) }, + { AOM_CDF4(4852, 9162, 13014) }, + { AOM_CDF4(3271, 6395, 9630) }, + { AOM_CDF4(22210, 27833, 30109) }, + { AOM_CDF4(20750, 27368, 29821) }, + { AOM_CDF4(16894, 24828, 28573) }, + { AOM_CDF4(13247, 21276, 25757) }, + { AOM_CDF4(10038, 17265, 22563) }, + { AOM_CDF4(8587, 14947, 20327) }, + { AOM_CDF4(5645, 11371, 15252) }, + { AOM_CDF4(22027, 27526, 29714) }, + { AOM_CDF4(23098, 29146, 31221) }, + { AOM_CDF4(19886, 27341, 30272) }, + { AOM_CDF4(15609, 23747, 28046) }, + { AOM_CDF4(11993, 20065, 24939) }, + { AOM_CDF4(9637, 18267, 23671) }, + { AOM_CDF4(7625, 13801, 19144) } } }, + { { { AOM_CDF4(14438, 20798, 24089) }, + { AOM_CDF4(12621, 19203, 23097) }, + { AOM_CDF4(8177, 14125, 18402) }, + { AOM_CDF4(5674, 10501, 14456) }, + { AOM_CDF4(4236, 8239, 11733) }, + { AOM_CDF4(3447, 6750, 9806) }, + { AOM_CDF4(1986, 3950, 5864) }, + { AOM_CDF4(16208, 22099, 24930) }, + { AOM_CDF4(16537, 24025, 27585) }, + { AOM_CDF4(12780, 20381, 24867) }, + { AOM_CDF4(9767, 16612, 21416) }, + { AOM_CDF4(7686, 13738, 18398) }, + { AOM_CDF4(6333, 11614, 15964) }, + { AOM_CDF4(3941, 7571, 10836) }, + { AOM_CDF4(22819, 27422, 29202) }, + { AOM_CDF4(22224, 28514, 30721) }, + { AOM_CDF4(17660, 25433, 28913) }, + { AOM_CDF4(13574, 21482, 26002) }, + { AOM_CDF4(10629, 17977, 22938) }, + { AOM_CDF4(8612, 15298, 20265) }, + { AOM_CDF4(5607, 10491, 14596) } }, + { { AOM_CDF4(13569, 19800, 23206) }, + { AOM_CDF4(13128, 19924, 23869) }, + { AOM_CDF4(8329, 14841, 19403) }, + { AOM_CDF4(6130, 10976, 15057) }, + { AOM_CDF4(4682, 8839, 12518) }, + { AOM_CDF4(3656, 7409, 10588) }, + { AOM_CDF4(2577, 5099, 7412) }, + { AOM_CDF4(22427, 28684, 30585) }, + { AOM_CDF4(20913, 27750, 30139) }, + { AOM_CDF4(15840, 24109, 27834) }, + { AOM_CDF4(12308, 20029, 24569) }, + { AOM_CDF4(10216, 16785, 21458) }, + { AOM_CDF4(8309, 14203, 19113) }, + { AOM_CDF4(6043, 11168, 15307) }, + { AOM_CDF4(23166, 28901, 30998) 
}, + { AOM_CDF4(21899, 28405, 30751) }, + { AOM_CDF4(18413, 26091, 29443) }, + { AOM_CDF4(15233, 23114, 27352) }, + { AOM_CDF4(12683, 20472, 25288) }, + { AOM_CDF4(10702, 18259, 23409) }, + { AOM_CDF4(8125, 14464, 19226) } } }, + { { { AOM_CDF4(9040, 14786, 18360) }, + { AOM_CDF4(9979, 15718, 19415) }, + { AOM_CDF4(7913, 13918, 18311) }, + { AOM_CDF4(5859, 10889, 15184) }, + { AOM_CDF4(4593, 8677, 12510) }, + { AOM_CDF4(3820, 7396, 10791) }, + { AOM_CDF4(1730, 3471, 5192) }, + { AOM_CDF4(11803, 18365, 22709) }, + { AOM_CDF4(11419, 18058, 22225) }, + { AOM_CDF4(9418, 15774, 20243) }, + { AOM_CDF4(7539, 13325, 17657) }, + { AOM_CDF4(6233, 11317, 15384) }, + { AOM_CDF4(5137, 9656, 13545) }, + { AOM_CDF4(2977, 5774, 8349) }, + { AOM_CDF4(21207, 27246, 29640) }, + { AOM_CDF4(19547, 26578, 29497) }, + { AOM_CDF4(16169, 23871, 27690) }, + { AOM_CDF4(12820, 20458, 25018) }, + { AOM_CDF4(10224, 17332, 22214) }, + { AOM_CDF4(8526, 15048, 19884) }, + { AOM_CDF4(5037, 9410, 13118) } }, + { { AOM_CDF4(12339, 17329, 20140) }, + { AOM_CDF4(13505, 19895, 23225) }, + { AOM_CDF4(9847, 16944, 21564) }, + { AOM_CDF4(7280, 13256, 18348) }, + { AOM_CDF4(4712, 10009, 14454) }, + { AOM_CDF4(4361, 7914, 12477) }, + { AOM_CDF4(2870, 5628, 7995) }, + { AOM_CDF4(20061, 25504, 28526) }, + { AOM_CDF4(15235, 22878, 26145) }, + { AOM_CDF4(12985, 19958, 24155) }, + { AOM_CDF4(9782, 16641, 21403) }, + { AOM_CDF4(9456, 16360, 20760) }, + { AOM_CDF4(6855, 12940, 18557) }, + { AOM_CDF4(5661, 10564, 15002) }, + { AOM_CDF4(25656, 30602, 31894) }, + { AOM_CDF4(22570, 29107, 31092) }, + { AOM_CDF4(18917, 26423, 29541) }, + { AOM_CDF4(15940, 23649, 27754) }, + { AOM_CDF4(12803, 20581, 25219) }, + { AOM_CDF4(11082, 18695, 23376) }, + { AOM_CDF4(7939, 14373, 19005) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(18315, 24289, 27551) }, + { AOM_CDF4(16854, 24068, 27835) }, + { AOM_CDF4(10140, 17927, 23173) }, + { AOM_CDF4(6722, 12982, 18267) }, + { AOM_CDF4(4661, 9826, 14706) }, + { AOM_CDF4(3832, 8165, 12294) }, + { AOM_CDF4(2795, 6098, 9245) }, + { AOM_CDF4(17145, 23326, 26672) }, + { AOM_CDF4(20733, 27680, 30308) }, + { AOM_CDF4(16032, 24461, 28546) }, + { AOM_CDF4(11653, 20093, 25081) }, + { AOM_CDF4(9290, 16429, 22086) }, + { AOM_CDF4(7796, 14598, 19982) }, + { AOM_CDF4(6502, 12378, 17441) }, + { AOM_CDF4(21681, 27732, 30320) }, + { AOM_CDF4(22389, 29044, 31261) }, + { AOM_CDF4(19027, 26731, 30087) }, + { AOM_CDF4(14739, 23755, 28624) }, + { AOM_CDF4(11358, 20778, 25511) }, + { AOM_CDF4(10995, 18073, 24190) }, + { AOM_CDF4(9162, 
14990, 20617) } }, + { { AOM_CDF4(21425, 27952, 30388) }, + { AOM_CDF4(18062, 25838, 29034) }, + { AOM_CDF4(11956, 19881, 24808) }, + { AOM_CDF4(7718, 15000, 20980) }, + { AOM_CDF4(5702, 11254, 16143) }, + { AOM_CDF4(4898, 9088, 16864) }, + { AOM_CDF4(3679, 6776, 11907) }, + { AOM_CDF4(23294, 30160, 31663) }, + { AOM_CDF4(24397, 29896, 31836) }, + { AOM_CDF4(19245, 27128, 30593) }, + { AOM_CDF4(13202, 19825, 26404) }, + { AOM_CDF4(11578, 19297, 23957) }, + { AOM_CDF4(8073, 13297, 21370) }, + { AOM_CDF4(5461, 10923, 19745) }, + { AOM_CDF4(27367, 30521, 31934) }, + { AOM_CDF4(24904, 30671, 31940) }, + { AOM_CDF4(23075, 28460, 31299) }, + { AOM_CDF4(14400, 23658, 30417) }, + { AOM_CDF4(13885, 23882, 28325) }, + { AOM_CDF4(14746, 22938, 27853) }, + { AOM_CDF4(5461, 16384, 27307) } } }, + { { { AOM_CDF4(18274, 24813, 27890) }, + { AOM_CDF4(15537, 23149, 27003) }, + { AOM_CDF4(9449, 16740, 21827) }, + { AOM_CDF4(6700, 12498, 17261) }, + { AOM_CDF4(4988, 9866, 14198) }, + { AOM_CDF4(4236, 8147, 11902) }, + { AOM_CDF4(2867, 5860, 8654) }, + { AOM_CDF4(17124, 23171, 26101) }, + { AOM_CDF4(20396, 27477, 30148) }, + { AOM_CDF4(16573, 24629, 28492) }, + { AOM_CDF4(12749, 20846, 25674) }, + { AOM_CDF4(10233, 17878, 22818) }, + { AOM_CDF4(8525, 15332, 20363) }, + { AOM_CDF4(6283, 11632, 16255) }, + { AOM_CDF4(20466, 26511, 29286) }, + { AOM_CDF4(23059, 29174, 31191) }, + { AOM_CDF4(19481, 27263, 30241) }, + { AOM_CDF4(15458, 23631, 28137) }, + { AOM_CDF4(12416, 20608, 25693) }, + { AOM_CDF4(10261, 18011, 23261) }, + { AOM_CDF4(8016, 14655, 19666) } }, + { { AOM_CDF4(17616, 24586, 28112) }, + { AOM_CDF4(15809, 23299, 27155) }, + { AOM_CDF4(10767, 18890, 23793) }, + { AOM_CDF4(7727, 14255, 18865) }, + { AOM_CDF4(6129, 11926, 16882) }, + { AOM_CDF4(4482, 9704, 14861) }, + { AOM_CDF4(3277, 7452, 11522) }, + { AOM_CDF4(22956, 28551, 30730) }, + { AOM_CDF4(22724, 28937, 30961) }, + { AOM_CDF4(18467, 26324, 29580) }, + { AOM_CDF4(13234, 20713, 25649) }, + { AOM_CDF4(11181, 17592, 
22481) }, + { AOM_CDF4(8291, 18358, 24576) }, + { AOM_CDF4(7568, 11881, 14984) }, + { AOM_CDF4(24948, 29001, 31147) }, + { AOM_CDF4(25674, 30619, 32151) }, + { AOM_CDF4(20841, 26793, 29603) }, + { AOM_CDF4(14669, 24356, 28666) }, + { AOM_CDF4(11334, 23593, 28219) }, + { AOM_CDF4(8922, 14762, 22873) }, + { AOM_CDF4(8301, 13544, 20535) } } }, + { { { AOM_CDF4(17113, 23733, 27081) }, + { AOM_CDF4(14139, 21406, 25452) }, + { AOM_CDF4(8552, 15002, 19776) }, + { AOM_CDF4(5871, 11120, 15378) }, + { AOM_CDF4(4455, 8616, 12253) }, + { AOM_CDF4(3469, 6910, 10386) }, + { AOM_CDF4(2255, 4553, 6782) }, + { AOM_CDF4(18224, 24376, 27053) }, + { AOM_CDF4(19290, 26710, 29614) }, + { AOM_CDF4(14936, 22991, 27184) }, + { AOM_CDF4(11238, 18951, 23762) }, + { AOM_CDF4(8786, 15617, 20588) }, + { AOM_CDF4(7317, 13228, 18003) }, + { AOM_CDF4(5101, 9512, 13493) }, + { AOM_CDF4(22639, 28222, 30210) }, + { AOM_CDF4(23216, 29331, 31307) }, + { AOM_CDF4(19075, 26762, 29895) }, + { AOM_CDF4(15014, 23113, 27457) }, + { AOM_CDF4(11938, 19857, 24752) }, + { AOM_CDF4(9942, 17280, 22282) }, + { AOM_CDF4(7167, 13144, 17752) } }, + { { AOM_CDF4(15820, 22738, 26488) }, + { AOM_CDF4(13530, 20885, 25216) }, + { AOM_CDF4(8395, 15530, 20452) }, + { AOM_CDF4(6574, 12321, 16380) }, + { AOM_CDF4(5353, 10419, 14568) }, + { AOM_CDF4(4613, 8446, 12381) }, + { AOM_CDF4(3440, 7158, 9903) }, + { AOM_CDF4(24247, 29051, 31224) }, + { AOM_CDF4(22118, 28058, 30369) }, + { AOM_CDF4(16498, 24768, 28389) }, + { AOM_CDF4(12920, 21175, 26137) }, + { AOM_CDF4(10730, 18619, 25352) }, + { AOM_CDF4(10187, 16279, 22791) }, + { AOM_CDF4(9310, 14631, 22127) }, + { AOM_CDF4(24970, 30558, 32057) }, + { AOM_CDF4(24801, 29942, 31698) }, + { AOM_CDF4(22432, 28453, 30855) }, + { AOM_CDF4(19054, 25680, 29580) }, + { AOM_CDF4(14392, 23036, 28109) }, + { AOM_CDF4(12495, 20947, 26650) }, + { AOM_CDF4(12442, 20326, 26214) } } }, + { { { AOM_CDF4(12162, 18785, 22648) }, + { AOM_CDF4(12749, 19697, 23806) }, + { AOM_CDF4(8580, 15297, 20346) }, 
+ { AOM_CDF4(6169, 11749, 16543) }, + { AOM_CDF4(4836, 9391, 13448) }, + { AOM_CDF4(3821, 7711, 11613) }, + { AOM_CDF4(2228, 4601, 7070) }, + { AOM_CDF4(16319, 24725, 28280) }, + { AOM_CDF4(15698, 23277, 27168) }, + { AOM_CDF4(12726, 20368, 25047) }, + { AOM_CDF4(9912, 17015, 21976) }, + { AOM_CDF4(7888, 14220, 19179) }, + { AOM_CDF4(6777, 12284, 17018) }, + { AOM_CDF4(4492, 8590, 12252) }, + { AOM_CDF4(23249, 28904, 30947) }, + { AOM_CDF4(21050, 27908, 30512) }, + { AOM_CDF4(17440, 25340, 28949) }, + { AOM_CDF4(14059, 22018, 26541) }, + { AOM_CDF4(11288, 18903, 23898) }, + { AOM_CDF4(9411, 16342, 21428) }, + { AOM_CDF4(6278, 11588, 15944) } }, + { { AOM_CDF4(13981, 20067, 23226) }, + { AOM_CDF4(16922, 23580, 26783) }, + { AOM_CDF4(11005, 19039, 24487) }, + { AOM_CDF4(7389, 14218, 19798) }, + { AOM_CDF4(5598, 11505, 17206) }, + { AOM_CDF4(6090, 11213, 15659) }, + { AOM_CDF4(3820, 7371, 10119) }, + { AOM_CDF4(21082, 26925, 29675) }, + { AOM_CDF4(21262, 28627, 31128) }, + { AOM_CDF4(18392, 26454, 30437) }, + { AOM_CDF4(14870, 22910, 27096) }, + { AOM_CDF4(12620, 19484, 24908) }, + { AOM_CDF4(9290, 16553, 22802) }, + { AOM_CDF4(6668, 14288, 20004) }, + { AOM_CDF4(27704, 31055, 31949) }, + { AOM_CDF4(24709, 29978, 31788) }, + { AOM_CDF4(21668, 29264, 31657) }, + { AOM_CDF4(18295, 26968, 30074) }, + { AOM_CDF4(16399, 24422, 29313) }, + { AOM_CDF4(14347, 23026, 28104) }, + { AOM_CDF4(12370, 19806, 24477) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } + }; -static const coeff_cdf_model -av1_default_coef_head_cdfs_q3[TX_SIZES][PLANE_TYPES] = { - { // TX 4X4 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(25117), AOM_ICDF(25655), AOM_ICDF(28371), AOM_ICDF(30246), - AOM_ICDF(30939), AOM_ICDF(32768), }, - {AOM_ICDF(15083), AOM_ICDF(16850), AOM_ICDF(26029), AOM_ICDF(29031), - AOM_ICDF(30115), AOM_ICDF(32768), }, - {AOM_ICDF(8774), AOM_ICDF(12118), AOM_ICDF(22041), AOM_ICDF(26730), - AOM_ICDF(28574), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(13690), AOM_ICDF(23135), AOM_ICDF(31469), AOM_ICDF(31868), - AOM_ICDF(32768), }, - {AOM_ICDF(13306), AOM_ICDF(22730), AOM_ICDF(31466), AOM_ICDF(31860), - AOM_ICDF(32768), }, - {AOM_ICDF(13503), AOM_ICDF(19892), AOM_ICDF(30528), AOM_ICDF(31005), - AOM_ICDF(32768), }, - {AOM_ICDF(13150), AOM_ICDF(16108), AOM_ICDF(28345), AOM_ICDF(28869), - AOM_ICDF(32768), }, - {AOM_ICDF(12014), AOM_ICDF(12842), AOM_ICDF(25693), AOM_ICDF(26145), - AOM_ICDF(32768), }, - {AOM_ICDF(8937), AOM_ICDF(13405), AOM_ICDF(23831), AOM_ICDF(28300), - AOM_ICDF(32768), }, - }, - { // Band 2 - 
{AOM_ICDF(18707), AOM_ICDF(26260), AOM_ICDF(31853), AOM_ICDF(32238), - AOM_ICDF(32768), }, - {AOM_ICDF(15985), AOM_ICDF(24804), AOM_ICDF(31717), AOM_ICDF(32115), - AOM_ICDF(32768), }, - {AOM_ICDF(14012), AOM_ICDF(18913), AOM_ICDF(30497), AOM_ICDF(31005), - AOM_ICDF(32768), }, - {AOM_ICDF(12300), AOM_ICDF(14741), AOM_ICDF(28386), AOM_ICDF(28958), - AOM_ICDF(32768), }, - {AOM_ICDF(12483), AOM_ICDF(15084), AOM_ICDF(24966), AOM_ICDF(26526), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(19934), AOM_ICDF(28117), AOM_ICDF(32022), AOM_ICDF(32378), - AOM_ICDF(32768), }, - {AOM_ICDF(14925), AOM_ICDF(26201), AOM_ICDF(31828), AOM_ICDF(32262), - AOM_ICDF(32768), }, - {AOM_ICDF(13132), AOM_ICDF(18927), AOM_ICDF(30269), AOM_ICDF(31173), - AOM_ICDF(32768), }, - {AOM_ICDF(13926), AOM_ICDF(19251), AOM_ICDF(28262), AOM_ICDF(29901), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(16626), AOM_ICDF(28981), AOM_ICDF(32074), AOM_ICDF(32413), - AOM_ICDF(32768), }, - {AOM_ICDF(12895), AOM_ICDF(27583), AOM_ICDF(31974), AOM_ICDF(32332), - AOM_ICDF(32768), }, - {AOM_ICDF(14150), AOM_ICDF(22094), AOM_ICDF(31030), AOM_ICDF(31775), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(5279), AOM_ICDF(29309), AOM_ICDF(32149), AOM_ICDF(32477), - AOM_ICDF(32768), }, - {AOM_ICDF(5880), AOM_ICDF(29657), AOM_ICDF(32086), AOM_ICDF(32385), - AOM_ICDF(32768), }, - {AOM_ICDF(11469), AOM_ICDF(18022), AOM_ICDF(22938), AOM_ICDF(27853), - AOM_ICDF(32768), }, 
- {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(8302), AOM_ICDF(14024), AOM_ICDF(16072), AOM_ICDF(27926), - AOM_ICDF(28871), AOM_ICDF(32768), }, - {AOM_ICDF(9359), AOM_ICDF(15522), AOM_ICDF(20581), AOM_ICDF(28595), - AOM_ICDF(29250), AOM_ICDF(32768), }, - {AOM_ICDF(5318), AOM_ICDF(12803), AOM_ICDF(19679), AOM_ICDF(27719), - AOM_ICDF(28609), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(22745), AOM_ICDF(25806), AOM_ICDF(31997), AOM_ICDF(32327), - AOM_ICDF(32768), }, - {AOM_ICDF(18803), AOM_ICDF(25473), AOM_ICDF(31960), AOM_ICDF(32293), - AOM_ICDF(32768), }, - {AOM_ICDF(15553), AOM_ICDF(19553), AOM_ICDF(31039), AOM_ICDF(31407), - AOM_ICDF(32768), }, - {AOM_ICDF(13037), AOM_ICDF(15169), AOM_ICDF(28589), AOM_ICDF(29060), - AOM_ICDF(32768), }, - {AOM_ICDF(10871), AOM_ICDF(11694), AOM_ICDF(24941), AOM_ICDF(25360), - AOM_ICDF(32768), }, - {AOM_ICDF(6242), AOM_ICDF(10923), AOM_ICDF(18725), AOM_ICDF(23406), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22276), AOM_ICDF(27316), AOM_ICDF(32078), AOM_ICDF(32402), - AOM_ICDF(32768), }, - {AOM_ICDF(19227), AOM_ICDF(25420), AOM_ICDF(31954), AOM_ICDF(32293), - AOM_ICDF(32768), }, - {AOM_ICDF(12383), AOM_ICDF(16969), AOM_ICDF(30280), AOM_ICDF(30766), - AOM_ICDF(32768), }, - {AOM_ICDF(11282), AOM_ICDF(13725), AOM_ICDF(26516), AOM_ICDF(27379), - AOM_ICDF(32768), }, - {AOM_ICDF(5120), AOM_ICDF(9216), AOM_ICDF(15360), AOM_ICDF(20480), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(22814), AOM_ICDF(28656), AOM_ICDF(32097), AOM_ICDF(32425), - AOM_ICDF(32768), }, - {AOM_ICDF(19349), AOM_ICDF(26355), AOM_ICDF(32000), AOM_ICDF(32341), - AOM_ICDF(32768), }, - 
{AOM_ICDF(13824), AOM_ICDF(17830), AOM_ICDF(30780), AOM_ICDF(31142), - AOM_ICDF(32768), }, - {AOM_ICDF(6746), AOM_ICDF(13493), AOM_ICDF(25058), AOM_ICDF(27949), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(19746), AOM_ICDF(28536), AOM_ICDF(32088), AOM_ICDF(32411), - AOM_ICDF(32768), }, - {AOM_ICDF(17457), AOM_ICDF(27155), AOM_ICDF(32024), AOM_ICDF(32376), - AOM_ICDF(32768), }, - {AOM_ICDF(10949), AOM_ICDF(16662), AOM_ICDF(29118), AOM_ICDF(30229), - AOM_ICDF(32768), }, - {AOM_ICDF(6096), AOM_ICDF(12955), AOM_ICDF(21337), AOM_ICDF(27434), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(10114), AOM_ICDF(29713), AOM_ICDF(32140), AOM_ICDF(32448), - AOM_ICDF(32768), }, - {AOM_ICDF(11455), AOM_ICDF(29324), AOM_ICDF(32094), AOM_ICDF(32419), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(14418), AOM_ICDF(23593), AOM_ICDF(27525), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(30309), AOM_ICDF(30623), AOM_ICDF(31738), AOM_ICDF(32084), - AOM_ICDF(32428), AOM_ICDF(32768), }, - {AOM_ICDF(25732), AOM_ICDF(26211), AOM_ICDF(31079), AOM_ICDF(31737), - AOM_ICDF(32269), AOM_ICDF(32768), }, - {AOM_ICDF(19676), AOM_ICDF(21061), AOM_ICDF(29564), AOM_ICDF(31011), - AOM_ICDF(31879), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(12328), AOM_ICDF(28270), AOM_ICDF(32125), AOM_ICDF(32447), - 
AOM_ICDF(32768), }, - {AOM_ICDF(11177), AOM_ICDF(28585), AOM_ICDF(32076), AOM_ICDF(32401), - AOM_ICDF(32768), }, - {AOM_ICDF(13232), AOM_ICDF(25364), AOM_ICDF(31558), AOM_ICDF(32072), - AOM_ICDF(32768), }, - {AOM_ICDF(11997), AOM_ICDF(18443), AOM_ICDF(30261), AOM_ICDF(31873), - AOM_ICDF(32768), }, - {AOM_ICDF(7399), AOM_ICDF(11627), AOM_ICDF(24312), AOM_ICDF(27483), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(16893), AOM_ICDF(29817), AOM_ICDF(32005), AOM_ICDF(32463), - AOM_ICDF(32768), }, - {AOM_ICDF(14911), AOM_ICDF(27935), AOM_ICDF(32179), AOM_ICDF(32473), - AOM_ICDF(32768), }, - {AOM_ICDF(9973), AOM_ICDF(19946), AOM_ICDF(24220), AOM_ICDF(28494), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(18859), AOM_ICDF(29232), AOM_ICDF(31354), AOM_ICDF(32061), - AOM_ICDF(32768), }, - {AOM_ICDF(11281), AOM_ICDF(26322), AOM_ICDF(29545), AOM_ICDF(31156), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(8937), AOM_ICDF(19363), AOM_ICDF(23831), AOM_ICDF(28300), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(30586), AOM_ICDF(30911), AOM_ICDF(31771), AOM_ICDF(32121), - AOM_ICDF(32443), AOM_ICDF(32768), }, - {AOM_ICDF(23875), AOM_ICDF(24492), AOM_ICDF(30970), AOM_ICDF(31684), - AOM_ICDF(32217), AOM_ICDF(32768), }, - {AOM_ICDF(15874), AOM_ICDF(17477), AOM_ICDF(29172), AOM_ICDF(30703), - AOM_ICDF(32023), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17059), AOM_ICDF(30027), AOM_ICDF(32152), AOM_ICDF(32450), - AOM_ICDF(32768), }, - {AOM_ICDF(13931), AOM_ICDF(29387), AOM_ICDF(32103), AOM_ICDF(32414), - AOM_ICDF(32768), }, - {AOM_ICDF(12903), AOM_ICDF(25742), AOM_ICDF(31906), AOM_ICDF(32289), - AOM_ICDF(32768), }, - {AOM_ICDF(13493), AOM_ICDF(23130), AOM_ICDF(29614), AOM_ICDF(30840), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(14746), AOM_ICDF(26214), AOM_ICDF(28672), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18660), AOM_ICDF(30626), AOM_ICDF(32150), AOM_ICDF(32459), - AOM_ICDF(32768), }, - {AOM_ICDF(17338), AOM_ICDF(29279), AOM_ICDF(32168), AOM_ICDF(32495), - AOM_ICDF(32768), }, - {AOM_ICDF(11916), AOM_ICDF(17873), AOM_ICDF(26810), AOM_ICDF(29789), - 
AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(14564), AOM_ICDF(21845), AOM_ICDF(27307), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(23269), AOM_ICDF(31374), AOM_ICDF(32245), AOM_ICDF(32507), - AOM_ICDF(32768), }, - {AOM_ICDF(15741), AOM_ICDF(27628), AOM_ICDF(30840), AOM_ICDF(31804), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(15464), AOM_ICDF(29454), AOM_ICDF(30559), AOM_ICDF(31663), - AOM_ICDF(32768), }, - {AOM_ICDF(6827), AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - 
AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 8X8 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(18128), AOM_ICDF(19079), AOM_ICDF(27400), AOM_ICDF(29265), - AOM_ICDF(30385), AOM_ICDF(32768), }, - {AOM_ICDF(10290), AOM_ICDF(12446), AOM_ICDF(23496), AOM_ICDF(26905), - AOM_ICDF(28729), AOM_ICDF(32768), }, - {AOM_ICDF(5877), AOM_ICDF(9423), AOM_ICDF(18374), AOM_ICDF(23871), - AOM_ICDF(26028), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(16010), AOM_ICDF(22388), AOM_ICDF(30990), AOM_ICDF(31378), - AOM_ICDF(32768), }, - {AOM_ICDF(14579), AOM_ICDF(21619), AOM_ICDF(30755), AOM_ICDF(31177), - AOM_ICDF(32768), }, - {AOM_ICDF(13859), AOM_ICDF(18660), AOM_ICDF(29381), AOM_ICDF(29904), - AOM_ICDF(32768), }, - {AOM_ICDF(12288), AOM_ICDF(14656), AOM_ICDF(27505), AOM_ICDF(28077), - AOM_ICDF(32768), }, - {AOM_ICDF(10009), AOM_ICDF(10812), AOM_ICDF(23591), AOM_ICDF(24068), - AOM_ICDF(32768), }, - {AOM_ICDF(8663), AOM_ICDF(9981), AOM_ICDF(19962), AOM_ICDF(20904), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20773), AOM_ICDF(24941), AOM_ICDF(31701), AOM_ICDF(32046), - AOM_ICDF(32768), }, - {AOM_ICDF(17537), AOM_ICDF(22279), AOM_ICDF(31257), AOM_ICDF(31629), - AOM_ICDF(32768), }, - {AOM_ICDF(13337), AOM_ICDF(15972), AOM_ICDF(29181), AOM_ICDF(29575), - AOM_ICDF(32768), }, - {AOM_ICDF(11120), AOM_ICDF(12128), AOM_ICDF(26440), AOM_ICDF(26874), - AOM_ICDF(32768), }, - {AOM_ICDF(10061), AOM_ICDF(10800), AOM_ICDF(23999), AOM_ICDF(24276), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24073), AOM_ICDF(27227), AOM_ICDF(31920), AOM_ICDF(32246), - AOM_ICDF(32768), }, - {AOM_ICDF(18916), AOM_ICDF(22611), AOM_ICDF(31508), AOM_ICDF(31853), - AOM_ICDF(32768), }, - {AOM_ICDF(13371), AOM_ICDF(14495), AOM_ICDF(28662), AOM_ICDF(29093), - AOM_ICDF(32768), }, - {AOM_ICDF(9283), AOM_ICDF(9840), AOM_ICDF(24228), AOM_ICDF(24506), - AOM_ICDF(32768), }, - {AOM_ICDF(4681), AOM_ICDF(9362), 
AOM_ICDF(20285), AOM_ICDF(24966), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(25180), AOM_ICDF(28079), AOM_ICDF(32048), AOM_ICDF(32365), - AOM_ICDF(32768), }, - {AOM_ICDF(19790), AOM_ICDF(23090), AOM_ICDF(31675), AOM_ICDF(32001), - AOM_ICDF(32768), }, - {AOM_ICDF(12634), AOM_ICDF(13382), AOM_ICDF(28384), AOM_ICDF(28718), - AOM_ICDF(32768), }, - {AOM_ICDF(11264), AOM_ICDF(12083), AOM_ICDF(28672), AOM_ICDF(29286), - AOM_ICDF(32768), }, - {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(26180), AOM_ICDF(29109), AOM_ICDF(32085), AOM_ICDF(32408), - AOM_ICDF(32768), }, - {AOM_ICDF(19990), AOM_ICDF(23991), AOM_ICDF(31806), AOM_ICDF(32152), - AOM_ICDF(32768), }, - {AOM_ICDF(13735), AOM_ICDF(14612), AOM_ICDF(29022), AOM_ICDF(29326), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(10240), AOM_ICDF(25259), AOM_ICDF(27307), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(5084), AOM_ICDF(13063), AOM_ICDF(15732), AOM_ICDF(27628), - AOM_ICDF(28823), AOM_ICDF(32768), }, - {AOM_ICDF(3233), AOM_ICDF(11850), AOM_ICDF(16878), AOM_ICDF(26809), - AOM_ICDF(27973), AOM_ICDF(32768), }, - {AOM_ICDF(1405), AOM_ICDF(10468), AOM_ICDF(15220), AOM_ICDF(25209), - AOM_ICDF(26482), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23854), AOM_ICDF(26692), AOM_ICDF(31964), AOM_ICDF(32291), - AOM_ICDF(32768), }, - {AOM_ICDF(20514), AOM_ICDF(25677), AOM_ICDF(31833), AOM_ICDF(32170), - AOM_ICDF(32768), }, - {AOM_ICDF(16504), AOM_ICDF(20235), AOM_ICDF(30877), AOM_ICDF(31237), - AOM_ICDF(32768), }, - {AOM_ICDF(13241), AOM_ICDF(15173), 
AOM_ICDF(28673), AOM_ICDF(29116), - AOM_ICDF(32768), }, - {AOM_ICDF(9526), AOM_ICDF(10553), AOM_ICDF(23852), AOM_ICDF(24361), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(6428), AOM_ICDF(17806), AOM_ICDF(18148), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24345), AOM_ICDF(27736), AOM_ICDF(32033), AOM_ICDF(32355), - AOM_ICDF(32768), }, - {AOM_ICDF(20277), AOM_ICDF(23726), AOM_ICDF(31700), AOM_ICDF(32031), - AOM_ICDF(32768), }, - {AOM_ICDF(13361), AOM_ICDF(15650), AOM_ICDF(29411), AOM_ICDF(29794), - AOM_ICDF(32768), }, - {AOM_ICDF(9421), AOM_ICDF(10887), AOM_ICDF(25426), AOM_ICDF(26039), - AOM_ICDF(32768), }, - {AOM_ICDF(6242), AOM_ICDF(7607), AOM_ICDF(17749), AOM_ICDF(18530), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26118), AOM_ICDF(28888), AOM_ICDF(32095), AOM_ICDF(32413), - AOM_ICDF(32768), }, - {AOM_ICDF(21286), AOM_ICDF(24631), AOM_ICDF(31871), AOM_ICDF(32198), - AOM_ICDF(32768), }, - {AOM_ICDF(13285), AOM_ICDF(15402), AOM_ICDF(29317), AOM_ICDF(29737), - AOM_ICDF(32768), }, - {AOM_ICDF(9902), AOM_ICDF(10814), AOM_ICDF(24755), AOM_ICDF(25276), - AOM_ICDF(32768), }, - {AOM_ICDF(11431), AOM_ICDF(13717), AOM_ICDF(20575), AOM_ICDF(23623), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27178), AOM_ICDF(29612), AOM_ICDF(32119), AOM_ICDF(32433), - AOM_ICDF(32768), }, - {AOM_ICDF(22095), AOM_ICDF(25550), AOM_ICDF(31976), AOM_ICDF(32298), - AOM_ICDF(32768), }, - {AOM_ICDF(13847), AOM_ICDF(16273), AOM_ICDF(29602), AOM_ICDF(30024), - AOM_ICDF(32768), }, - {AOM_ICDF(8771), AOM_ICDF(10923), AOM_ICDF(19694), AOM_ICDF(20521), - AOM_ICDF(32768), }, - {AOM_ICDF(11398), AOM_ICDF(15672), AOM_ICDF(21370), AOM_ICDF(25645), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - 
{AOM_ICDF(28257), AOM_ICDF(30327), AOM_ICDF(32126), AOM_ICDF(32441), - AOM_ICDF(32768), }, - {AOM_ICDF(22325), AOM_ICDF(26453), AOM_ICDF(32054), AOM_ICDF(32380), - AOM_ICDF(32768), }, - {AOM_ICDF(14860), AOM_ICDF(17652), AOM_ICDF(30682), AOM_ICDF(31035), - AOM_ICDF(32768), }, - {AOM_ICDF(5097), AOM_ICDF(10194), AOM_ICDF(18933), AOM_ICDF(21117), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(28902), AOM_ICDF(29234), AOM_ICDF(31608), AOM_ICDF(31973), - AOM_ICDF(32378), AOM_ICDF(32768), }, - {AOM_ICDF(22721), AOM_ICDF(23397), AOM_ICDF(30476), AOM_ICDF(31293), - AOM_ICDF(32179), AOM_ICDF(32768), }, - {AOM_ICDF(16404), AOM_ICDF(18013), AOM_ICDF(27505), AOM_ICDF(29454), - AOM_ICDF(31300), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(14290), AOM_ICDF(27662), AOM_ICDF(31923), AOM_ICDF(32327), - AOM_ICDF(32768), }, - {AOM_ICDF(13282), AOM_ICDF(26727), AOM_ICDF(31749), AOM_ICDF(32113), - AOM_ICDF(32768), }, - {AOM_ICDF(12514), AOM_ICDF(22487), AOM_ICDF(30689), AOM_ICDF(31459), - AOM_ICDF(32768), }, - {AOM_ICDF(11657), AOM_ICDF(16967), AOM_ICDF(29660), AOM_ICDF(30437), - AOM_ICDF(32768), }, - {AOM_ICDF(8937), AOM_ICDF(12660), AOM_ICDF(24576), AOM_ICDF(26810), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20145), AOM_ICDF(28026), AOM_ICDF(31820), AOM_ICDF(32212), - AOM_ICDF(32768), }, - {AOM_ICDF(16906), AOM_ICDF(25677), AOM_ICDF(31760), AOM_ICDF(32059), - AOM_ICDF(32768), }, - {AOM_ICDF(12332), AOM_ICDF(18322), AOM_ICDF(29597), AOM_ICDF(31006), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(13107), AOM_ICDF(21299), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - 
{AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(23492), AOM_ICDF(29214), AOM_ICDF(32166), AOM_ICDF(32467), - AOM_ICDF(32768), }, - {AOM_ICDF(18757), AOM_ICDF(25536), AOM_ICDF(31789), AOM_ICDF(32165), - AOM_ICDF(32768), }, - {AOM_ICDF(12603), AOM_ICDF(16384), AOM_ICDF(25206), AOM_ICDF(28987), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(24518), AOM_ICDF(29453), AOM_ICDF(32074), AOM_ICDF(32382), - AOM_ICDF(32768), }, - {AOM_ICDF(19369), AOM_ICDF(26533), AOM_ICDF(31972), AOM_ICDF(32370), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(24576), AOM_ICDF(28789), AOM_ICDF(31364), AOM_ICDF(32066), - AOM_ICDF(32768), }, - {AOM_ICDF(20052), AOM_ICDF(24454), AOM_ICDF(29834), AOM_ICDF(31301), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(30358), AOM_ICDF(30700), AOM_ICDF(31747), AOM_ICDF(32103), - AOM_ICDF(32430), AOM_ICDF(32768), }, - {AOM_ICDF(22346), AOM_ICDF(23277), 
AOM_ICDF(30508), AOM_ICDF(31386), - AOM_ICDF(32138), AOM_ICDF(32768), }, - {AOM_ICDF(11974), AOM_ICDF(14562), AOM_ICDF(27349), AOM_ICDF(28970), - AOM_ICDF(31969), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(22910), AOM_ICDF(29539), AOM_ICDF(32102), AOM_ICDF(32412), - AOM_ICDF(32768), }, - {AOM_ICDF(18429), AOM_ICDF(28710), AOM_ICDF(32106), AOM_ICDF(32432), - AOM_ICDF(32768), }, - {AOM_ICDF(13601), AOM_ICDF(25238), AOM_ICDF(31845), AOM_ICDF(32262), - AOM_ICDF(32768), }, - {AOM_ICDF(12472), AOM_ICDF(20976), AOM_ICDF(29026), AOM_ICDF(30500), - AOM_ICDF(32768), }, - {AOM_ICDF(8738), AOM_ICDF(11469), AOM_ICDF(24030), AOM_ICDF(26761), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23359), AOM_ICDF(30038), AOM_ICDF(32127), AOM_ICDF(32444), - AOM_ICDF(32768), }, - {AOM_ICDF(19590), AOM_ICDF(28108), AOM_ICDF(32056), AOM_ICDF(32382), - AOM_ICDF(32768), }, - {AOM_ICDF(15578), AOM_ICDF(22024), AOM_ICDF(29008), AOM_ICDF(30619), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26372), AOM_ICDF(31019), AOM_ICDF(32146), AOM_ICDF(32463), - AOM_ICDF(32768), }, - {AOM_ICDF(22190), AOM_ICDF(28573), AOM_ICDF(32160), AOM_ICDF(32464), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(22938), AOM_ICDF(27853), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26672), AOM_ICDF(31311), AOM_ICDF(32156), AOM_ICDF(32462), - 
AOM_ICDF(32768), }, - {AOM_ICDF(20946), AOM_ICDF(27885), AOM_ICDF(31997), AOM_ICDF(32382), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(27342), AOM_ICDF(31385), AOM_ICDF(32130), AOM_ICDF(32449), - AOM_ICDF(32768), }, - {AOM_ICDF(8674), AOM_ICDF(22167), AOM_ICDF(26985), AOM_ICDF(29877), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 16X16 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(2479), AOM_ICDF(4993), AOM_ICDF(17332), AOM_ICDF(21885), - AOM_ICDF(25826), AOM_ICDF(32768), }, - {AOM_ICDF(2848), AOM_ICDF(5996), AOM_ICDF(15242), AOM_ICDF(20755), - AOM_ICDF(23763), AOM_ICDF(32768), }, - {AOM_ICDF(2125), AOM_ICDF(6226), AOM_ICDF(11733), AOM_ICDF(18389), - AOM_ICDF(20442), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(14539), AOM_ICDF(19828), AOM_ICDF(29467), AOM_ICDF(29934), - AOM_ICDF(32768), }, - {AOM_ICDF(12513), AOM_ICDF(19139), AOM_ICDF(29177), AOM_ICDF(29702), - AOM_ICDF(32768), }, - {AOM_ICDF(11826), AOM_ICDF(16348), AOM_ICDF(27245), AOM_ICDF(27977), - AOM_ICDF(32768), }, - {AOM_ICDF(10123), AOM_ICDF(12262), AOM_ICDF(24690), AOM_ICDF(25359), - AOM_ICDF(32768), }, - {AOM_ICDF(7979), AOM_ICDF(8826), AOM_ICDF(20804), AOM_ICDF(21295), - AOM_ICDF(32768), }, - {AOM_ICDF(5262), AOM_ICDF(5604), AOM_ICDF(14716), 
AOM_ICDF(15015), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(20625), AOM_ICDF(24118), AOM_ICDF(31086), AOM_ICDF(31446), - AOM_ICDF(32768), }, - {AOM_ICDF(16710), AOM_ICDF(20899), AOM_ICDF(30505), AOM_ICDF(30864), - AOM_ICDF(32768), }, - {AOM_ICDF(13161), AOM_ICDF(15579), AOM_ICDF(27988), AOM_ICDF(28449), - AOM_ICDF(32768), }, - {AOM_ICDF(10596), AOM_ICDF(11651), AOM_ICDF(24124), AOM_ICDF(24589), - AOM_ICDF(32768), }, - {AOM_ICDF(7724), AOM_ICDF(8452), AOM_ICDF(21060), AOM_ICDF(21476), - AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(9466), AOM_ICDF(18933), AOM_ICDF(21117), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24265), AOM_ICDF(26472), AOM_ICDF(31667), AOM_ICDF(31998), - AOM_ICDF(32768), }, - {AOM_ICDF(18213), AOM_ICDF(21117), AOM_ICDF(30932), AOM_ICDF(31280), - AOM_ICDF(32768), }, - {AOM_ICDF(12944), AOM_ICDF(14000), AOM_ICDF(27696), AOM_ICDF(28050), - AOM_ICDF(32768), }, - {AOM_ICDF(9709), AOM_ICDF(10056), AOM_ICDF(23282), AOM_ICDF(23579), - AOM_ICDF(32768), }, - {AOM_ICDF(8590), AOM_ICDF(9862), AOM_ICDF(18770), AOM_ICDF(19724), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26658), AOM_ICDF(28275), AOM_ICDF(31975), AOM_ICDF(32294), - AOM_ICDF(32768), }, - {AOM_ICDF(20049), AOM_ICDF(22203), AOM_ICDF(31374), AOM_ICDF(31708), - AOM_ICDF(32768), }, - {AOM_ICDF(12795), AOM_ICDF(13387), AOM_ICDF(28328), AOM_ICDF(28653), - AOM_ICDF(32768), }, - {AOM_ICDF(8607), AOM_ICDF(9073), AOM_ICDF(23383), AOM_ICDF(23695), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(9947), AOM_ICDF(18725), AOM_ICDF(20480), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(28651), AOM_ICDF(29902), AOM_ICDF(32085), AOM_ICDF(32402), - AOM_ICDF(32768), }, - {AOM_ICDF(21133), AOM_ICDF(23229), AOM_ICDF(31684), AOM_ICDF(32013), - AOM_ICDF(32768), }, - {AOM_ICDF(13231), AOM_ICDF(14045), 
AOM_ICDF(28203), AOM_ICDF(28576), - AOM_ICDF(32768), }, - {AOM_ICDF(7903), AOM_ICDF(8481), AOM_ICDF(21781), AOM_ICDF(22359), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(824), AOM_ICDF(8672), AOM_ICDF(16514), AOM_ICDF(27587), - AOM_ICDF(29231), AOM_ICDF(32768), }, - {AOM_ICDF(1118), AOM_ICDF(9561), AOM_ICDF(17021), AOM_ICDF(25911), - AOM_ICDF(27753), AOM_ICDF(32768), }, - {AOM_ICDF(806), AOM_ICDF(9313), AOM_ICDF(13998), AOM_ICDF(22910), - AOM_ICDF(25224), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(23650), AOM_ICDF(26487), AOM_ICDF(31840), AOM_ICDF(32166), - AOM_ICDF(32768), }, - {AOM_ICDF(19593), AOM_ICDF(25206), AOM_ICDF(31604), AOM_ICDF(31944), - AOM_ICDF(32768), }, - {AOM_ICDF(15813), AOM_ICDF(19643), AOM_ICDF(30328), AOM_ICDF(30726), - AOM_ICDF(32768), }, - {AOM_ICDF(12978), AOM_ICDF(15108), AOM_ICDF(27886), AOM_ICDF(28310), - AOM_ICDF(32768), }, - {AOM_ICDF(9793), AOM_ICDF(11020), AOM_ICDF(23305), AOM_ICDF(23818), - AOM_ICDF(32768), }, - {AOM_ICDF(4855), AOM_ICDF(5565), AOM_ICDF(14268), AOM_ICDF(14741), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(24547), AOM_ICDF(27751), AOM_ICDF(31964), AOM_ICDF(32285), - AOM_ICDF(32768), }, - {AOM_ICDF(19674), AOM_ICDF(23377), AOM_ICDF(31426), AOM_ICDF(31759), - AOM_ICDF(32768), }, - {AOM_ICDF(12643), AOM_ICDF(14489), AOM_ICDF(28159), AOM_ICDF(28541), - AOM_ICDF(32768), }, - {AOM_ICDF(9110), AOM_ICDF(10279), AOM_ICDF(23565), AOM_ICDF(23992), - AOM_ICDF(32768), }, - {AOM_ICDF(5082), AOM_ICDF(5617), AOM_ICDF(16317), AOM_ICDF(16651), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(18971), AOM_ICDF(24145), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26773), AOM_ICDF(29038), AOM_ICDF(32050), AOM_ICDF(32367), - AOM_ICDF(32768), }, - {AOM_ICDF(20956), AOM_ICDF(23898), 
AOM_ICDF(31563), AOM_ICDF(31888), - AOM_ICDF(32768), }, - {AOM_ICDF(12527), AOM_ICDF(13472), AOM_ICDF(27840), AOM_ICDF(28211), - AOM_ICDF(32768), }, - {AOM_ICDF(8773), AOM_ICDF(9353), AOM_ICDF(22555), AOM_ICDF(22856), - AOM_ICDF(32768), }, - {AOM_ICDF(4291), AOM_ICDF(4876), AOM_ICDF(16969), AOM_ICDF(17554), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(17348), AOM_ICDF(23130), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28065), AOM_ICDF(29768), AOM_ICDF(32086), AOM_ICDF(32400), - AOM_ICDF(32768), }, - {AOM_ICDF(21847), AOM_ICDF(24001), AOM_ICDF(31608), AOM_ICDF(31929), - AOM_ICDF(32768), }, - {AOM_ICDF(12482), AOM_ICDF(13091), AOM_ICDF(27413), AOM_ICDF(27739), - AOM_ICDF(32768), }, - {AOM_ICDF(7582), AOM_ICDF(8002), AOM_ICDF(22090), AOM_ICDF(22405), - AOM_ICDF(32768), }, - {AOM_ICDF(6324), AOM_ICDF(7186), AOM_ICDF(15809), AOM_ICDF(16671), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29731), AOM_ICDF(30798), AOM_ICDF(32113), AOM_ICDF(32431), - AOM_ICDF(32768), }, - {AOM_ICDF(22224), AOM_ICDF(24448), AOM_ICDF(31791), AOM_ICDF(32118), - AOM_ICDF(32768), }, - {AOM_ICDF(12622), AOM_ICDF(13513), AOM_ICDF(28103), AOM_ICDF(28530), - AOM_ICDF(32768), }, - {AOM_ICDF(8886), AOM_ICDF(9600), AOM_ICDF(22890), AOM_ICDF(23604), - AOM_ICDF(32768), }, - {AOM_ICDF(8058), AOM_ICDF(9669), AOM_ICDF(18264), AOM_ICDF(19876), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(27375), AOM_ICDF(27731), AOM_ICDF(31591), AOM_ICDF(31993), - AOM_ICDF(32404), AOM_ICDF(32768), }, - {AOM_ICDF(20943), AOM_ICDF(21758), AOM_ICDF(30037), AOM_ICDF(31074), - AOM_ICDF(32003), AOM_ICDF(32768), }, - {AOM_ICDF(16218), AOM_ICDF(17771), AOM_ICDF(26832), AOM_ICDF(29181), - AOM_ICDF(30586), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17239), 
AOM_ICDF(27853), AOM_ICDF(31557), AOM_ICDF(32198), - AOM_ICDF(32768), }, - {AOM_ICDF(14494), AOM_ICDF(25906), AOM_ICDF(31543), AOM_ICDF(32033), - AOM_ICDF(32768), }, - {AOM_ICDF(12980), AOM_ICDF(19788), AOM_ICDF(29137), AOM_ICDF(29410), - AOM_ICDF(32768), }, - {AOM_ICDF(11796), AOM_ICDF(14680), AOM_ICDF(26477), AOM_ICDF(27787), - AOM_ICDF(32768), }, - {AOM_ICDF(12603), AOM_ICDF(15124), AOM_ICDF(21005), AOM_ICDF(23526), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(22821), AOM_ICDF(27655), AOM_ICDF(32024), AOM_ICDF(32303), - AOM_ICDF(32768), }, - {AOM_ICDF(16534), AOM_ICDF(23629), AOM_ICDF(31145), AOM_ICDF(31686), - AOM_ICDF(32768), }, - {AOM_ICDF(12407), AOM_ICDF(14952), AOM_ICDF(28950), AOM_ICDF(30859), - AOM_ICDF(32768), }, - {AOM_ICDF(6554), AOM_ICDF(10486), AOM_ICDF(19661), AOM_ICDF(23593), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(26369), AOM_ICDF(29624), AOM_ICDF(31996), AOM_ICDF(32272), - AOM_ICDF(32768), }, - {AOM_ICDF(19346), AOM_ICDF(24807), AOM_ICDF(31750), AOM_ICDF(32027), - AOM_ICDF(32768), }, - {AOM_ICDF(15056), AOM_ICDF(19484), AOM_ICDF(27454), AOM_ICDF(30111), - AOM_ICDF(32768), }, - {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28213), AOM_ICDF(30301), AOM_ICDF(32199), AOM_ICDF(32483), - AOM_ICDF(32768), }, - {AOM_ICDF(22988), AOM_ICDF(27307), AOM_ICDF(31879), AOM_ICDF(32260), - AOM_ICDF(32768), }, - {AOM_ICDF(11796), AOM_ICDF(15729), AOM_ICDF(24904), AOM_ICDF(28836), - AOM_ICDF(32768), }, - 
{AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29813), AOM_ICDF(31323), AOM_ICDF(32142), AOM_ICDF(32444), - AOM_ICDF(32768), }, - {AOM_ICDF(21497), AOM_ICDF(25254), AOM_ICDF(31307), AOM_ICDF(32142), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(30560), AOM_ICDF(30889), AOM_ICDF(31795), AOM_ICDF(32128), - AOM_ICDF(32455), AOM_ICDF(32768), }, - {AOM_ICDF(20347), AOM_ICDF(20993), AOM_ICDF(30496), AOM_ICDF(31112), - AOM_ICDF(32263), AOM_ICDF(32768), }, - {AOM_ICDF(9723), AOM_ICDF(10992), AOM_ICDF(27830), AOM_ICDF(28681), - AOM_ICDF(32168), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(25900), AOM_ICDF(30610), AOM_ICDF(32179), AOM_ICDF(32474), - AOM_ICDF(32768), }, - {AOM_ICDF(18535), AOM_ICDF(29316), AOM_ICDF(32153), AOM_ICDF(32437), - AOM_ICDF(32768), }, - {AOM_ICDF(15230), AOM_ICDF(25845), AOM_ICDF(30922), AOM_ICDF(31845), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(27097), AOM_ICDF(28987), - AOM_ICDF(32768), }, - {AOM_ICDF(8548), AOM_ICDF(12822), AOM_ICDF(21370), AOM_ICDF(25645), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(26104), AOM_ICDF(30659), AOM_ICDF(32157), AOM_ICDF(32462), - AOM_ICDF(32768), }, - {AOM_ICDF(20457), AOM_ICDF(28242), AOM_ICDF(31682), AOM_ICDF(32225), - AOM_ICDF(32768), }, - 
{AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(28740), AOM_ICDF(30618), AOM_ICDF(32154), AOM_ICDF(32461), - AOM_ICDF(32768), }, - {AOM_ICDF(19333), AOM_ICDF(26214), AOM_ICDF(30802), AOM_ICDF(31785), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28161), AOM_ICDF(30834), AOM_ICDF(32160), AOM_ICDF(32464), - AOM_ICDF(32768), }, - {AOM_ICDF(26536), AOM_ICDF(29149), AOM_ICDF(31562), AOM_ICDF(32165), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(29913), AOM_ICDF(31560), AOM_ICDF(32172), AOM_ICDF(32470), - AOM_ICDF(32768), }, - {AOM_ICDF(22209), AOM_ICDF(28035), AOM_ICDF(30583), AOM_ICDF(31676), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - 
{AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, - { // TX 32X32 - { // Y plane - { // Intra - { // Band 0 - {AOM_ICDF(3982), AOM_ICDF(6433), AOM_ICDF(20418), AOM_ICDF(25151), - AOM_ICDF(27471), AOM_ICDF(32768), }, - {AOM_ICDF(3342), AOM_ICDF(6943), AOM_ICDF(15018), AOM_ICDF(20274), - AOM_ICDF(22412), AOM_ICDF(32768), }, - {AOM_ICDF(1805), AOM_ICDF(5863), AOM_ICDF(9932), AOM_ICDF(16426), - AOM_ICDF(17655), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(11799), AOM_ICDF(19138), AOM_ICDF(28295), AOM_ICDF(28881), - AOM_ICDF(32768), }, - {AOM_ICDF(11008), AOM_ICDF(18597), AOM_ICDF(28369), AOM_ICDF(29021), - AOM_ICDF(32768), }, - {AOM_ICDF(10104), AOM_ICDF(15628), AOM_ICDF(26339), AOM_ICDF(27195), - AOM_ICDF(32768), }, - {AOM_ICDF(8537), AOM_ICDF(11246), AOM_ICDF(22663), AOM_ICDF(23623), - AOM_ICDF(32768), }, - {AOM_ICDF(5895), AOM_ICDF(6476), AOM_ICDF(16647), AOM_ICDF(17329), - AOM_ICDF(32768), }, - {AOM_ICDF(4046), AOM_ICDF(4357), AOM_ICDF(10849), AOM_ICDF(11160), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(18503), AOM_ICDF(22222), AOM_ICDF(30403), AOM_ICDF(30814), - AOM_ICDF(32768), }, - {AOM_ICDF(15264), AOM_ICDF(19282), AOM_ICDF(29949), AOM_ICDF(30339), - AOM_ICDF(32768), }, - {AOM_ICDF(12101), AOM_ICDF(14721), AOM_ICDF(27350), AOM_ICDF(27783), - AOM_ICDF(32768), }, - {AOM_ICDF(9243), AOM_ICDF(10177), AOM_ICDF(22679), AOM_ICDF(23097), - AOM_ICDF(32768), }, - {AOM_ICDF(5571), AOM_ICDF(5967), AOM_ICDF(16714), AOM_ICDF(17043), - AOM_ICDF(32768), }, - {AOM_ICDF(2731), AOM_ICDF(3755), AOM_ICDF(14677), AOM_ICDF(15701), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(23077), AOM_ICDF(25272), AOM_ICDF(31444), AOM_ICDF(31771), - AOM_ICDF(32768), }, - {AOM_ICDF(16598), AOM_ICDF(19790), AOM_ICDF(30479), AOM_ICDF(30822), - AOM_ICDF(32768), }, - {AOM_ICDF(11961), AOM_ICDF(12871), AOM_ICDF(27162), AOM_ICDF(27529), - AOM_ICDF(32768), }, - {AOM_ICDF(8156), AOM_ICDF(8563), AOM_ICDF(22220), 
AOM_ICDF(22579), - AOM_ICDF(32768), }, - {AOM_ICDF(5851), AOM_ICDF(6242), AOM_ICDF(15994), AOM_ICDF(16384), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(26084), AOM_ICDF(27933), AOM_ICDF(31906), AOM_ICDF(32223), - AOM_ICDF(32768), }, - {AOM_ICDF(19335), AOM_ICDF(21760), AOM_ICDF(31149), AOM_ICDF(31477), - AOM_ICDF(32768), }, - {AOM_ICDF(12724), AOM_ICDF(13278), AOM_ICDF(27015), AOM_ICDF(27365), - AOM_ICDF(32768), }, - {AOM_ICDF(8687), AOM_ICDF(9010), AOM_ICDF(21051), AOM_ICDF(21334), - AOM_ICDF(32768), }, - {AOM_ICDF(5814), AOM_ICDF(6606), AOM_ICDF(14534), AOM_ICDF(15327), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30147), AOM_ICDF(30787), AOM_ICDF(32081), AOM_ICDF(32395), - AOM_ICDF(32768), }, - {AOM_ICDF(20402), AOM_ICDF(21697), AOM_ICDF(30943), AOM_ICDF(31266), - AOM_ICDF(32768), }, - {AOM_ICDF(11661), AOM_ICDF(12125), AOM_ICDF(25710), AOM_ICDF(26034), - AOM_ICDF(32768), }, - {AOM_ICDF(7224), AOM_ICDF(7504), AOM_ICDF(19876), AOM_ICDF(20156), - AOM_ICDF(32768), }, - {AOM_ICDF(6183), AOM_ICDF(7110), AOM_ICDF(17002), AOM_ICDF(17930), - AOM_ICDF(32768), }, - {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(4079), AOM_ICDF(8378), AOM_ICDF(25109), AOM_ICDF(29897), - AOM_ICDF(30898), AOM_ICDF(32768), }, - {AOM_ICDF(3870), AOM_ICDF(8207), AOM_ICDF(22495), AOM_ICDF(27162), - AOM_ICDF(29559), AOM_ICDF(32768), }, - {AOM_ICDF(2127), AOM_ICDF(6197), AOM_ICDF(15932), AOM_ICDF(20604), - AOM_ICDF(27312), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(21253), AOM_ICDF(26168), AOM_ICDF(31780), AOM_ICDF(32120), - AOM_ICDF(32768), }, - {AOM_ICDF(16610), AOM_ICDF(23985), AOM_ICDF(31495), AOM_ICDF(31866), - AOM_ICDF(32768), }, - {AOM_ICDF(14861), AOM_ICDF(21030), AOM_ICDF(30219), 
AOM_ICDF(30784), - AOM_ICDF(32768), }, - {AOM_ICDF(14573), AOM_ICDF(18162), AOM_ICDF(28524), AOM_ICDF(29116), - AOM_ICDF(32768), }, - {AOM_ICDF(14036), AOM_ICDF(15983), AOM_ICDF(26283), AOM_ICDF(27085), - AOM_ICDF(32768), }, - {AOM_ICDF(9119), AOM_ICDF(10742), AOM_ICDF(19630), AOM_ICDF(20016), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23192), AOM_ICDF(27248), AOM_ICDF(31887), AOM_ICDF(32215), - AOM_ICDF(32768), }, - {AOM_ICDF(18219), AOM_ICDF(23213), AOM_ICDF(31417), AOM_ICDF(31769), - AOM_ICDF(32768), }, - {AOM_ICDF(12657), AOM_ICDF(14754), AOM_ICDF(27845), AOM_ICDF(28233), - AOM_ICDF(32768), }, - {AOM_ICDF(8127), AOM_ICDF(8829), AOM_ICDF(20909), AOM_ICDF(21279), - AOM_ICDF(32768), }, - {AOM_ICDF(7547), AOM_ICDF(8142), AOM_ICDF(17476), AOM_ICDF(18072), - AOM_ICDF(32768), }, - {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(21845), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(25516), AOM_ICDF(28301), AOM_ICDF(31970), AOM_ICDF(32289), - AOM_ICDF(32768), }, - {AOM_ICDF(19094), AOM_ICDF(23041), AOM_ICDF(31404), AOM_ICDF(31732), - AOM_ICDF(32768), }, - {AOM_ICDF(12328), AOM_ICDF(13099), AOM_ICDF(27275), AOM_ICDF(27613), - AOM_ICDF(32768), }, - {AOM_ICDF(8134), AOM_ICDF(8458), AOM_ICDF(21075), AOM_ICDF(21352), - AOM_ICDF(32768), }, - {AOM_ICDF(5041), AOM_ICDF(5881), AOM_ICDF(17644), AOM_ICDF(18485), - AOM_ICDF(32768), }, - {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(18204), AOM_ICDF(23666), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28082), AOM_ICDF(29782), AOM_ICDF(32087), AOM_ICDF(32400), - AOM_ICDF(32768), }, - {AOM_ICDF(21281), AOM_ICDF(24161), AOM_ICDF(31679), AOM_ICDF(31997), - AOM_ICDF(32768), }, - {AOM_ICDF(12144), AOM_ICDF(12913), AOM_ICDF(27139), AOM_ICDF(27460), - AOM_ICDF(32768), }, - {AOM_ICDF(8232), AOM_ICDF(8472), AOM_ICDF(21659), AOM_ICDF(21979), - AOM_ICDF(32768), }, - {AOM_ICDF(3034), AOM_ICDF(4855), AOM_ICDF(17598), AOM_ICDF(19418), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), 
AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30193), AOM_ICDF(31021), AOM_ICDF(32122), AOM_ICDF(32435), - AOM_ICDF(32768), }, - {AOM_ICDF(22124), AOM_ICDF(23763), AOM_ICDF(31498), AOM_ICDF(31816), - AOM_ICDF(32768), }, - {AOM_ICDF(12066), AOM_ICDF(12418), AOM_ICDF(26849), AOM_ICDF(27157), - AOM_ICDF(32768), }, - {AOM_ICDF(8701), AOM_ICDF(8979), AOM_ICDF(20920), AOM_ICDF(21197), - AOM_ICDF(32768), }, - {AOM_ICDF(5266), AOM_ICDF(7022), AOM_ICDF(15799), AOM_ICDF(17554), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - { // UV plane - { // Intra - { // Band 0 - {AOM_ICDF(23468), AOM_ICDF(24062), AOM_ICDF(30645), AOM_ICDF(31200), - AOM_ICDF(32193), AOM_ICDF(32768), }, - {AOM_ICDF(12642), AOM_ICDF(14371), AOM_ICDF(26924), AOM_ICDF(28832), - AOM_ICDF(31098), AOM_ICDF(32768), }, - {AOM_ICDF(7785), AOM_ICDF(8831), AOM_ICDF(23705), AOM_ICDF(26028), - AOM_ICDF(29979), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(13575), AOM_ICDF(28087), AOM_ICDF(31130), AOM_ICDF(31832), - AOM_ICDF(32768), }, - {AOM_ICDF(11108), AOM_ICDF(27955), AOM_ICDF(31657), AOM_ICDF(32213), - AOM_ICDF(32768), }, - {AOM_ICDF(9797), AOM_ICDF(23985), AOM_ICDF(28039), AOM_ICDF(30741), - AOM_ICDF(32768), }, - {AOM_ICDF(5578), AOM_ICDF(18824), AOM_ICDF(26493), AOM_ICDF(28585), - AOM_ICDF(32768), }, - {AOM_ICDF(5041), AOM_ICDF(12603), AOM_ICDF(18905), AOM_ICDF(22686), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(17613), AOM_ICDF(26624), AOM_ICDF(30310), AOM_ICDF(31539), - AOM_ICDF(32768), }, - {AOM_ICDF(11398), AOM_ICDF(22795), AOM_ICDF(29444), AOM_ICDF(30868), - AOM_ICDF(32768), }, - {AOM_ICDF(8548), AOM_ICDF(15672), AOM_ICDF(22795), AOM_ICDF(28494), - AOM_ICDF(32768), }, - {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), 
AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24145), AOM_ICDF(26301), AOM_ICDF(30181), AOM_ICDF(31475), - AOM_ICDF(32768), }, - {AOM_ICDF(15565), AOM_ICDF(20480), AOM_ICDF(27853), AOM_ICDF(30310), - AOM_ICDF(32768), }, - {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(27434), AOM_ICDF(28450), AOM_ICDF(30990), AOM_ICDF(31752), - AOM_ICDF(32768), }, - {AOM_ICDF(14947), AOM_ICDF(21845), AOM_ICDF(29319), AOM_ICDF(31043), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(31130), AOM_ICDF(31676), AOM_ICDF(32180), AOM_ICDF(32474), - AOM_ICDF(32768), }, - {AOM_ICDF(18289), AOM_ICDF(22099), AOM_ICDF(28196), AOM_ICDF(30482), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - { // Inter - { // Band 0 - {AOM_ICDF(29436), AOM_ICDF(29775), AOM_ICDF(31685), AOM_ICDF(32029), - AOM_ICDF(32425), 
AOM_ICDF(32768), }, - {AOM_ICDF(10536), AOM_ICDF(11074), AOM_ICDF(27753), AOM_ICDF(28385), - AOM_ICDF(31293), AOM_ICDF(32768), }, - {AOM_ICDF(3010), AOM_ICDF(3521), AOM_ICDF(22603), AOM_ICDF(23227), - AOM_ICDF(30440), AOM_ICDF(32768), }, - }, - { // Band 1 - {AOM_ICDF(17576), AOM_ICDF(29491), AOM_ICDF(30981), AOM_ICDF(31874), - AOM_ICDF(32768), }, - {AOM_ICDF(10426), AOM_ICDF(29044), AOM_ICDF(31725), AOM_ICDF(32321), - AOM_ICDF(32768), }, - {AOM_ICDF(15766), AOM_ICDF(28286), AOM_ICDF(31377), AOM_ICDF(32304), - AOM_ICDF(32768), }, - {AOM_ICDF(19661), AOM_ICDF(26985), AOM_ICDF(30069), AOM_ICDF(31611), - AOM_ICDF(32768), }, - {AOM_ICDF(16035), AOM_ICDF(23007), AOM_ICDF(28585), AOM_ICDF(30676), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 2 - {AOM_ICDF(23073), AOM_ICDF(30053), AOM_ICDF(31605), AOM_ICDF(32186), - AOM_ICDF(32768), }, - {AOM_ICDF(12858), AOM_ICDF(24887), AOM_ICDF(30279), AOM_ICDF(31524), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 3 - {AOM_ICDF(24030), AOM_ICDF(26839), AOM_ICDF(30896), AOM_ICDF(31832), - AOM_ICDF(32768), }, - {AOM_ICDF(17644), AOM_ICDF(23526), AOM_ICDF(27727), AOM_ICDF(30247), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 4 - {AOM_ICDF(28019), 
AOM_ICDF(30156), AOM_ICDF(31343), AOM_ICDF(32056), - AOM_ICDF(32768), }, - {AOM_ICDF(14980), AOM_ICDF(22469), AOM_ICDF(27151), AOM_ICDF(29959), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - { // Band 5 - {AOM_ICDF(30549), AOM_ICDF(31511), AOM_ICDF(32176), AOM_ICDF(32472), - AOM_ICDF(32768), }, - {AOM_ICDF(15019), AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214), - AOM_ICDF(32768), }, - }, - }, - }, - }, -}; -/* clang-format on */ +static const aom_cdf_prob av1_default_coeff_base_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(NUM_BASE_LEVELS + 2)] = + { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, + { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) 
}, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) }, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 
28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { 
AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) }, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 
32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + { AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { 
AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { 
AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 
29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 
32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { 
AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 
7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + { AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 
32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { 
AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 
32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { 
AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 
12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { 
AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + { AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) 
}, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { 
AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) }, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 
11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + 
{ AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) 
}, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 
32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; -typedef coeff_cdf_model coeff_cdf_table[TX_SIZES][PLANE_TYPES]; -static const coeff_cdf_table *av1_default_qctx_coef_cdfs[TOKEN_CDF_Q_CTXS] = { - &av1_default_coef_head_cdfs_q0, &av1_default_coef_head_cdfs_q1, - &av1_default_coef_head_cdfs_q2, &av1_default_coef_head_cdfs_q3, -}; +static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( + NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, + { AOM_CDF3(29600, 31446) }, + { AOM_CDF3(30844, 31878) }, + { AOM_CDF3(24926, 28948) } }, + { { AOM_CDF3(21365, 30026) }, + { AOM_CDF3(30512, 32423) }, + { AOM_CDF3(31658, 32621) }, + { AOM_CDF3(29630, 31881) } } }, + { { { AOM_CDF3(5717, 26477) }, + { AOM_CDF3(30491, 31703) }, + { AOM_CDF3(31550, 32158) }, + { AOM_CDF3(29648, 31491) } }, + { { AOM_CDF3(12608, 27820) }, + { AOM_CDF3(30680, 32225) }, + { AOM_CDF3(30809, 32335) }, + { AOM_CDF3(31299, 32423) } } }, + { { { AOM_CDF3(1786, 12612) }, + { AOM_CDF3(30663, 31625) }, + { AOM_CDF3(32339, 32468) }, + { AOM_CDF3(31148, 31833) } }, + { { AOM_CDF3(18857, 23865) }, + { AOM_CDF3(31428, 32428) }, + { AOM_CDF3(31744, 32373) }, + { AOM_CDF3(31775, 32526) } } }, + { { { AOM_CDF3(1787, 2532) }, + { AOM_CDF3(30832, 31662) }, + { AOM_CDF3(31824, 32682) }, + { AOM_CDF3(32133, 32569) } }, + { { AOM_CDF3(13751, 22235) }, + { 
AOM_CDF3(32089, 32409) }, + { AOM_CDF3(27084, 27920) }, + { AOM_CDF3(29291, 32594) } } }, + { { { AOM_CDF3(1725, 3449) }, + { AOM_CDF3(31102, 31935) }, + { AOM_CDF3(32457, 32613) }, + { AOM_CDF3(32412, 32649) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(17560, 29888) }, + { AOM_CDF3(29671, 31549) }, + { AOM_CDF3(31007, 32056) }, + { AOM_CDF3(27286, 30006) } }, + { { AOM_CDF3(26594, 31212) }, + { AOM_CDF3(31208, 32582) }, + { AOM_CDF3(31835, 32637) }, + { AOM_CDF3(30595, 32206) } } }, + { { { AOM_CDF3(15239, 29932) }, + { AOM_CDF3(31315, 32095) }, + { AOM_CDF3(32130, 32434) }, + { AOM_CDF3(30864, 31996) } }, + { { AOM_CDF3(26279, 30968) }, + { AOM_CDF3(31142, 32495) }, + { AOM_CDF3(31713, 32540) }, + { AOM_CDF3(31929, 32594) } } }, + { { { AOM_CDF3(2644, 25198) }, + { AOM_CDF3(32038, 32451) }, + { AOM_CDF3(32639, 32695) }, + { AOM_CDF3(32166, 32518) } }, + { { AOM_CDF3(17187, 27668) }, + { AOM_CDF3(31714, 32550) }, + { AOM_CDF3(32283, 32678) }, + { AOM_CDF3(31930, 32563) } } }, + { { { AOM_CDF3(1044, 2257) }, + { AOM_CDF3(30755, 31923) }, + { AOM_CDF3(32208, 32693) }, + { AOM_CDF3(32244, 32615) } }, + { { AOM_CDF3(21317, 26207) }, + { AOM_CDF3(29133, 30868) }, + { AOM_CDF3(29311, 31231) }, + { AOM_CDF3(29657, 31087) } } }, + { { { AOM_CDF3(478, 1834) }, + { AOM_CDF3(31005, 31987) }, + { AOM_CDF3(32317, 32724) }, + { AOM_CDF3(30865, 32648) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(20092, 30774) }, + { AOM_CDF3(30695, 32020) }, + { AOM_CDF3(31131, 32103) }, + { AOM_CDF3(28666, 30870) } }, + { { AOM_CDF3(27258, 31095) }, + { AOM_CDF3(31804, 32623) }, + { AOM_CDF3(31763, 32528) }, + { AOM_CDF3(31438, 32506) } } }, + { { { AOM_CDF3(18049, 30489) }, + { AOM_CDF3(31706, 32286) }, + { AOM_CDF3(32163, 32473) }, + { AOM_CDF3(31550, 32184) } }, + { { 
AOM_CDF3(27116, 30842) }, + { AOM_CDF3(31971, 32598) }, + { AOM_CDF3(32088, 32576) }, + { AOM_CDF3(32067, 32664) } } }, + { { { AOM_CDF3(12854, 29093) }, + { AOM_CDF3(32272, 32558) }, + { AOM_CDF3(32667, 32729) }, + { AOM_CDF3(32306, 32585) } }, + { { AOM_CDF3(25476, 30366) }, + { AOM_CDF3(32169, 32687) }, + { AOM_CDF3(32479, 32689) }, + { AOM_CDF3(31673, 32634) } } }, + { { { AOM_CDF3(2809, 19301) }, + { AOM_CDF3(32205, 32622) }, + { AOM_CDF3(32338, 32730) }, + { AOM_CDF3(31786, 32616) } }, + { { AOM_CDF3(22737, 29105) }, + { AOM_CDF3(30810, 32362) }, + { AOM_CDF3(30014, 32627) }, + { AOM_CDF3(30528, 32574) } } }, + { { { AOM_CDF3(935, 3382) }, + { AOM_CDF3(30789, 31909) }, + { AOM_CDF3(32466, 32756) }, + { AOM_CDF3(30860, 32513) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(22497, 31198) }, + { AOM_CDF3(31715, 32495) }, + { AOM_CDF3(31606, 32337) }, + { AOM_CDF3(30388, 31990) } }, + { { AOM_CDF3(27877, 31584) }, + { AOM_CDF3(32170, 32728) }, + { AOM_CDF3(32155, 32688) }, + { AOM_CDF3(32219, 32702) } } }, + { { { AOM_CDF3(21457, 31043) }, + { AOM_CDF3(31951, 32483) }, + { AOM_CDF3(32153, 32562) }, + { AOM_CDF3(31473, 32215) } }, + { { AOM_CDF3(27558, 31151) }, + { AOM_CDF3(32020, 32640) }, + { AOM_CDF3(32097, 32575) }, + { AOM_CDF3(32242, 32719) } } }, + { { { AOM_CDF3(19980, 30591) }, + { AOM_CDF3(32219, 32597) }, + { AOM_CDF3(32581, 32706) }, + { AOM_CDF3(31803, 32287) } }, + { { AOM_CDF3(26473, 30507) }, + { AOM_CDF3(32431, 32723) }, + { AOM_CDF3(32196, 32611) }, + { AOM_CDF3(31588, 32528) } } }, + { { { AOM_CDF3(24647, 30463) }, + { AOM_CDF3(32412, 32695) }, + { AOM_CDF3(32468, 32720) }, + { AOM_CDF3(31269, 32523) } }, + { { AOM_CDF3(28482, 31505) }, + { AOM_CDF3(32152, 32701) }, + { AOM_CDF3(31732, 32598) }, + { AOM_CDF3(31767, 32712) } } }, + { { { AOM_CDF3(12358, 24977) }, + { AOM_CDF3(31331, 32385) }, + { AOM_CDF3(32634, 32756) }, + { AOM_CDF3(30411, 
32548) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } } }; diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c index c5b91e991..c96d37cca 100644 --- a/third_party/aom/av1/common/txb_common.c +++ b/third_party/aom/av1/common/txb_common.c @@ -12,17 +12,17 @@ #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" -const int16_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; +const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; -const int16_t av1_coeff_band_8x8[64] = { +const int8_t av1_coeff_band_8x8[64] = { 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4, 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18, }; -const int16_t av1_coeff_band_16x16[256] = { +const int8_t av1_coeff_band_16x16[256] = { 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8, @@ -39,7 +39,7 @@ const int16_t av1_coeff_band_16x16[256] = { 19, 20, 20, 20, 20, 21, 21, 21, 21, }; -const int16_t av1_coeff_band_32x32[1024] = { +const int8_t av1_coeff_band_32x32[1024] = { 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, @@ -96,223 +96,372 @@ const int16_t av1_coeff_band_32x32[1024] = { 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, }; -#if LV_MAP_PROB -void av1_init_txb_probs(FRAME_CONTEXT *fc) { - TX_SIZE tx_size; - int plane, ctx, level; +// The ctx offset table when TX is TX_CLASS_2D. 
+// TX col and row indices are clamped to 4 - // Update probability models for transform block skip flag - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { - fc->txb_skip_cdf[tx_size][ctx][0] = - AOM_ICDF(128 * (aom_cdf_prob)fc->txb_skip[tx_size][ctx]); - fc->txb_skip_cdf[tx_size][ctx][1] = AOM_ICDF(32768); - fc->txb_skip_cdf[tx_size][ctx][2] = 0; - } - } +const int8_t av1_nz_map_ctx_offset_4x4[16] = { + 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, +}; - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) { - fc->dc_sign_cdf[plane][ctx][0] = - AOM_ICDF(128 * (aom_cdf_prob)fc->dc_sign[plane][ctx]); - fc->dc_sign_cdf[plane][ctx][1] = AOM_ICDF(32768); - fc->dc_sign_cdf[plane][ctx][2] = 0; - } - } +const int8_t av1_nz_map_ctx_offset_8x8[64] = { + 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, + 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - // Update probability models for non-zero coefficient map and eob flag. 
- for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (level = 0; level < NUM_BASE_LEVELS; ++level) { - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { - fc->coeff_base_cdf[tx_size][plane][level][ctx][0] = AOM_ICDF( - 128 * (aom_cdf_prob)fc->coeff_base[tx_size][plane][level][ctx]); - fc->coeff_base_cdf[tx_size][plane][level][ctx][1] = AOM_ICDF(32768); - fc->coeff_base_cdf[tx_size][plane][level][ctx][2] = 0; - } - } - } - } +const int8_t av1_nz_map_ctx_offset_16x16[256] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - fc->nz_map_cdf[tx_size][plane][ctx][0] = - AOM_ICDF(128 * (aom_cdf_prob)fc->nz_map[tx_size][plane][ctx]); - fc->nz_map_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768); - fc->nz_map_cdf[tx_size][plane][ctx][2] = 0; - } +const int8_t av1_nz_map_ctx_offset_32x32[1024] 
= { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - fc->eob_flag_cdf[tx_size][plane][ctx][0] = - AOM_ICDF(128 * (aom_cdf_prob)fc->eob_flag[tx_size][plane][ctx]); - fc->eob_flag_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768); - fc->eob_flag_cdf[tx_size][plane][ctx][2] = 0; - } - } - } +const int8_t av1_nz_map_ctx_offset_8x4[32] = { + 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, +}; - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - fc->coeff_lps_cdf[tx_size][plane][ctx][0] = - AOM_ICDF(128 * (aom_cdf_prob)fc->coeff_lps[tx_size][plane][ctx]); - fc->coeff_lps_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768); - fc->coeff_lps_cdf[tx_size][plane][ctx][2] = 0; - } -#if BR_NODE - for (int br = 0; br < BASE_RANGE_SETS; ++br) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - fc->coeff_br_cdf[tx_size][plane][br][ctx][0] = AOM_ICDF( - 128 * (aom_cdf_prob)fc->coeff_br[tx_size][plane][br][ctx]); - fc->coeff_br_cdf[tx_size][plane][br][ctx][1] = AOM_ICDF(32768); - fc->coeff_br_cdf[tx_size][plane][br][ctx][2] = 0; - } - } -#endif // BR_NODE - } - } -#if CONFIG_CTX1D - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) { - fc->eob_mode_cdf[tx_size][plane][tx_class][0] = AOM_ICDF( - 128 * (aom_cdf_prob)fc->eob_mode[tx_size][plane][tx_class]); - fc->eob_mode_cdf[tx_size][plane][tx_class][1] = AOM_ICDF(32768); - fc->eob_mode_cdf[tx_size][plane][tx_class][2] = 0; - } - } - } - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (int tx_class = 0; 
tx_class < TX_CLASSES; ++tx_class) { - for (ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) { - fc->empty_line_cdf[tx_size][plane][tx_class][ctx][0] = AOM_ICDF( - 128 * - (aom_cdf_prob)fc->empty_line[tx_size][plane][tx_class][ctx]); - fc->empty_line_cdf[tx_size][plane][tx_class][ctx][1] = - AOM_ICDF(32768); - fc->empty_line_cdf[tx_size][plane][tx_class][ctx][2] = 0; - } - } - } - } - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) { - for (ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) { - fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][0] = AOM_ICDF( - 128 * (aom_cdf_prob)fc->hv_eob[tx_size][plane][tx_class][ctx]); - fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][1] = AOM_ICDF(32768); - fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][2] = 0; - } - } - } - } -#endif // CONFIG_CTX1D -} -#endif // LV_MAP_PROB +const int8_t av1_nz_map_ctx_offset_8x16[128] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; -void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat, - unsigned int update_factor) { - FRAME_CONTEXT *fc = cm->fc; - const FRAME_CONTEXT *pre_fc = cm->pre_fc; - const FRAME_COUNTS *counts = &cm->counts; - TX_SIZE tx_size; - int plane, ctx, level; +const int8_t av1_nz_map_ctx_offset_16x8[128] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 
16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - // Update probability models for transform block skip flag - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) - fc->txb_skip[tx_size][ctx] = mode_mv_merge_probs( - pre_fc->txb_skip[tx_size][ctx], counts->txb_skip[tx_size][ctx]); +const int8_t av1_nz_map_ctx_offset_16x32[512] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - fc->dc_sign[plane][ctx] = mode_mv_merge_probs( - pre_fc->dc_sign[plane][ctx], counts->dc_sign[plane][ctx]); +const int8_t av1_nz_map_ctx_offset_32x16[512] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - // Update probability models for non-zero coefficient map and eob flag. 
- for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (level = 0; level < NUM_BASE_LEVELS; ++level) - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - fc->coeff_base[tx_size][plane][level][ctx] = - merge_probs(pre_fc->coeff_base[tx_size][plane][level][ctx], - counts->coeff_base[tx_size][plane][level][ctx], - count_sat, update_factor); +const int8_t av1_nz_map_ctx_offset_32x64[1024] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - fc->nz_map[tx_size][plane][ctx] = merge_probs( - pre_fc->nz_map[tx_size][plane][ctx], - counts->nz_map[tx_size][plane][ctx], count_sat, update_factor); - } +const int8_t av1_nz_map_ctx_offset_64x32[1024] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, + 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - fc->eob_flag[tx_size][plane][ctx] = merge_probs( - pre_fc->eob_flag[tx_size][plane][ctx], - counts->eob_flag[tx_size][plane][ctx], count_sat, update_factor); - } - } - } +const int8_t av1_nz_map_ctx_offset_4x16[64] = { + 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - fc->coeff_lps[tx_size][plane][ctx] = merge_probs( - pre_fc->coeff_lps[tx_size][plane][ctx], - 
counts->coeff_lps[tx_size][plane][ctx], count_sat, update_factor); - } -#if BR_NODE - for (int br = 0; br < BASE_RANGE_SETS; ++br) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - fc->coeff_br[tx_size][plane][br][ctx] = - merge_probs(pre_fc->coeff_br[tx_size][plane][br][ctx], - counts->coeff_br[tx_size][plane][br][ctx], count_sat, - update_factor); - } - } -#endif // BR_NODE - } - } -#if CONFIG_CTX1D - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - fc->eob_mode[tx_size][plane][tx_class] = - merge_probs(pre_fc->eob_mode[tx_size][plane][tx_class], - counts->eob_mode[tx_size][plane][tx_class], count_sat, - update_factor); - } - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) - fc->empty_line[tx_size][plane][tx_class][ctx] = - merge_probs(pre_fc->empty_line[tx_size][plane][tx_class][ctx], - counts->empty_line[tx_size][plane][tx_class][ctx], - count_sat, update_factor); - } - for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) - fc->hv_eob[tx_size][plane][tx_class][ctx] = - merge_probs(pre_fc->hv_eob[tx_size][plane][tx_class][ctx], - counts->hv_eob[tx_size][plane][tx_class][ctx], - count_sat, update_factor); - } -#endif -} +const int8_t av1_nz_map_ctx_offset_16x4[64] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x32[256] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 
6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x8[256] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t *av1_nz_map_ctx_offset[19] = { + av1_nz_map_ctx_offset_4x4, // TX_4x4 + av1_nz_map_ctx_offset_8x8, // TX_8x8 + av1_nz_map_ctx_offset_16x16, // TX_16x16 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_4x16, // TX_4x8 + av1_nz_map_ctx_offset_8x4, // TX_8x4 + av1_nz_map_ctx_offset_8x32, // TX_8x16 + av1_nz_map_ctx_offset_16x8, // TX_16x8 + av1_nz_map_ctx_offset_16x32, // TX_16x32 + av1_nz_map_ctx_offset_32x16, // TX_32x16 + av1_nz_map_ctx_offset_32x64, // TX_32x64 + av1_nz_map_ctx_offset_64x32, // TX_64x32 + av1_nz_map_ctx_offset_4x16, // TX_4x16 + av1_nz_map_ctx_offset_16x4, // TX_16x4 + av1_nz_map_ctx_offset_8x32, // TX_8x32 + av1_nz_map_ctx_offset_32x8, // TX_32x8 + av1_nz_map_ctx_offset_16x32, // TX_16x64 + av1_nz_map_ctx_offset_64x32, // TX_64x16 +}; void av1_init_lv_map(AV1_COMMON *cm) { LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table; for (int row = 0; row < 2; ++row) { for (int col = 0; col < 2; ++col) { - for (int sig_mag = 0; sig_mag < 2; ++sig_mag) { + for (int sig_mag = 0; sig_mag < 3; ++sig_mag) { for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) { + if (row == 0 && col == 0 && count > 5) continue; + if ((row == 0 || col == 0) && count > 8) continue; + coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] = get_base_ctx_from_count_mag(row, col, count, sig_mag); } @@ -320,3 +469,7 @@ void av1_init_lv_map(AV1_COMMON *cm) { } } } + +const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h index 3bf8f8c61..cdac90d9e 100644 --- a/third_party/aom/av1/common/txb_common.h +++ b/third_party/aom/av1/common/txb_common.h @@ -12,72 +12,133 @@ #ifndef 
AV1_COMMON_TXB_COMMON_H_ #define AV1_COMMON_TXB_COMMON_H_ -#define REDUCE_CONTEXT_DEPENDENCY 0 -#define MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY 0 +extern const int16_t k_eob_group_start[12]; +extern const int16_t k_eob_offset_bits[12]; -extern const int16_t av1_coeff_band_4x4[16]; +extern const int8_t av1_coeff_band_4x4[16]; -extern const int16_t av1_coeff_band_8x8[64]; +extern const int8_t av1_coeff_band_8x8[64]; -extern const int16_t av1_coeff_band_16x16[256]; +extern const int8_t av1_coeff_band_16x16[256]; -extern const int16_t av1_coeff_band_32x32[1024]; +extern const int8_t av1_coeff_band_32x32[1024]; + +extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; typedef struct txb_ctx { int txb_skip_ctx; int dc_sign_ctx; } TXB_CTX; -static INLINE TX_SIZE get_txsize_context(TX_SIZE tx_size) { - return txsize_sqr_up_map[tx_size]; -} +static const int base_level_count_to_index[13] = { + 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, +}; -static int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = { +// Note: TX_PAD_2D is dependent to this offset table. 
+static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = { /* clang-format off*/ { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 }, { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 } /* clang-format on*/ }; -static INLINE int get_level_count(const tran_low_t *tcoeffs, int bwl, - int height, int row, int col, int level, - int (*nb_offset)[2], int nb_num) { - int count = 0; - for (int idx = 0; idx < nb_num; ++idx) { - const int ref_row = row + nb_offset[idx][0]; - const int ref_col = col + nb_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - count += abs_coeff > level; +#define CONTEXT_MAG_POSITION_NUM 3 +static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = { + { { 0, 1 }, { 1, 0 }, { 1, 1 } }, + { { 0, 1 }, { 1, 0 }, { 0, 2 } }, + { { 0, 1 }, { 1, 0 }, { 2, 0 } } +}; +static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = { + { 0, 1 }, { 1, 0 }, { 1, 1 } +}; + +static const TX_CLASS tx_type_to_class[TX_TYPES] = { + TX_CLASS_2D, // DCT_DCT + TX_CLASS_2D, // ADST_DCT + TX_CLASS_2D, // DCT_ADST + TX_CLASS_2D, // ADST_ADST + TX_CLASS_2D, // FLIPADST_DCT + TX_CLASS_2D, // DCT_FLIPADST + TX_CLASS_2D, // FLIPADST_FLIPADST + TX_CLASS_2D, // ADST_FLIPADST + TX_CLASS_2D, // FLIPADST_ADST + TX_CLASS_2D, // IDTX + TX_CLASS_VERT, // V_DCT + TX_CLASS_HORIZ, // H_DCT + TX_CLASS_VERT, // V_ADST + TX_CLASS_HORIZ, // H_ADST + TX_CLASS_VERT, // V_FLIPADST + TX_CLASS_HORIZ, // H_FLIPADST +}; + +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; 
+ +static INLINE int get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; } - return count; + + *extra = eob - k_eob_group_start[t]; + + return t; } -static INLINE void get_mag(int *mag, const tran_low_t *tcoeffs, int bwl, - int height, int row, int col, int (*nb_offset)[2], - int nb_num) { - mag[0] = 0; - mag[1] = 0; - for (int idx = 0; idx < nb_num; ++idx) { - const int ref_row = row + nb_offset[idx][0]; - const int ref_col = col + nb_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - if (nb_offset[idx][0] >= 0 && nb_offset[idx][1] >= 0) { - if (abs_coeff > mag[0]) { - mag[0] = abs_coeff; - mag[1] = 1; - } else if (abs_coeff == mag[0]) { - ++mag[1]; - } - } - } +static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type, + const int eob_token) { + static const int8_t tx_type_to_offset[TX_TYPES] = { + -1, // DCT_DCT + -1, // ADST_DCT + -1, // DCT_ADST + -1, // ADST_ADST + -1, // FLIPADST_DCT + -1, // DCT_FLIPADST + -1, // FLIPADST_FLIPADST + -1, // ADST_FLIPADST + -1, // FLIPADST_ADST + -1, // IDTX + 10, // V_DCT + 10, // H_DCT + 10, // V_ADST + 10, // H_ADST + 10, // V_FLIPADST + 10, // H_FLIPADST + }; + return eob_token + tx_type_to_offset[tx_type]; +} + +static INLINE int get_txb_bwl(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide_log2[tx_size]; +} + +static INLINE int get_txb_wide(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide[tx_size]; +} + +static INLINE int get_txb_high(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high[tx_size]; } static INLINE void get_base_count_mag(int *mag, int *count, @@ -110,67 +171,124 @@ static INLINE void get_base_count_mag(int 
*mag, int *count, } } -static INLINE int get_level_count_mag(int *mag, const tran_low_t *tcoeffs, - int bwl, int height, int row, int col, - int level, int (*nb_offset)[2], - int nb_num) { - const int stride = 1 << bwl; +static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) { + return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); +} + +static INLINE int get_padded_idx(const int idx, const int bwl) { + return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); +} + +static INLINE int get_level_count(const uint8_t *const levels, const int stride, + const int row, const int col, const int level, + const int (*nb_offset)[2], const int nb_num) { int count = 0; - *mag = 0; + for (int idx = 0; idx < nb_num; ++idx) { const int ref_row = row + nb_offset[idx][0]; const int ref_col = col + nb_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - count += abs_coeff > level; - if (nb_offset[idx][0] >= 0 && nb_offset[idx][1] >= 0) - *mag = AOMMAX(*mag, abs_coeff); + const int pos = ref_row * stride + ref_col; + count += levels[pos] > level; } return count; } +static INLINE void get_level_mag(const uint8_t *const levels, const int stride, + const int row, const int col, int *const mag) { + for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) { + const int ref_row = row + mag_ref_offset[idx][0]; + const int ref_col = col + mag_ref_offset[idx][1]; + const int pos = ref_row * stride + ref_col; + mag[idx] = levels[pos]; + } +} + static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, int sig_mag) { - const int ctx = (count + 1) >> 1; + const int ctx = base_level_count_to_index[count]; int ctx_idx = -1; + if (row == 0 && col == 0) { - ctx_idx = (ctx << 1) + sig_mag; - // TODO(angiebird): turn this on once the optimization is finalized - // assert(ctx_idx < 8); + if (sig_mag >= 2) return ctx_idx = 0; + if 
(sig_mag == 1) { + if (count >= 2) + ctx_idx = 1; + else + ctx_idx = 2; + + return ctx_idx; + } + + ctx_idx = 3 + ctx; + assert(ctx_idx <= 6); + return ctx_idx; } else if (row == 0) { - ctx_idx = 8 + (ctx << 1) + sig_mag; - // TODO(angiebird): turn this on once the optimization is finalized - // assert(ctx_idx < 18); + if (sig_mag >= 2) return ctx_idx = 6; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 7; + else + ctx_idx = 8; + return ctx_idx; + } + + ctx_idx = 9 + ctx; + assert(ctx_idx <= 11); + return ctx_idx; } else if (col == 0) { - ctx_idx = 8 + 10 + (ctx << 1) + sig_mag; + if (sig_mag >= 2) return ctx_idx = 12; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 13; + else + ctx_idx = 14; + + return ctx_idx; + } + + ctx_idx = 15 + ctx; + assert(ctx_idx <= 17); // TODO(angiebird): turn this on once the optimization is finalized // assert(ctx_idx < 28); } else { - ctx_idx = 8 + 10 + 10 + (ctx << 1) + sig_mag; - assert(ctx_idx < COEFF_BASE_CONTEXTS); + if (sig_mag >= 2) return ctx_idx = 18; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 19; + else + ctx_idx = 20; + return ctx_idx; + } + + ctx_idx = 21 + ctx; + + assert(ctx_idx <= 24); } return ctx_idx; } -static INLINE int get_base_ctx(const tran_low_t *tcoeffs, - int c, // raster order - const int bwl, const int height, - const int level) { +static INLINE int get_base_ctx(const uint8_t *const levels, + const int c, // raster order + const int bwl, const int level_minus_1, + const int count) { const int row = c >> bwl; const int col = c - (row << bwl); - const int level_minus_1 = level - 1; - int mag; - int count = - get_level_count_mag(&mag, tcoeffs, bwl, height, row, col, level_minus_1, - base_ref_offset, BASE_CONTEXT_POSITION_NUM); - int ctx_idx = get_base_ctx_from_count_mag(row, col, count, mag > level); + const int stride = (1 << bwl) + TX_PAD_HOR; + int mag_count = 0; + int nb_mag[3] = { 0 }; + + get_level_mag(levels, stride, row, col, nb_mag); + + for (int idx = 0; idx < 3; ++idx) + mag_count 
+= nb_mag[idx] > (level_minus_1 + 1); + const int ctx_idx = + get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count)); return ctx_idx; } #define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context -static int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = { +// Note: TX_PAD_2D is dependent to this offset table. +static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = { /* clang-format off*/ { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 }, { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, @@ -181,18 +299,8 @@ static const int br_level_map[9] = { 0, 0, 1, 1, 2, 2, 3, 3, 3, }; -static const int coeff_to_br_index[COEFF_BASE_RANGE] = { - 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, -}; - -static const int br_index_to_coeff[BASE_RANGE_SETS] = { - 0, 2, 6, -}; - -static const int br_extra_bits[BASE_RANGE_SETS] = { - 1, 2, 3, -}; - +// Note: If BR_MAG_OFFSET changes, the calculation of offset in +// get_br_ctx_from_count_mag() must be updated. #define BR_MAG_OFFSET 1 // TODO(angiebird): optimize this function by using a table to map from // count/mag to ctx @@ -223,369 +331,356 @@ static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl, return count; } -static INLINE int get_br_ctx_from_count_mag(int row, int col, int count, - int mag) { - int offset = 0; - if (mag <= BR_MAG_OFFSET) - offset = 0; - else if (mag <= 3) - offset = 1; - else if (mag <= 5) - offset = 2; - else - offset = 3; - - int ctx = br_level_map[count]; - ctx += offset * BR_TMP_OFFSET; - +static INLINE int get_br_ctx_from_count_mag(const int row, const int col, + const int count, const int mag) { // DC: 0 - 1 - if (row == 0 && col == 0) return ctx; - // Top row: 2 - 4 - if (row == 0) return 2 + ctx; - // Left column: 5 - 7 - if (col == 0) return 5 + ctx; - // others: 8 - 11 - return 8 + ctx; + static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } }; + const int mag_clamp = AOMMIN(mag, 6); + const int offset = mag_clamp >> 1; + const int ctx = + br_level_map[count] + offset * 
BR_TMP_OFFSET + offset_pos[!row][!col]; + return ctx; } -static INLINE int get_br_ctx(const tran_low_t *tcoeffs, - const int c, // raster order - const int bwl, const int height) { +static INLINE int get_br_ctx_2d(const uint8_t *const levels, + const int c, // raster order + const int bwl) { + assert(c > 0); const int row = c >> bwl; const int col = c - (row << bwl); - const int level_minus_1 = NUM_BASE_LEVELS; - int mag; - const int count = - get_level_count_mag(&mag, tcoeffs, bwl, height, row, col, level_minus_1, - br_ref_offset, BR_CONTEXT_POSITION_NUM); - const int ctx = get_br_ctx_from_count_mag(row, col, count, mag); - return ctx; + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); + mag = AOMMIN((mag + 1) >> 1, 6); + //((row | col) < 2) is equivalent to ((row < 2) && (col < 2)) + if ((row | col) < 2) return mag + 7; + return mag + 14; } -#define SIG_REF_OFFSET_NUM 7 -static int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = { - { -2, -1 }, { -2, 0 }, { -1, -2 }, { -1, -1 }, - { -1, 0 }, { 0, -2 }, { 0, -1 }, -}; - -#if REDUCE_CONTEXT_DEPENDENCY -static INLINE int get_nz_count(const tran_low_t *tcoeffs, int bwl, int height, - int row, int col, int prev_row, int prev_col) { - int count = 0; - for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) { - const int ref_row = row + sig_ref_offset[idx][0]; - const int ref_col = col + sig_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl) || (prev_row == ref_row && prev_col == ref_col)) - continue; - const int nb_pos = (ref_row << bwl) + ref_col; - count += (tcoeffs[nb_pos] != 0); - } - return count; -} -#else -static INLINE int get_nz_count(const tran_low_t *tcoeffs, int bwl, int height, - int row, int col) { - int count = 0; - for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) { - 
const int ref_row = row + sig_ref_offset[idx][0]; - const int ref_col = col + sig_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int nb_pos = (ref_row << bwl) + ref_col; - count += (tcoeffs[nb_pos] != 0); - } - return count; -} -#endif - -static INLINE TX_CLASS get_tx_class(TX_TYPE tx_type) { - switch (tx_type) { -#if CONFIG_EXT_TX - case V_DCT: - case V_ADST: - case V_FLIPADST: return TX_CLASS_VERT; - case H_DCT: - case H_ADST: - case H_FLIPADST: return TX_CLASS_HORIZ; -#endif - default: return TX_CLASS_2D; +static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, + const int c, // raster order + const int bwl, const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = levels[pos + 1]; + mag += levels[pos + stride]; + switch (tx_class) { + case TX_CLASS_2D: + mag += levels[pos + stride + 1]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if ((row < 2) && (col < 2)) return mag + 7; + break; + case TX_CLASS_HORIZ: + mag += levels[pos + 2]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (col == 0) return mag + 7; + break; + case TX_CLASS_VERT: + mag += levels[pos + (stride << 1)]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (row == 0) return mag + 7; + break; + default: break; } -} -// TODO(angiebird): optimize this function by generate a table that maps from -// count to ctx -static INLINE int get_nz_map_ctx_from_count(int count, - int coeff_idx, // raster order - int bwl, TX_TYPE tx_type) { - (void)tx_type; - const int row = coeff_idx >> bwl; - const int col = coeff_idx - (row << bwl); - int ctx = 0; -#if CONFIG_EXT_TX - int tx_class = get_tx_class(tx_type); - int offset; - if (tx_class == TX_CLASS_2D) - offset = 0; - else if (tx_class == TX_CLASS_VERT) - offset = SIG_COEF_CONTEXTS_2D; - else - 
offset = SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D; -#else - int offset = 0; -#endif - - if (row == 0 && col == 0) return offset + 0; - - if (row == 0 && col == 1) return offset + 1 + count; - - if (row == 1 && col == 0) return offset + 3 + count; - - if (row == 1 && col == 1) { - ctx = (count + 1) >> 1; - - assert(5 + ctx <= 7); - - return offset + 5 + ctx; - } + return mag + 14; +} - if (row == 0) { - ctx = (count + 1) >> 1; +#define SIG_REF_OFFSET_NUM 5 - assert(ctx < 2); - return offset + 8 + ctx; - } +// Note: TX_PAD_2D is dependent to these offset tables. +static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = { + { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 } + // , { 1, 2 }, { 2, 1 }, +}; - if (col == 0) { - ctx = (count + 1) >> 1; +static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = { + { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 } + // , { 1, 1 }, { 2, 1 }, +}; - assert(ctx < 2); - return offset + 10 + ctx; - } +static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = { + { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 } + // , { 1, 1 }, { 1, 2 }, +}; - ctx = count >> 1; +#define SIG_REF_DIFF_OFFSET_NUM 3 - assert(12 + ctx < 16); +static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = { + { 1, 1 }, { 0, 2 }, { 2, 0 } +}; - return offset + 12 + ctx; -} +static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = { + { 2, 0 }, { 3, 0 }, { 4, 0 } +}; -static INLINE int get_nz_map_ctx(const tran_low_t *tcoeffs, const int scan_idx, - const int16_t *scan, const int bwl, - const int height, TX_TYPE tx_type) { - const int coeff_idx = scan[scan_idx]; - const int row = coeff_idx >> bwl; - const int col = coeff_idx - (row << bwl); -#if REDUCE_CONTEXT_DEPENDENCY - int prev_coeff_idx; - int prev_row; - int prev_col; - if (scan_idx > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) { - prev_coeff_idx = scan[scan_idx - 1]; // raster order - prev_row = prev_coeff_idx >> bwl; - prev_col = prev_coeff_idx - (prev_row << bwl); - } else 
{ - prev_coeff_idx = -1; - prev_row = -1; - prev_col = -1; - } - int count = get_nz_count(tcoeffs, bwl, height, row, col, prev_row, prev_col); -#else - int count = get_nz_count(tcoeffs, bwl, height, row, col); -#endif - return get_nz_map_ctx_from_count(count, coeff_idx, bwl, tx_type); -} +static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = { + { 0, 2 }, { 0, 3 }, { 0, 4 } +}; -static INLINE int get_eob_ctx(const tran_low_t *tcoeffs, - const int coeff_idx, // raster order - const TX_SIZE txs_ctx, TX_TYPE tx_type) { - (void)tcoeffs; - int offset = 0; -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_VERT) - offset = EOB_COEF_CONTEXTS_2D; - else if (tx_class == TX_CLASS_HORIZ) - offset = EOB_COEF_CONTEXTS_2D + EOB_COEF_CONTEXTS_1D; -#else - (void)tx_type; -#endif - - if (txs_ctx == TX_4X4) return offset + av1_coeff_band_4x4[coeff_idx]; - if (txs_ctx == TX_8X8) return offset + av1_coeff_band_8x8[coeff_idx]; - if (txs_ctx == TX_16X16) return offset + av1_coeff_band_16x16[coeff_idx]; - if (txs_ctx == TX_32X32) return offset + av1_coeff_band_32x32[coeff_idx]; - - assert(0); - return 0; -} +static const uint8_t clip_max3[256] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +}; -static 
INLINE void set_dc_sign(int *cul_level, tran_low_t v) { - if (v < 0) - *cul_level |= 1 << COEFF_CONTEXT_BITS; - else if (v > 0) - *cul_level += 2 << COEFF_CONTEXT_BITS; -} +static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, + const int bwl, const TX_CLASS tx_class) { + int mag; -static INLINE int get_dc_sign_ctx(int dc_sign) { - int dc_sign_ctx = 0; - if (dc_sign < 0) - dc_sign_ctx = 1; - else if (dc_sign > 0) - dc_sign_ctx = 2; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. + mag = clip_max3[levels[1]]; // { 0, 1 } + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]]; // { 1, 0 } + + if (tx_class == TX_CLASS_2D) { + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]]; // { 1, 1 } + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + } else if (tx_class == TX_CLASS_VERT) { + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]]; // { 3, 0 } + mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]]; // { 4, 0 } + } else { + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[3]]; // { 0, 3 } + mag += clip_max3[levels[4]]; // { 0, 4 } + } - return dc_sign_ctx; + return mag; } -static INLINE void get_txb_ctx(BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int plane, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, TXB_CTX *txb_ctx) { - const int txb_w_unit = tx_size_wide_unit[tx_size]; - const int txb_h_unit = tx_size_high_unit[tx_size]; - int ctx_offset = (plane == 0) ? 
0 : 7; - - if (plane_bsize > txsize_to_bsize[tx_size]) ctx_offset += 3; - - int dc_sign = 0; - for (int k = 0; k < txb_w_unit; ++k) { - int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; - if (sign == 1) - --dc_sign; - else if (sign == 2) - ++dc_sign; - else if (sign != 0) - assert(0); - } - - for (int k = 0; k < txb_h_unit; ++k) { - int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; - if (sign == 1) - --dc_sign; - else if (sign == 2) - ++dc_sign; - else if (sign != 0) - assert(0); +static INLINE int get_nz_count(const uint8_t *const levels, const int bwl, + const TX_CLASS tx_class) { + int count; + + count = (levels[1] != 0); // { 0, 1 } + count += (levels[(1 << bwl) + TX_PAD_HOR] != 0); // { 1, 0 } + + for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) { + const int row_offset = + ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0] + : ((tx_class == TX_CLASS_VERT) + ? sig_ref_diff_offset_vert[idx][0] + : sig_ref_diff_offset_horiz[idx][0])); + const int col_offset = + ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1] + : ((tx_class == TX_CLASS_VERT) + ? 
sig_ref_diff_offset_vert[idx][1] + : sig_ref_diff_offset_horiz[idx][1])); + const int nb_pos = + (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset; + count += (levels[nb_pos] != 0); } + return count; +} - txb_ctx->dc_sign_ctx = get_dc_sign_ctx(dc_sign); - - if (plane == 0) { - int top = 0; - int left = 0; +#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D +#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) +#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) + +static const int nz_map_ctx_offset_1d[32] = { + NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, +}; - for (int k = 0; k < txb_w_unit; ++k) { - top = AOMMAX(top, ((uint8_t)a[k] & COEFF_CONTEXT_MASK)); +static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( + const int stats, + const int coeff_idx, // raster order + const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) { + // tx_class == 0(TX_CLASS_2D) + if ((tx_class | coeff_idx) == 0) return 0; + int ctx = (stats + 1) >> 1; + ctx = AOMMIN(ctx, 4); + switch (tx_class) { + case TX_CLASS_2D: { + // This is the algorithm to generate av1_nz_map_ctx_offset[][] + // const int width = tx_size_wide[tx_size]; + // const int height = tx_size_high[tx_size]; + // if (width < height) { + // if (row < 2) return 11 + ctx; + // } else if (width > height) { + // if (col < 2) return 16 + ctx; + // } + // if (row + col < 2) return ctx + 1; + // if (row + col < 4) return 5 + ctx + 1; + // return 21 + ctx; + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; } - - for (int k = 0; k < txb_h_unit; ++k) { - left = AOMMAX(left, ((uint8_t)l[k] & 
COEFF_CONTEXT_MASK)); + case TX_CLASS_HORIZ: { + const int row = coeff_idx >> bwl; + const int col = coeff_idx - (row << bwl); + return ctx + nz_map_ctx_offset_1d[col]; + break; } - - top = AOMMIN(top, 255); - left = AOMMIN(left, 255); - - if (plane_bsize == txsize_to_bsize[tx_size]) - txb_ctx->txb_skip_ctx = 0; - else if (top == 0 && left == 0) - txb_ctx->txb_skip_ctx = 1; - else if (top == 0 || left == 0) - txb_ctx->txb_skip_ctx = 2 + (AOMMAX(top, left) > 3); - else if (AOMMAX(top, left) <= 3) - txb_ctx->txb_skip_ctx = 4; - else if (AOMMIN(top, left) <= 3) - txb_ctx->txb_skip_ctx = 5; - else - txb_ctx->txb_skip_ctx = 6; - } else { - int ctx_base = get_entropy_context(tx_size, a, l); - txb_ctx->txb_skip_ctx = ctx_offset + ctx_base; + case TX_CLASS_VERT: { + const int row = coeff_idx >> bwl; + return ctx + nz_map_ctx_offset_1d[row]; + break; + } + default: break; } + return 0; } -#if LV_MAP_PROB -void av1_init_txb_probs(FRAME_CONTEXT *fc); -#endif // LV_MAP_PROB +typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; +typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; -void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat, - unsigned int update_factor); +static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; +} -void av1_init_lv_map(AV1_COMMON *cm); +static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, + int bwl, TX_SIZE tx_size) { + assert(coeff_idx > 0); + int mag; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
+ levels = levels + get_padded_idx(coeff_idx, bwl); + mag = AOMMIN(levels[1], 3); // { 0, 1 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3); // { 1, 0 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3); // { 1, 1 } + mag += AOMMIN(levels[2], 3); // { 0, 2 } + mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 2, 0 } + + const int ctx = AOMMIN((mag + 1) >> 1, 4); + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; +} +static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, + int coeff_idx, int bwl, + TX_SIZE tx_size, + TX_CLASS tx_class) { + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} -#if CONFIG_CTX1D -static INLINE void get_eob_vert(int16_t *eob_ls, const tran_low_t *tcoeff, - int w, int h) { - for (int c = 0; c < w; ++c) { - eob_ls[c] = 0; - for (int r = h - 1; r >= 0; --r) { - int coeff_idx = r * w + c; - if (tcoeff[coeff_idx] != 0) { - eob_ls[c] = r + 1; - break; - } - } +static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, + int bwl, int height, + const uint8_t *levels, + int coeff_idx, TX_SIZE tx_size, + TX_CLASS tx_class) { + if (is_last) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) >> 3) return 1; + if (scan_idx <= (height << bwl) >> 2) return 2; + return 3; } + return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class); } -static INLINE void get_eob_horiz(int16_t *eob_ls, const tran_low_t *tcoeff, - int w, int h) { - for (int r = 0; r < h; ++r) { - eob_ls[r] = 0; - for (int c = w - 1; c >= 0; --c) { - int coeff_idx = r * w + c; - if (tcoeff[coeff_idx] != 0) { - eob_ls[r] = c + 1; - break; - } - } - } +static INLINE void set_dc_sign(int *cul_level, int dc_val) { + if (dc_val < 0) + *cul_level |= 1 << COEFF_CONTEXT_BITS; + else if (dc_val > 0) + *cul_level += 2 << COEFF_CONTEXT_BITS; } -static INLINE int get_empty_line_ctx(int 
line_idx, int16_t *eob_ls) { - if (line_idx > 0) { - int prev_eob = eob_ls[line_idx - 1]; - if (prev_eob == 0) { - return 1; - } else if (prev_eob < 3) { - return 2; - } else if (prev_eob < 6) { - return 3; +static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { +#define MAX_TX_SIZE_UNIT 16 + static const int8_t signs[3] = { 0, -1, 1 }; + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }; + const int txb_w_unit = tx_size_wide_unit[tx_size]; + const int txb_h_unit = tx_size_high_unit[tx_size]; + int dc_sign = 0; + int k = 0; + + do { + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_w_unit); + + k = 0; + do { + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_h_unit); + + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; + + if (plane == 0) { + if (plane_bsize == txsize_to_bsize[tx_size]) { + txb_ctx->txb_skip_ctx = 0; } else { - return 4; + // This is the algorithm to generate table skip_contexts[min][max]. 
+ // if (!max) + // txb_skip_ctx = 1; + // else if (!min) + // txb_skip_ctx = 2 + (max > 3); + // else if (max <= 3) + // txb_skip_ctx = 4; + // else if (min <= 3) + // txb_skip_ctx = 5; + // else + // txb_skip_ctx = 6; + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, + { 1, 4, 4, 4, 5 }, + { 1, 4, 4, 4, 5 }, + { 1, 4, 4, 4, 5 }, + { 1, 4, 4, 4, 6 } }; + int top = 0; + int left = 0; + + k = 0; + do { + top |= a[k]; + } while (++k < txb_w_unit); + top &= COEFF_CONTEXT_MASK; + + k = 0; + do { + left |= l[k]; + } while (++k < txb_h_unit); + left &= COEFF_CONTEXT_MASK; + const int max = AOMMIN(top | left, 4); + const int min = AOMMIN(AOMMIN(top, left), 4); + + txb_ctx->txb_skip_ctx = skip_contexts[min][max]; } } else { - return 0; + const int ctx_base = get_entropy_context(tx_size, a, l); + const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > + num_pels_log2_lookup[txsize_to_bsize[tx_size]]) + ? 10 + : 7; + txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; } +#undef MAX_TX_SIZE_UNIT } -#define MAX_POS_CTX 8 -static int pos_ctx[MAX_HVTX_SIZE] = { - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, -}; -static INLINE int get_hv_eob_ctx(int line_idx, int pos, int16_t *eob_ls) { - if (line_idx > 0) { - int prev_eob = eob_ls[line_idx - 1]; - int diff = pos + 1 - prev_eob; - int abs_diff = abs(diff); - int ctx_idx = pos_ctx[abs_diff]; - assert(ctx_idx < MAX_POS_CTX); - if (diff < 0) { - ctx_idx += MAX_POS_CTX; - assert(ctx_idx >= MAX_POS_CTX); - assert(ctx_idx < 2 * MAX_POS_CTX); - } - return ctx_idx; - } else { - int ctx_idx = MAX_POS_CTX + MAX_POS_CTX + pos_ctx[pos]; - assert(ctx_idx < HV_EOB_CONTEXTS); - assert(HV_EOB_CONTEXTS == MAX_POS_CTX * 3); - return ctx_idx; - } -} -#endif // CONFIG_CTX1D +void av1_init_lv_map(AV1_COMMON *cm); #endif // AV1_COMMON_TXB_COMMON_H_ diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index 34374af69..ae6f07657 100644 --- 
a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -15,7 +15,8 @@ #include #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/common/warped_motion.h" #include "av1/common/scale.h" @@ -91,78 +92,11 @@ static const int error_measure_lut[512] = { }; /* clang-format on */ -static ProjectPointsFunc get_project_points_type(TransformationType type) { - switch (type) { - case VERTRAPEZOID: return project_points_vertrapezoid; - case HORTRAPEZOID: return project_points_hortrapezoid; - case HOMOGRAPHY: return project_points_homography; - case AFFINE: return project_points_affine; - case ROTZOOM: return project_points_rotzoom; - case TRANSLATION: return project_points_translation; - default: assert(0); return NULL; - } -} - -void project_points_translation(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - int i; - for (i = 0; i < n; ++i) { - const int x = *(points++), y = *(points++); - if (subsampling_x) - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]), - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS); - if (subsampling_y) - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]), - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS); - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -void project_points_rotzoom(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - int i; - for (i = 0; i < n; ++i) { - const int x = *(points++), y = *(points++); - if (subsampling_x) - *(proj++) = 
ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * x + mat[3] * 2 * y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0], - WARPEDDIFF_PREC_BITS); - if (subsampling_y) - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] + - (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1], - WARPEDDIFF_PREC_BITS); - points += stride_points - 2; - proj += stride_proj - 2; - } -} - void project_points_affine(const int32_t *mat, int *points, int *proj, const int n, const int stride_points, const int stride_proj, const int subsampling_x, const int subsampling_y) { - int i; - for (i = 0; i < n; ++i) { + for (int i = 0; i < n; ++i) { const int x = *(points++), y = *(points++); if (subsampling_x) *(proj++) = ROUND_POWER_OF_TWO_SIGNED( @@ -185,301 +119,6 @@ void project_points_affine(const int32_t *mat, int *points, int *proj, } } -void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - int i; - int64_t x, y, Z; - int64_t xp, yp; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - x = (subsampling_x ? 4 * x + 1 : 2 * x); - y = (subsampling_y ? 4 * y + 1 : 2 * y); - - Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1))); - xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - yp = (mat[5] * y + 2 * mat[1]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - - xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z; - yp = yp > 0 ? 
(yp + Z / 2) / Z : (yp - Z / 2) / Z; - - if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - *(proj++) = (int)xp; - *(proj++) = (int)yp; - - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - int i; - int64_t x, y, Z; - int64_t xp, yp; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - x = (subsampling_x ? 4 * x + 1 : 2 * x); - y = (subsampling_y ? 4 * y + 1 : 2 * y); - - Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1))); - xp = (mat[2] * x + 2 * mat[0]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - - xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z; - yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z; - - if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - *(proj++) = (int)xp; - *(proj++) = (int)yp; - - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -void project_points_homography(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - int i; - int64_t x, y, Z; - int64_t xp, yp; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - x = (subsampling_x ? 4 * x + 1 : 2 * x); - y = (subsampling_y ? 
4 * y + 1 : 2 * y); - - Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1))); - xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) * - (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS - - WARPEDMODEL_PREC_BITS)); - - xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z; - yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z; - - if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2; - *(proj++) = (int)xp; - *(proj++) = (int)yp; - - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -static const int16_t - filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = { -#if WARPEDPIXEL_PREC_BITS == 6 - { 0, 0, 128, 0, 0, 0 }, { 0, -1, 128, 2, -1, 0 }, - { 1, -3, 127, 4, -1, 0 }, { 1, -4, 126, 6, -2, 1 }, - { 1, -5, 126, 8, -3, 1 }, { 1, -6, 125, 11, -4, 1 }, - { 1, -7, 124, 13, -4, 1 }, { 2, -8, 123, 15, -5, 1 }, - { 2, -9, 122, 18, -6, 1 }, { 2, -10, 121, 20, -6, 1 }, - { 2, -11, 120, 22, -7, 2 }, { 2, -12, 119, 25, -8, 2 }, - { 3, -13, 117, 27, -8, 2 }, { 3, -13, 116, 29, -9, 2 }, - { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 }, - { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 }, - { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 }, - { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 }, - { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 }, - { 4, -18, 96, 58, -15, 3 }, { 4, -18, 94, 60, -16, 4 }, - { 4, -18, 91, 63, -16, 4 }, { 4, -18, 89, 65, -16, 4 }, - { 4, -18, 87, 68, -17, 4 }, { 4, -18, 85, 70, -17, 4 }, - { 4, -18, 82, 73, -17, 4 }, { 4, -18, 80, 75, -17, 4 }, - { 4, -18, 78, 78, -18, 4 }, { 4, -17, 75, 80, -18, 4 }, - { 4, -17, 73, 82, -18, 4 }, { 4, -17, 70, 85, -18, 4 }, - { 4, -17, 68, 87, -18, 4 }, { 4, -16, 65, 89, -18, 4 }, - { 4, -16, 63, 91, -18, 4 
}, { 4, -16, 60, 94, -18, 4 }, - { 3, -15, 58, 96, -18, 4 }, { 4, -15, 55, 98, -18, 4 }, - { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 }, - { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 }, - { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 }, - { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 }, - { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 }, - { 2, -8, 27, 117, -13, 3 }, { 2, -8, 25, 119, -12, 2 }, - { 2, -7, 22, 120, -11, 2 }, { 1, -6, 20, 121, -10, 2 }, - { 1, -6, 18, 122, -9, 2 }, { 1, -5, 15, 123, -8, 2 }, - { 1, -4, 13, 124, -7, 1 }, { 1, -4, 11, 125, -6, 1 }, - { 1, -3, 8, 126, -5, 1 }, { 1, -2, 6, 126, -4, 1 }, - { 0, -1, 4, 127, -3, 1 }, { 0, -1, 2, 128, -1, 0 }, -#elif WARPEDPIXEL_PREC_BITS == 5 - { 0, 0, 128, 0, 0, 0 }, { 1, -3, 127, 4, -1, 0 }, - { 1, -5, 126, 8, -3, 1 }, { 1, -7, 124, 13, -4, 1 }, - { 2, -9, 122, 18, -6, 1 }, { 2, -11, 120, 22, -7, 2 }, - { 3, -13, 117, 27, -8, 2 }, { 3, -14, 114, 32, -10, 3 }, - { 3, -15, 111, 37, -11, 3 }, { 3, -16, 108, 42, -12, 3 }, - { 4, -17, 104, 47, -13, 3 }, { 4, -17, 100, 52, -14, 3 }, - { 4, -18, 96, 58, -15, 3 }, { 4, -18, 91, 63, -16, 4 }, - { 4, -18, 87, 68, -17, 4 }, { 4, -18, 82, 73, -17, 4 }, - { 4, -18, 78, 78, -18, 4 }, { 4, -17, 73, 82, -18, 4 }, - { 4, -17, 68, 87, -18, 4 }, { 4, -16, 63, 91, -18, 4 }, - { 3, -15, 58, 96, -18, 4 }, { 3, -14, 52, 100, -17, 4 }, - { 3, -13, 47, 104, -17, 4 }, { 3, -12, 42, 108, -16, 3 }, - { 3, -11, 37, 111, -15, 3 }, { 3, -10, 32, 114, -14, 3 }, - { 2, -8, 27, 117, -13, 3 }, { 2, -7, 22, 120, -11, 2 }, - { 1, -6, 18, 122, -9, 2 }, { 1, -4, 13, 124, -7, 1 }, - { 1, -3, 8, 126, -5, 1 }, { 0, -1, 4, 127, -3, 1 }, -#endif // WARPEDPIXEL_PREC_BITS == 6 - }; - -static int32_t do_ntap_filter(const int32_t *const p, int x) { - int i; - int32_t sum = 0; - for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) { - sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i]; - } - return sum; -} - -static int32_t do_cubic_filter(const 
int32_t *const p, int x) { - if (x == 0) { - return p[0] * (1 << WARPEDPIXEL_FILTER_BITS); - } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) { - return p[1] * (1 << WARPEDPIXEL_FILTER_BITS); - } else { - const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]); - const int64_t v2 = - (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]); - const int64_t v3 = x * (p[1] - p[-1]); - const int64_t v4 = 2 * p[0]; - return (int32_t)ROUND_POWER_OF_TWO_SIGNED( - (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) + - (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) + - (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1, - 3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS); - } -} - -static INLINE void get_subcolumn(int taps, const uint8_t *const ref, - int32_t *col, int stride, int x, int y_start) { - int i; - for (i = 0; i < taps; ++i) { - col[i] = ref[(i + y_start) * stride + x]; - } -} - -static uint8_t bi_ntap_filter(const uint8_t *const ref, int x, int y, - int stride) { - int32_t val, arr[WARPEDPIXEL_FILTER_TAPS]; - int k; - const int i = (int)x >> WARPEDPIXEL_PREC_BITS; - const int j = (int)y >> WARPEDPIXEL_PREC_BITS; - for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) { - int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS]; - get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride, - i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2, - j + 1 - WARPEDPIXEL_FILTER_TAPS / 2); - arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1, - y - (j * (1 << WARPEDPIXEL_PREC_BITS))); - } - val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1, - x - (i * (1 << WARPEDPIXEL_PREC_BITS))); - val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2); - return (uint8_t)clip_pixel(val); -} - -static uint8_t bi_cubic_filter(const uint8_t *const ref, int x, int y, - int stride) { - int32_t val, arr[4]; - int k; - const int i = (int)x >> WARPEDPIXEL_PREC_BITS; - const int j = (int)y >> WARPEDPIXEL_PREC_BITS; - for (k = 0; k < 4; ++k) { - int32_t arr_temp[4]; - get_subcolumn(4, ref, 
arr_temp, stride, i + k - 1, j - 1); - arr[k] = - do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS))); - } - val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS))); - val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2); - return (uint8_t)clip_pixel(val); -} - -static uint8_t bi_linear_filter(const uint8_t *const ref, int x, int y, - int stride) { - const int ix = x >> WARPEDPIXEL_PREC_BITS; - const int iy = y >> WARPEDPIXEL_PREC_BITS; - const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS)); - const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS)); - int32_t val; - val = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) * - (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx + - ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[(iy + 1) * stride + ix + 1] * sy * sx, - WARPEDPIXEL_PREC_BITS * 2); - return (uint8_t)clip_pixel(val); -} - -static uint8_t warp_interpolate(const uint8_t *const ref, int x, int y, - int width, int height, int stride) { - const int ix = x >> WARPEDPIXEL_PREC_BITS; - const int iy = y >> WARPEDPIXEL_PREC_BITS; - const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS)); - const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS)); - int32_t v; - - if (ix < 0 && iy < 0) - return ref[0]; - else if (ix < 0 && iy >= height - 1) - return ref[(height - 1) * stride]; - else if (ix >= width - 1 && iy < 0) - return ref[width - 1]; - else if (ix >= width - 1 && iy >= height - 1) - return ref[(height - 1) * stride + (width - 1)]; - else if (ix < 0) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) + - ref[(iy + 1) * stride] * sy, - WARPEDPIXEL_PREC_BITS); - return clip_pixel(v); - } else if (iy < 0) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx, - WARPEDPIXEL_PREC_BITS); - return clip_pixel(v); - } else if (ix >= width - 1) 
{ - v = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) + - ref[(iy + 1) * stride + width - 1] * sy, - WARPEDPIXEL_PREC_BITS); - return clip_pixel(v); - } else if (iy >= height - 1) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[(height - 1) * stride + ix + 1] * sx, - WARPEDPIXEL_PREC_BITS); - return clip_pixel(v); - } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 && - iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 && - ix < width - WARPEDPIXEL_FILTER_TAPS / 2 && - iy < height - WARPEDPIXEL_FILTER_TAPS / 2) { - return bi_ntap_filter(ref, x, y, stride); - } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) { - return bi_cubic_filter(ref, x, y, stride); - } else { - return bi_linear_filter(ref, x, y, stride); - } -} - // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels // at a time. The zoom/rotation/shear in the model are applied to the // "fractional" position of each pixel, which therefore varies within @@ -683,15 +322,14 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = { 8240, 8224, 8208, 8192, }; -#if CONFIG_WARPED_MOTION // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned // at precision of DIV_LUT_PREC_BITS along with the shift. static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { - int64_t e, f; + int64_t f; *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 : get_msb((unsigned int)D)); // e is obtained from D after resetting the most significant 1 bit. 
- e = D - ((uint64_t)1 << *shift); + const int64_t e = D - ((uint64_t)1 << *shift); // Get the most significant DIV_LUT_BITS (8) bits of e into f if (*shift > DIV_LUT_BITS) f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); @@ -702,13 +340,12 @@ static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { // Use f as lookup into the precomputed table of multipliers return div_lut[f]; } -#endif // CONFIG_WARPED_MOTION static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { - int32_t e, f; + int32_t f; *shift = get_msb(D); // e is obtained from D after resetting the most significant 1 bit. - e = D - ((uint32_t)1 << *shift); + const int32_t e = D - ((uint32_t)1 << *shift); // Get the most significant DIV_LUT_BITS (8) bits of e into f if (*shift > DIV_LUT_BITS) f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); @@ -743,16 +380,13 @@ int get_shear_params(WarpedMotionParams *wm) { wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); int16_t shift; int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? 
-1 : 1); - int64_t v; - v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; + int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; wm->gamma = clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); v = ((int64_t)mat[3] * mat[4]) * y; wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); - if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) - return 0; wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); @@ -762,171 +396,24 @@ int get_shear_params(WarpedMotionParams *wm) { (1 << WARP_PARAM_REDUCE_BITS); wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); - return 1; -} - -#if CONFIG_HIGHBITDEPTH -static INLINE void highbd_get_subcolumn(int taps, const uint16_t *const ref, - int32_t *col, int stride, int x, - int y_start) { - int i; - for (i = 0; i < taps; ++i) { - col[i] = ref[(i + y_start) * stride + x]; - } -} - -static uint16_t highbd_bi_ntap_filter(const uint16_t *const ref, int x, int y, - int stride, int bd) { - int32_t val, arr[WARPEDPIXEL_FILTER_TAPS]; - int k; - const int i = (int)x >> WARPEDPIXEL_PREC_BITS; - const int j = (int)y >> WARPEDPIXEL_PREC_BITS; - for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) { - int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS]; - highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride, - i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2, - j + 1 - WARPEDPIXEL_FILTER_TAPS / 2); - arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1, - y - (j * (1 << WARPEDPIXEL_PREC_BITS))); - } - val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1, - x - (i * (1 << WARPEDPIXEL_PREC_BITS))); - val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2); - return (uint16_t)clip_pixel_highbd(val, bd); -} - -static uint16_t highbd_bi_cubic_filter(const uint16_t *const ref, int x, int y, - int 
stride, int bd) { - int32_t val, arr[4]; - int k; - const int i = (int)x >> WARPEDPIXEL_PREC_BITS; - const int j = (int)y >> WARPEDPIXEL_PREC_BITS; - for (k = 0; k < 4; ++k) { - int32_t arr_temp[4]; - highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1); - arr[k] = - do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS))); - } - val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS))); - val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2); - return (uint16_t)clip_pixel_highbd(val, bd); -} -static uint16_t highbd_bi_linear_filter(const uint16_t *const ref, int x, int y, - int stride, int bd) { - const int ix = x >> WARPEDPIXEL_PREC_BITS; - const int iy = y >> WARPEDPIXEL_PREC_BITS; - const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS)); - const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS)); - int32_t val; - val = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) * - (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx + - ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[(iy + 1) * stride + ix + 1] * sy * sx, - WARPEDPIXEL_PREC_BITS * 2); - return (uint16_t)clip_pixel_highbd(val, bd); -} + if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) + return 0; -static uint16_t highbd_warp_interpolate(const uint16_t *const ref, int x, int y, - int width, int height, int stride, - int bd) { - const int ix = x >> WARPEDPIXEL_PREC_BITS; - const int iy = y >> WARPEDPIXEL_PREC_BITS; - const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS)); - const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS)); - int32_t v; - - if (ix < 0 && iy < 0) - return ref[0]; - else if (ix < 0 && iy > height - 1) - return ref[(height - 1) * stride]; - else if (ix > width - 1 && iy < 0) - return ref[width - 1]; - else if (ix > width - 1 && iy > height - 1) - return ref[(height - 1) * stride + (width - 1)]; - else if (ix < 0) 
{ - v = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) + - ref[(iy + 1) * stride] * sy, - WARPEDPIXEL_PREC_BITS); - return clip_pixel_highbd(v, bd); - } else if (iy < 0) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx, - WARPEDPIXEL_PREC_BITS); - return clip_pixel_highbd(v, bd); - } else if (ix > width - 1) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) + - ref[(iy + 1) * stride + width - 1] * sy, - WARPEDPIXEL_PREC_BITS); - return clip_pixel_highbd(v, bd); - } else if (iy > height - 1) { - v = ROUND_POWER_OF_TWO_SIGNED( - ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + - ref[(height - 1) * stride + ix + 1] * sx, - WARPEDPIXEL_PREC_BITS); - return clip_pixel_highbd(v, bd); - } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 && - iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 && - ix < width - WARPEDPIXEL_FILTER_TAPS / 2 && - iy < height - WARPEDPIXEL_FILTER_TAPS / 2) { - return highbd_bi_ntap_filter(ref, x, y, stride, bd); - } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) { - return highbd_bi_cubic_filter(ref, x, y, stride, bd); - } else { - return highbd_bi_linear_filter(ref, x, y, stride, bd); - } + return 1; } static INLINE int highbd_error_measure(int err, int bd) { const int b = bd - 8; const int bmask = (1 << b) - 1; const int v = (1 << b); - int e1, e2; err = abs(err); - e1 = err >> b; - e2 = err & bmask; + const int e1 = err >> b; + const int e2 = err & bmask; return error_measure_lut[255 + e1] * (v - e2) + error_measure_lut[256 + e1] * e2; } -static void highbd_warp_plane_old(const WarpedMotionParams *const wm, - const uint8_t *const ref8, int width, - int height, int stride, - const uint8_t *const pred8, int p_col, - int p_row, int p_width, int p_height, - int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, - int bd, ConvolveParams *conv_params) { - int i, j; - 
ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); - if (projectpoints == NULL) return; - for (i = p_row; i < p_row + p_height; ++i) { - for (j = p_col; j < p_col + p_width; ++j) { - int in[2], out[2]; - in[0] = j; - in[1] = i; - projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y); - out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS); - out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS); - if (conv_params->do_average) - pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO( - pred[(j - p_col) + (i - p_row) * p_stride] + - highbd_warp_interpolate(ref, out[0], out[1], width, height, - stride, bd), - 1); - else - pred[(j - p_col) + (i - p_row) * p_stride] = highbd_warp_interpolate( - ref, out[0], out[1], width, height, stride, bd); - } - } -} - /* Note: For an explanation of the warp algorithm, and some notes on bit widths for hardware implementations, see the comments above av1_warp_affine_c */ @@ -938,37 +425,23 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; - int i, j, k, l, m; -#if CONFIG_CONVOLVE_ROUND - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = - use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - use_conv_params - ? bd + FILTER_BITS + 1 - conv_params->round_0 - : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = - use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - use_conv_params - ? 
bd + 2 * FILTER_BITS - conv_params->round_0 - : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; - if (use_conv_params) { - conv_params->do_post_rounding = 1; - } - assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; -#endif + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); - for (i = p_row; i < p_row + p_height; i += 8) { - for (j = p_col; j < p_col + p_width; j += 8) { + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, @@ -980,9 +453,9 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int32_t x4 = dst_x >> subsampling_x; const int32_t y4 = dst_y >> subsampling_y; - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; int32_t sy4 
= y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4); @@ -992,15 +465,11 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter - for (k = -7; k < 8; ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; + for (int k = -7; k < 8; ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); int sx = sx4 + beta * (k + 4); - for (l = -4; l < 4; ++l) { + for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; @@ -1008,12 +477,8 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int16_t *coeffs = warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; - for (m = 0; m < 8; ++m) { - int sample_x = ix + m; - if (sample_x < 0) - sample_x = 0; - else if (sample_x > width - 1) - sample_x = width - 1; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); sum += ref[iy * stride + sample_x] * coeffs[m]; } sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); @@ -1024,46 +489,50 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, } // Vertical filter - for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); - for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); const int16_t *coeffs = warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; - for (m = 0; m < 8; ++m) { + for (int m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } -#if CONFIG_CONVOLVE_ROUND - if (use_conv_params) { + + if (conv_params->is_compound) { CONV_BUF_TYPE *p 
= &conv_params ->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - (1 << (offset_bits_horiz + FILTER_BITS - - conv_params->round_0 - conv_params->round_1)) - - (1 << (offset_bits_vert - conv_params->round_1)); - if (conv_params->do_average) - *p += sum; - else + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint16_t *dst16 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_jnt_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst16 = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd); + } else { *p = sum; + } } else { -#else - { -#endif uint16_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); - uint16_t px = - clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); - if (conv_params->do_average) - *p = ROUND_POWER_OF_TWO(*p + px, 1); - else - *p = px; + *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); } sy += gamma; } @@ -1076,32 +545,25 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, int stride, const uint8_t *const pred8, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int x_scale, - int y_scale, int bd, + int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; wm->wmmat[4] = -wm->wmmat[3]; } - if ((wm->wmtype == ROTZOOM || 
wm->wmtype == AFFINE) && - x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) { - const int32_t *const mat = wm->wmmat; - const int16_t alpha = wm->alpha; - const int16_t beta = wm->beta; - const int16_t gamma = wm->gamma; - const int16_t delta = wm->delta; - - const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, - subsampling_y, bd, conv_params, alpha, beta, gamma, - delta); - } else { - highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd, conv_params); - } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); } static int64_t highbd_frame_error(const uint16_t *const ref, int stride, @@ -1120,25 +582,25 @@ static int64_t highbd_frame_error(const uint16_t *const ref, int stride, static int64_t highbd_warp_error( WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, int subsampling_y, - int x_scale, int y_scale, int bd, int64_t best_error) { + int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, + int64_t best_error) { int64_t gm_sumerr = 0; - int warp_w, warp_h; - int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); - int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + const int error_bsize_w = 
AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0); + ConvolveParams conv_params = get_conv_params(0, 0, 0, bd); + conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { // avoid warping extra 8x8 blocks in the padded region of the frame // when p_width and p_height are not multiples of WARP_ERROR_BLOCK - warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); - warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); highbd_warp_plane(wm, ref8, width, height, stride, CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale, - y_scale, bd, &conv_params); + WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd, + &conv_params); gm_sumerr += highbd_frame_error( tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride, @@ -1148,41 +610,11 @@ static int64_t highbd_warp_error( } return gm_sumerr; } -#endif // CONFIG_HIGHBITDEPTH static INLINE int error_measure(int err) { return error_measure_lut[255 + err]; } -static void warp_plane_old(const WarpedMotionParams *const wm, - const uint8_t *const ref, int width, int height, - int stride, uint8_t *pred, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int x_scale, - int y_scale, ConvolveParams *conv_params) { - int i, j; - ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype); - if (projectpoints == NULL) return; - for (i = p_row; i < p_row + p_height; ++i) { - for (j = p_col; j < p_col + p_width; ++j) { - int in[2], out[2]; - in[0] = j; - in[1] = i; - projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, 
subsampling_y); - out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS); - out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS); - if (conv_params->do_average) - pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO( - pred[(j - p_col) + (i - p_row) * p_stride] + - warp_interpolate(ref, out[0], out[1], width, height, stride), - 1); - else - pred[(j - p_col) + (i - p_row) * p_stride] = - warp_interpolate(ref, out[0], out[1], width, height, stride); - } - } -} - /* The warp filter for ROTZOOM and AFFINE models works as follows: * Split the input into 8x8 blocks * For each block, project the point (4, 4) within the block, to get the @@ -1237,10 +669,10 @@ static void warp_plane_old(const WarpedMotionParams *const wm, This allows the derivation of the appropriate bit widths and offsets for the various intermediate values: If - F := WARPEDPIXEL_FILTER_BITS = 7 (or else the above ranges need adjusting) + F := FILTER_BITS = 7 (or else the above ranges need adjusting) So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit intermediate value. - H := HORSHEAR_REDUCE_PREC_BITS + H := ROUND0_BITS V := VERSHEAR_REDUCE_PREC_BITS (and note that we must have H + V = 2*F for the output to have the same scale as the input) @@ -1275,38 +707,23 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; - int i, j, k, l, m; const int bd = 8; -#if CONFIG_CONVOLVE_ROUND - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = - use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - use_conv_params - ? bd + FILTER_BITS + 1 - conv_params->round_0 - : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = - use_conv_params ? 
bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - use_conv_params - ? bd + 2 * FILTER_BITS - conv_params->round_0 - : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; - if (use_conv_params) { - conv_params->do_post_rounding = 1; - } - assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; -#endif + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); - for (i = p_row; i < p_row + p_height; i += 8) { - for (j = p_col; j < p_col + p_width; j += 8) { + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, @@ -1330,17 +747,13 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter - for (k = -7; k < 8; ++k) { + for (int k = -7; k < 8; ++k) { // Clamp to top/bottom edge of the frame - int iy = iy4 + k; 
- if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; + const int iy = clamp(iy4 + k, 0, height - 1); int sx = sx4 + beta * (k + 4); - for (l = -4; l < 4; ++l) { + for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; // At this point, sx = sx4 + alpha * l + beta * k const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + @@ -1349,13 +762,9 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int16_t *coeffs = warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; - for (m = 0; m < 8; ++m) { + for (int m = 0; m < 8; ++m) { // Clamp to left/right edge of the frame - int sample_x = ix + m; - if (sample_x < 0) - sample_x = 0; - else if (sample_x > width - 1) - sample_x = width - 1; + const int sample_x = clamp(ix + m, 0, width - 1); sum += ref[iy * stride + sample_x] * coeffs[m]; } @@ -1367,9 +776,9 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, } // Vertical filter - for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); - for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { // At this point, sy = sy4 + gamma * l + delta * k const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; @@ -1377,36 +786,40 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int16_t *coeffs = warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; - for (m = 0; m < 8; ++m) { + for (int m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } -#if CONFIG_CONVOLVE_ROUND - if (use_conv_params) { + + if (conv_params->is_compound) { CONV_BUF_TYPE *p = &conv_params ->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - (1 << (offset_bits_horiz + FILTER_BITS - - 
conv_params->round_0 - conv_params->round_1)) - - (1 << (offset_bits_vert - conv_params->round_1)); - if (conv_params->do_average) - *p += sum; - else + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint8_t *dst8 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_jnt_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); + } else { *p = sum; + } } else { -#else - { -#endif uint8_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); - uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); - if (conv_params->do_average) - *p = ROUND_POWER_OF_TWO(*p + px, 1); - else - *p = px; + *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); } sy += gamma; } @@ -1419,27 +832,20 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, - int x_scale, int y_scale, ConvolveParams *conv_params) { + ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; wm->wmmat[4] = -wm->wmmat[3]; } - if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && - x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) { - const int32_t *const mat = wm->wmmat; - const int16_t alpha = wm->alpha; - const int16_t beta = wm->beta; - const int16_t gamma = wm->gamma; - const int16_t delta = wm->delta; - - 
av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, subsampling_y, - conv_params, alpha, beta, gamma, delta); - } else { - warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width, - p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale, conv_params); - } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); } static int64_t frame_error(const uint8_t *const ref, int stride, @@ -1459,14 +865,15 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, int width, int height, int stride, const uint8_t *const dst, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int x_scale, - int y_scale, int64_t best_error) { + int subsampling_x, int subsampling_y, + int64_t best_error) { int64_t gm_sumerr = 0; int warp_w, warp_h; int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0); + ConvolveParams conv_params = get_conv_params(0, 0, 0, 8); + conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { @@ -1475,8 +882,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale, - y_scale, &conv_params); + 
WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, warp_w, warp_h, p_stride); @@ -1486,70 +892,49 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, return gm_sumerr; } -int64_t av1_frame_error( -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH - const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, - int p_stride) { -#if CONFIG_HIGHBITDEPTH +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride) { if (use_hbd) { return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width, p_height, p_stride, bd); } -#endif // CONFIG_HIGHBITDEPTH return frame_error(ref, stride, dst, p_width, p_height, p_stride); } -int64_t av1_warp_error(WarpedMotionParams *wm, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *dst, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, - int64_t best_error) { + int subsampling_y, int64_t best_error) { if (wm->wmtype <= AFFINE) if (!get_shear_params(wm)) return 1; -#if CONFIG_HIGHBITDEPTH if (use_hbd) return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd, best_error); -#endif // CONFIG_HIGHBITDEPTH + subsampling_y, bd, best_error); return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, - p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale, best_error); + p_height, p_stride, subsampling_x, subsampling_y, + best_error); } -void av1_warp_plane(WarpedMotionParams *wm, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int 
bd, -#endif // CONFIG_HIGHBITDEPTH +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, - ConvolveParams *conv_params) { -#if CONFIG_HIGHBITDEPTH + int subsampling_y, ConvolveParams *conv_params) { if (use_hbd) highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, - x_scale, y_scale, bd, conv_params); + bd, conv_params); else -#endif // CONFIG_HIGHBITDEPTH warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, - p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale, conv_params); + p_height, p_stride, subsampling_x, subsampling_y, conv_params); } -#if CONFIG_WARPED_MOTION -#define LEAST_SQUARES_ORDER 2 - #define LS_MV_MAX 256 // max mv in 1/8-pel -#define LS_STEP 2 +// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By. +#define LS_STEP 8 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8, // the precision needed is: @@ -1570,13 +955,17 @@ void av1_warp_plane(WarpedMotionParams *wm, #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1))) #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1) -#define LS_SUM(a) ((a)*4 + LS_STEP * 2) -#define LS_SQUARE(a) \ - (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2) -#define LS_PRODUCT1(a, b) \ - (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2) -#define LS_PRODUCT2(a, b) \ - (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2) +// By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are +// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here. 
+#define LS_SQUARE(a) \ + (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT1(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT2(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) #define USE_LIMITED_PREC_MULT 0 @@ -1655,22 +1044,24 @@ static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { } #endif // USE_LIMITED_PREC_MULT -static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, - int mvy, int mvx, WarpedMotionParams *wm, int mi_row, - int mi_col) { +static int find_affine_int(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm, int mi_row, int mi_col) { int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; int32_t Bx[2] = { 0, 0 }; int32_t By[2] = { 0, 0 }; - int i, n = 0; + int i; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1); - const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1); - const int suy = isuy * 8; - const int sux = isux * 8; + const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1); + const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1); + const int suy = rsuy * 8; + const int sux = rsux * 8; const int duy = suy + mvy; const int dux = sux + mvx; + const int isuy = (mi_row * MI_SIZE + rsuy); + const int isux = (mi_col * MI_SIZE + rsux); // Assume the center pixel of the block has exactly the same motion vector // as transmitted for the block. First shift the origin of the source @@ -1694,13 +1085,15 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, // // The loop below computes: A = P'P, Bx = P'q, By = P'r // We need to just compute inv(A).Bx and inv(A).By for the solutions. 
- int sx, sy, dx, dy; // Contribution from neighbor block - for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) { - dx = pts2[i * 2] - dux; - dy = pts2[i * 2 + 1] - duy; - sx = pts1[i * 2] - sux; - sy = pts1[i * 2 + 1] - suy; + for (i = 0; i < np; i++) { + const int dx = pts2[i * 2] - dux; + const int dy = pts2[i * 2 + 1] - duy; + const int sx = pts1[i * 2] - sux; + const int sy = pts1[i * 2 + 1] - suy; + // (TODO)yunqing: This comparison wouldn't be necessary if the sample + // selection is done in find_samples(). Also, global offset can be removed + // while collecting samples. if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) { A[0][0] += LS_SQUARE(sx); A[0][1] += LS_PRODUCT1(sx, sy); @@ -1709,41 +1102,20 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, Bx[1] += LS_PRODUCT1(sy, dx); By[0] += LS_PRODUCT1(sx, dy); By[1] += LS_PRODUCT2(sy, dy); - n++; } } - int downshift; - if (n >= 4) - downshift = LS_MAT_DOWN_BITS; - else if (n >= 2) - downshift = LS_MAT_DOWN_BITS - 1; - else - downshift = LS_MAT_DOWN_BITS - 2; - - // Reduce precision by downshift bits - A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN, - LS_MAT_MAX); - A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN, - LS_MAT_MAX); - A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN, - LS_MAT_MAX); - Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN, - LS_MAT_MAX); - Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN, - LS_MAT_MAX); - By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN, - LS_MAT_MAX); - By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN, - LS_MAT_MAX); - - int64_t Px[2], Py[2], Det; - int16_t iDet, shift; - // These divided by the Det, are the least squares solutions - Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; - Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; - Py[0] = 
(int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1]; - Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1]; + // Just for debugging, and can be removed later. + assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX); + assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX); + assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX); + assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX); + assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX); + assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); + assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); + + int64_t Det; + int16_t iDet, shift; // Compute Determinant of A Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; @@ -1755,6 +1127,14 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, shift = 0; } + int64_t Px[2], Py[2]; + + // These divided by the Det, are the least squares solutions + Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; + Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; + Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1]; + Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1]; + wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); @@ -1783,13 +1163,13 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm_params, int mi_row, int mi_col) { assert(wm_params->wmtype == AFFINE); - const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, - mi_row, mi_col); - if (result == 0) { - // check compatibility with the fast warp filter - if (!get_shear_params(wm_params)) return 1; - } - return result; + if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, + mi_col)) + return 1; + + // check compatibility with the fast warp filter + if (!get_shear_params(wm_params)) return 1; + + return 0; } -#endif // 
CONFIG_WARPED_MOTION diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index e05f6a85f..f5da36bbb 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -18,94 +18,79 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "av1/common/mv.h" #include "av1/common/convolve.h" #define MAX_PARAMDIM 9 -#if CONFIG_WARPED_MOTION #define LEAST_SQUARES_SAMPLES_MAX_BITS 3 #define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS) - -#if WARPED_MOTION_SORT_SAMPLES -// Search 1 row on the top and 1 column on the left, 1 upper-left block, -// 1 upper-right block. -#define SAMPLES_ARRAY_SIZE ((MAX_MIB_SIZE * 2 + 2) * 2) -#else #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) -#endif // WARPED_MOTION_SORT_SAMPLES - +#define WARPED_MOTION_DEBUG 0 #define DEFAULT_WMTYPE AFFINE -#endif // CONFIG_WARPED_MOTION extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; -typedef void (*ProjectPointsFunc)(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, - const int subsampling_x, - const int subsampling_y); - -void project_points_translation(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y); - -void project_points_rotzoom(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y); +static const uint8_t warp_pad_left[14][16] = { + { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 5, 5, 5, 5, 5, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, + { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, + { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, + { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, + { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, + { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, +}; + +static const uint8_t warp_pad_right[14][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, + { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, + { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, + { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } +}; void project_points_affine(const int32_t *mat, int *points, int *proj, const int n, const int stride_points, const int stride_proj, const int subsampling_x, const int subsampling_y); -void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y); -void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const 
int subsampling_x, - const int subsampling_y); -void project_points_homography(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y); - // Returns the error between the result of applying motion 'wm' to the frame // described by 'ref' and the frame described by 'dst'. -int64_t av1_warp_error(WarpedMotionParams *wm, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *dst, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, - int64_t best_error); + int subsampling_y, int64_t best_error); // Returns the error between the frame described by 'ref' and the frame // described by 'dst'. -int64_t av1_frame_error( -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH - const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, - int p_stride); +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride); -void av1_warp_plane(WarpedMotionParams *wm, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, - ConvolveParams *conv_params); + int subsampling_y, ConvolveParams *conv_params); int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm_params, int mi_row, diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c 
new file mode 100644 index 000000000..8aa14696f --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "av1/common/resize.h" +#include "aom_dsp/x86/synonyms.h" + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. 
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + const uint8_t *src_y; + uint8_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint8_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x1 = + &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> 
RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 8-bit input data, so each load + // gets 8 pixels. + const __m128i src0_8 = xx_loadl_64(src_x0); + const __m128i src1_8 = xx_loadl_64(src_x1); + const __m128i src2_8 = xx_loadl_64(src_x2); + const __m128i src3_8 = xx_loadl_64(src_x3); + + // Now zero-extend up to 16-bit precision, i.e. + // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ] + const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8); + const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8); + const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8); + const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. + // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Pack 16-bit values into 8-bit values, i.e. 
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) + // -> [ 0 0 0 0 0 0 DC BA ] + const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); + + // Write to the output + xx_storel_32(&dst_y[x], shifted_8); + } + } +} + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. +void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + assert(bd == 8 || bd == 10 || bd == 12); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); + + const uint16_t *src_y; + uint16_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = 
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint16_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x1 = + &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 16-bit input data, so each load + // gets 8 pixels. + const __m128i src0_16 = xx_loadu_128(src_x0); + const __m128i src1_16 = xx_loadu_128(src_x1); + const __m128i src2_16 = xx_loadu_128(src_x2); + const __m128i src3_16 = xx_loadu_128(src_x3); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. + // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. 
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Clip the values at (1 << bd) - 1 + const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum); + + // Write to the output + xx_storel_64(&dst_y[x], clipped_16); + } + } +} diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c index 1f0fedb2a..6747cae01 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -12,135 +12,16 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -// Make a mask for coefficients of 10/12 tap filters. The coefficients are -// packed "89ab89ab". If it's a 12-tap filter, we want all 1's; if it's a -// 10-tap filter, we want "11001100" to just match the 8,9 terms. -static __m128i make_1012_mask(int ntaps) { - uint32_t low = 0xffffffff; - uint32_t high = (ntaps == 12) ? low : 0; - return _mm_set_epi32(high, low, high, low); -} - -// Zero-extend the given input operand to an entire __m128i register. -// -// Note that there's almost an intrinsic to do this but 32-bit Visual Studio -// doesn't have _mm_set_epi64x so we have to do it by hand. -static __m128i extend_32_to_128(uint32_t x) { - return _mm_set_epi32(0, 0, 0, x); -} - -// Load an SSE register from p and bitwise AND with a. -static __m128i load_and_128i(const void *p, __m128i a) { - const __m128d ad = _mm_castsi128_pd(a); - const __m128d bd = _mm_load1_pd((const double *)p); - return _mm_castpd_si128(_mm_and_pd(ad, bd)); -} - -// The horizontal filter for av1_convolve_2d_scale_sse4_1. This is the more -// general version, supporting 10 and 12 tap filters. For 8-tap filters, use -// hfilter8. 
-static void hfilter(const uint8_t *src, int src_stride, int32_t *dst, int w, - int h, int subpel_x_qn, int x_step_qn, - const InterpFilterParams *filter_params, unsigned round) { - const int bd = 8; - const int ntaps = filter_params->taps; - assert(ntaps == 10 || ntaps == 12); - - src -= ntaps / 2 - 1; - - // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero - // out the unneeded entries. - const __m128i hicoeff_mask = make_1012_mask(ntaps); - - int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(round); - - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); - - // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the - // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they - // are masked out with hicoeff_mask. - const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); - const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask); - const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); - - int y; - for (y = 0; y <= h - 4; y += 4) { - const uint8_t *const src0 = src_col + y * src_stride; - const uint8_t *const src1 = src0 + 1 * src_stride; - const uint8_t *const src2 = src0 + 2 * src_stride; - const uint8_t *const src3 = src0 + 3 * src_stride; - - // Load up source data. 
This is 8-bit input data, so each load gets 16 - // pixels (we need at most 12) - const __m128i data08 = _mm_loadu_si128((__m128i *)src0); - const __m128i data18 = _mm_loadu_si128((__m128i *)src1); - const __m128i data28 = _mm_loadu_si128((__m128i *)src2); - const __m128i data38 = _mm_loadu_si128((__m128i *)src3); - - // Now zero-extend up to 16-bit precision by interleaving with zeros. For - // the "high" pixels (8 to 11), interleave first (so that the expansion - // to 16-bits operates on an entire register). - const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); - const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); - const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); - const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); - const __m128i data01hi8 = _mm_unpackhi_epi32(data08, data18); - const __m128i data23hi8 = _mm_unpackhi_epi32(data28, data38); - const __m128i data01hi = _mm_unpacklo_epi8(data01hi8, zero); - const __m128i data23hi = _mm_unpacklo_epi8(data23hi8, zero); - - // Multiply by coefficients - const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); - const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); - const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); - const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); - const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi); - const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi); - - // Reduce horizontally and add - const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); - const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); - const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo); - const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi); - const __m128i conv = _mm_add_epi32(convlo, convhi); - - // Divide down by (1 << round), rounding to nearest. 
- const __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); - - // Write transposed to the output - _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); - } - for (; y < h; ++y) { - const uint8_t *const src_row = src_col + y * src_stride; - - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (int k = 0; k < ntaps; ++k) { - sum += filter[k] * src_row[k]; - } - - dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); - } - } -} - // A specialised version of hfilter, the horizontal filter for // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. -static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w, +static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, int h, int subpel_x_qn, int x_step_qn, const InterpFilterParams *filter_params, unsigned round) { const int bd = 8; @@ -150,7 +31,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w, int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(round); + const __m128i round_shift = _mm_cvtsi32_si128(round); int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { @@ -197,11 +78,12 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w, const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); // Divide down by (1 << round), rounding to nearest. 
- const __m128i shifted = + __m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + shifted = _mm_packus_epi32(shifted, shifted); // Write transposed to the output - _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); } for (; y < h; ++y) { const uint8_t *const src_row = src_col + y * src_stride; @@ -216,256 +98,179 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w, } } -// Do a 12-tap convolution with the given coefficients, loading data from src. -static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47, - __m128i coeff8d) { - const __m128i data03 = _mm_loadu_si128((__m128i *)src); - const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4)); - const __m128i data8d = _mm_loadu_si128((__m128i *)(src + 8)); - const __m128i conv03 = _mm_mullo_epi32(data03, coeff03); - const __m128i conv47 = _mm_mullo_epi32(data47, coeff47); - const __m128i conv8d = _mm_mullo_epi32(data8d, coeff8d); - return _mm_add_epi32(_mm_add_epi32(conv03, conv47), conv8d); -} - -// Do an 8-tap convolution with the given coefficients, loading data from src. -static __m128i convolve_32_8(const int32_t *src, __m128i coeff03, - __m128i coeff47) { - const __m128i data03 = _mm_loadu_si128((__m128i *)src); - const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4)); - const __m128i conv03 = _mm_mullo_epi32(data03, coeff03); - const __m128i conv47 = _mm_mullo_epi32(data47, coeff47); - return _mm_add_epi32(conv03, conv47); -} - -// The vertical filter for av1_convolve_2d_scale_sse4_1. This is the more -// general version, supporting 10 and 12 tap filters. For 8-tap filters, use -// vfilter8. 
-static void vfilter(const int32_t *src, int src_stride, int32_t *dst, - int dst_stride, int w, int h, int subpel_y_qn, - int y_step_qn, const InterpFilterParams *filter_params, - const ConvolveParams *conv_params, int bd) { - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int ntaps = filter_params->taps; - - // Construct a mask with which we'll AND filter coefficients 89ab to zero out - // the unneeded entries. The upper bits of this mask are unused. - const __m128i hicoeff_mask = make_1012_mask(ntaps); - - int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(conv_params->round_1); - - const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - const __m128i sub = _mm_set1_epi32(sub32); - - int y_qn = subpel_y_qn; - for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); - - // Load up coefficients for the filter and sign-extend to 32-bit precision - // (to do so, calculate sign bits and then interleave) - const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); - const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); - const __m128i coeffhi16 = load_and_128i(filter + 8, hicoeff_mask); - const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero); - const __m128i csignhi16 = _mm_cmplt_epi16(coeffhi16, zero); - const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716); - const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716); - const __m128i coeff8d = _mm_unpacklo_epi16(coeffhi16, csignhi16); - - int x; - for (x = 0; x <= w - 4; x += 4) { - const int32_t *const src0 = src_y 
+ x * src_stride; - const int32_t *const src1 = src0 + 1 * src_stride; - const int32_t *const src2 = src0 + 2 * src_stride; - const int32_t *const src3 = src0 + 3 * src_stride; - - // Load the source data for the three rows, adding the three registers of - // convolved products to one as we go (conv0..conv3) to avoid the - // register pressure getting too high. - const __m128i conv0 = convolve_32(src0, coeff03, coeff47, coeff8d); - const __m128i conv1 = convolve_32(src1, coeff03, coeff47, coeff8d); - const __m128i conv2 = convolve_32(src2, coeff03, coeff47, coeff8d); - const __m128i conv3 = convolve_32(src3, coeff03, coeff47, coeff8d); - - // Now reduce horizontally to get one lane for each result - const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); - const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); - const __m128i conv = _mm_hadd_epi32(conv01, conv23); - - // Divide down by (1 << round_1), rounding to nearest and subtract sub32. - const __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); - const __m128i subbed = _mm_sub_epi32(shifted, sub); - - int32_t *dst_x = dst + y * dst_stride + x; - const __m128i result = - (conv_params->do_average) - ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)) - : subbed; - - _mm_storeu_si128((__m128i *)dst_x, result); - } - for (; x < w; ++x) { - const int32_t *src_x = src_y + x * src_stride; - CONV_BUF_TYPE sum = 1 << offset_bits; - for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32; - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; - } - } +static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { + __m128i data = _mm_loadu_si128((__m128i *)src); + return _mm_madd_epi16(data, coeff); } // A specialised version of vfilter, the vertical filter for // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
-static void vfilter8(const int32_t *src, int src_stride, int32_t *dst, +static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int subpel_y_qn, int y_step_qn, const InterpFilterParams *filter_params, const ConvolveParams *conv_params, int bd) { const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int ntaps = 8; - int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(conv_params->round_1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); - const __m128i sub = _mm_set1_epi32(sub32); + const __m128i sub = _mm_set1_epi16(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); int y_qn = subpel_y_qn; for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); - // Load up coefficients for the filter and sign-extend to 32-bit 
precision - // (to do so, calculate sign bits and then interleave) - const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); - const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero); - const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716); - const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716); - int x; for (x = 0; x <= w - 4; x += 4) { - const int32_t *const src0 = src_y + x * src_stride; - const int32_t *const src1 = src0 + 1 * src_stride; - const int32_t *const src2 = src0 + 2 * src_stride; - const int32_t *const src3 = src0 + 3 * src_stride; + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; // Load the source data for the three rows, adding the three registers of // convolved products to one as we go (conv0..conv3) to avoid the // register pressure getting too high. - const __m128i conv0 = convolve_32_8(src0, coeff03, coeff47); - const __m128i conv1 = convolve_32_8(src1, coeff03, coeff47); - const __m128i conv2 = convolve_32_8(src2, coeff03, coeff47); - const __m128i conv3 = convolve_32_8(src3, coeff03, coeff47); + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); // Now reduce horizontally to get one lane for each result const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); - const __m128i conv = _mm_hadd_epi32(conv01, conv23); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + conv = _mm_add_epi32(conv, res_add_const); // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
- const __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); - const __m128i subbed = _mm_sub_epi32(shifted, sub); - - int32_t *dst_x = dst + y * dst_stride + x; - const __m128i result = - (conv_params->do_average) - ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)) - : subbed; - - _mm_storeu_si128((__m128i *)dst_x, result); + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint8_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + __m128i result; + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); + } + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } else { + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } } for (; x < w; ++x) { - const int32_t *src_x = src_y + x * src_stride; - CONV_BUF_TYPE sum = 1 << offset_bits; + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; - CONV_BUF_TYPE res = 
ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32; - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - sub32; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } } } } - void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, + uint8_t *dst8, int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { - int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + // TODO(yaowu): remove unnecessary initializations + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 }; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; const int xtaps = filter_params_x->taps; const int ytaps = filter_params_y->taps; - const int fo_vert = ytaps / 2 - 1; + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; // horizontal filter - if (xtaps == 8) - hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, - x_step_qn, 
filter_params_x, conv_params->round_0); - else - hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, - x_step_qn, filter_params_x, conv_params->round_0); + hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); // vertical filter (input is transposed) - if (ytaps == 8) - vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, 8); - else - vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, 8); + vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); } -#if CONFIG_HIGHBITDEPTH -// An wrapper to generate the SHUFPD instruction with __m128i types (just -// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the -// casts) -static __m128i mm_shuffle0_si128(__m128i a, __m128i b) { - __m128d ad = _mm_castsi128_pd(a); - __m128d bd = _mm_castsi128_pd(b); - return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0)); -} - -// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This -// is the more general version, supporting 10 and 12 tap filters. For -// 8-tap filters, use hfilter8. -static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst, - int w, int h, int subpel_x_qn, int x_step_qn, - const InterpFilterParams *filter_params, - unsigned round, int bd) { - const int ntaps = filter_params->taps; - assert(ntaps == 10 || ntaps == 12); +// A specialised version of hfilter, the horizontal filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. 
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, + unsigned round, int bd) { + const int ntaps = 8; src -= ntaps / 2 - 1; - // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero - // out the unneeded entries. - const __m128i hicoeff_mask = make_1012_mask(ntaps); - int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(round); + const __m128i round_shift = _mm_cvtsi32_si128(round); int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { @@ -475,11 +280,8 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst, const int16_t *filter = av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); - // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the - // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they - // are masked out with hicoeff_mask. + // Load the filter coefficients const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); - const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask); int y; for (y = 0; y <= h - 4; y += 4) { @@ -488,43 +290,31 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst, const uint16_t *const src2 = src0 + 2 * src_stride; const uint16_t *const src3 = src0 + 3 * src_stride; - // Load up source data. This is 16-bit input data, so each load gets 8 - // pixels (we need at most 12) + // Load up source data. This is 16-bit input data, so each load gets the 8 + // pixels we need. 
const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); - const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8)); - const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8)); - const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8)); - const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8)); - - // The "hi" data has rubbish in the top half so interleave pairs together - // to minimise the calculation we need to do. - const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi); - const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi); // Multiply by coefficients const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); - const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi); - const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi); // Reduce horizontally and add const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); - const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo); - const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi); - const __m128i conv = _mm_add_epi32(convlo, convhi); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); // Divide down by (1 << round), rounding to nearest. 
- const __m128i shifted = + __m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + shifted = _mm_packus_epi32(shifted, shifted); // Write transposed to the output - _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); } for (; y < h; ++y) { const uint16_t *const src_row = src_col + y * src_stride; @@ -538,108 +328,173 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst, } } } - -// A specialised version of hfilter, the horizontal filter for +// A specialised version of vfilter, the vertical filter for // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap // filters. -static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst, - int w, int h, int subpel_x_qn, int x_step_qn, +static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, - unsigned round, int bd) { + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int ntaps = 8; - src -= ntaps / 2 - 1; + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = extend_32_to_128(round); + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const __m128i clip_pixel_ = + 
_mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); - // Load the filter coefficients - const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); - - int y; - for (y = 0; y <= h - 4; y += 4) { - const uint16_t *const src0 = src_col + y * src_stride; - const uint16_t *const src1 = src0 + 1 * src_stride; - const uint16_t *const src2 = src0 + 2 * src_stride; - const uint16_t *const src3 = src0 + 3 * src_stride; - - // Load up source data. This is 16-bit input data, so each load gets the 8 - // pixels we need. 
- const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); - const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); - const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); - const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); - - // Multiply by coefficients - const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); - const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); - const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); - const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; - // Reduce horizontally and add - const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); - const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); - const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); - // Divide down by (1 << round), rounding to nearest. 
- const __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + conv = _mm_add_epi32(conv, res_add_const); - // Write transposed to the output - _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint16_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + + __m128i result; + if (conv_params->is_compound) { + if (conv_params->do_average) { + __m128i p_32 = + _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); + + if (conv_params->use_jnt_comp_avg) { + shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(shifted, wt1)); + shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); + } else { + shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); + } + __m128i res32 = _mm_sub_epi32(shifted, sub); + res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const), + round_bits_shift); + + __m128i res16 = _mm_packus_epi32(res32, res32); + res16 = _mm_min_epi16(res16, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, res16); + } else { + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi32(shifted, sub); + result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift); + result = _mm_packus_epi32(result, result); + result = _mm_min_epi16(result, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, result); + } } - for (; y < h; ++y) { - const uint16_t *const src_row = src_col + y * src_stride; - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (int k = 0; k < ntaps; ++k) { - sum 
+= filter[k] * src_row[k]; + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } - - dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); } } } void av1_highbd_convolve_2d_scale_sse4_1( - const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, - int w, int h, InterpFilterParams *filter_params_x, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { - int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + // TODO(yaowu): Move this out of stack + DECLARE_ALIGNED(16, int16_t, + tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; - const int xtaps = filter_params_x->taps; const int ytaps 
= filter_params_y->taps; const int fo_vert = ytaps / 2 - 1; + memset(tmp, 0, sizeof(tmp)); + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + // horizontal filter - if (xtaps == 8) - highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, - subpel_x_qn, x_step_qn, filter_params_x, - conv_params->round_0, bd); - else - highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, - subpel_x_qn, x_step_qn, filter_params_x, - conv_params->round_0, bd); + highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, + bd); // vertical filter (input is transposed) - if (ytaps == 8) - vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, bd); - else - vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, bd); + highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, bd); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c deleted file mode 100644 index e85c15eaf..000000000 --- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include - -#include "./aom_config.h" -#include "./av1_rtcd.h" -#include "av1/common/filter.h" - -#define WIDTH_BOUND (16) -#define HEIGHT_BOUND (16) - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER -DECLARE_ALIGNED(16, static int8_t, - sub_pel_filters_12sharp_signal_dir[15][2][16]); - -DECLARE_ALIGNED(16, static int8_t, - sub_pel_filters_12sharp_ver_signal_dir[15][6][16]); -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - -#if USE_TEMPORALFILTER_12TAP -DECLARE_ALIGNED(16, static int8_t, - sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]); - -DECLARE_ALIGNED(16, static int8_t, - sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]); -#endif - -typedef int8_t (*SubpelFilterCoeffs)[16]; - -static INLINE SubpelFilterCoeffs -get_subpel_filter_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - if (p.interp_filter == MULTITAP_SHARP) { - return &sub_pel_filters_12sharp_signal_dir[index][0]; - } -#endif -#if USE_TEMPORALFILTER_12TAP - if (p.interp_filter == TEMPORALFILTER_12TAP) { - return &sub_pel_filters_temporalfilter_12_signal_dir[index][0]; - } -#endif - (void)p; - (void)index; - return NULL; -} - -static INLINE SubpelFilterCoeffs -get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - if (p.interp_filter == MULTITAP_SHARP) { - return &sub_pel_filters_12sharp_ver_signal_dir[index][0]; - } -#endif -#if USE_TEMPORALFILTER_12TAP - if (p.interp_filter == TEMPORALFILTER_12TAP) { - return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0]; - } -#endif - (void)p; - (void)index; - return NULL; -} - -static INLINE void transpose_4x8(const __m128i *in, __m128i *out) { - __m128i t0, t1; - - t0 = _mm_unpacklo_epi16(in[0], in[1]); - t1 = _mm_unpacklo_epi16(in[2], in[3]); - - out[0] = _mm_unpacklo_epi32(t0, t1); - out[1] = _mm_srli_si128(out[0], 8); - out[2] = _mm_unpackhi_epi32(t0, t1); - out[3] = _mm_srli_si128(out[2], 8); - - t0 
= _mm_unpackhi_epi16(in[0], in[1]); - t1 = _mm_unpackhi_epi16(in[2], in[3]); - - out[4] = _mm_unpacklo_epi32(t0, t1); - out[5] = _mm_srli_si128(out[4], 8); - // Note: We ignore out[6] and out[7] because - // they're zero vectors. -} - -typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst); - -static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - __m128i y = _mm_loadl_epi64((__m128i const *)src); - y = _mm_unpacklo_epi8(y, zero); - y = _mm_add_epi16(*x, y); - y = _mm_add_epi16(y, one); - y = _mm_srai_epi16(y, 1); - y = _mm_packus_epi16(y, y); - return y; -} - -static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) { - uint32_t temp; - __m128i u = _mm_packus_epi16(*x, *x); - temp = _mm_cvtsi128_si32(u); - *(uint16_t *)dst = (uint16_t)temp; -} - -static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) { - uint32_t temp; - __m128i y = accumulate_store(x, dst); - temp = _mm_cvtsi128_si32(y); - *(uint16_t *)dst = (uint16_t)temp; -} - -static store_pixel_t store2pixelTab[2] = { store_2_pixel_only, - accumulate_store_2_pixel }; - -static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) { - __m128i u = _mm_packus_epi16(*x, *x); - *(int *)dst = _mm_cvtsi128_si32(u); -} - -static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) { - __m128i y = accumulate_store(x, dst); - *(int *)dst = _mm_cvtsi128_si32(y); -} - -static store_pixel_t store4pixelTab[2] = { store_4_pixel_only, - accumulate_store_4_pixel }; - -static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store_func, uint8_t *dst) { - __m128i sumPairRow[4]; - __m128i sumPairCol[8]; - __m128i pixel; - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i zero = _mm_setzero_si128(); - - assert(tapsNum == 10 || tapsNum == 12); - if (10 == tapsNum) { - src -= 1; - } - - pixel = 
_mm_loadu_si128((__m128i const *)src); - sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]); - sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]); - sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2); - - pixel = _mm_loadu_si128((__m128i const *)(src + 1)); - sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]); - sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]); - sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2); - - transpose_4x8(sumPairRow, sumPairCol); - - sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]); - sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]); - - sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]); - sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]); - - sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]); - sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]); - sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]); - - sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256); - sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]); - sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero); - - store_func(&sumPairRow[1], dst); -} - -static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *buf) { - horiz_w4_ssse3(src, f, tapsNum, store, buf); - src += 4; - buf += 4; - horiz_w4_ssse3(src, f, tapsNum, store, buf); -} - -static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *buf) { - horiz_w8_ssse3(src, f, tapsNum, store, buf); - src += 8; - buf += 8; - horiz_w8_ssse3(src, f, tapsNum, store, buf); -} - -static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *buf) { - horiz_w16_ssse3(src, f, tapsNum, store, buf); - src += 16; - buf += 16; - horiz_w16_ssse3(src, f, tapsNum, store, buf); -} - -static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *buf) { - horiz_w32_ssse3(src, f, 
tapsNum, store, buf); - src += 32; - buf += 32; - horiz_w32_ssse3(src, f, tapsNum, store, buf); -} - -static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *buf) { - horiz_w64_ssse3(src, f, tapsNum, store, buf); - src += 64; - buf += 64; - horiz_w64_ssse3(src, f, tapsNum, store, buf); -} - -static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t, - uint8_t *) = { - horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3, - horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3, -}; - -static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, - int width, store_pixel_t store, uint8_t *dst) { - switch (width) { - // Note: - // For width=2 and 4, store function must be different - case 2: - case 4: horizTab[0](src, f, tapsNum, store, dst); break; - case 8: horizTab[1](src, f, tapsNum, store, dst); break; - case 16: horizTab[2](src, f, tapsNum, store, dst); break; - case 32: horizTab[3](src, f, tapsNum, store, dst); break; - case 64: horizTab[4](src, f, tapsNum, store, dst); break; - case 128: horizTab[5](src, f, tapsNum, store, dst); break; - default: assert(0); - } -} - -// Vertical 8-pixel parallel -typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride, - uint8_t *dst, int dst_stride); - -static INLINE void transpose8x8_direct_to_dst(const uint16_t *src, - int src_stride, uint8_t *dst, - int dst_stride) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - __m128i v0, v1, v2, v3; - - __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 
6 * src_stride)); - __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); - - u0 = _mm_mulhrs_epi16(u0, k_256); - u1 = _mm_mulhrs_epi16(u1, k_256); - u2 = _mm_mulhrs_epi16(u2, k_256); - u3 = _mm_mulhrs_epi16(u3, k_256); - u4 = _mm_mulhrs_epi16(u4, k_256); - u5 = _mm_mulhrs_epi16(u5, k_256); - u6 = _mm_mulhrs_epi16(u6, k_256); - u7 = _mm_mulhrs_epi16(u7, k_256); - - v0 = _mm_packus_epi16(u0, u1); - v1 = _mm_packus_epi16(u2, u3); - v2 = _mm_packus_epi16(u4, u5); - v3 = _mm_packus_epi16(u6, u7); - - u0 = _mm_unpacklo_epi8(v0, v1); - u1 = _mm_unpackhi_epi8(v0, v1); - u2 = _mm_unpacklo_epi8(v2, v3); - u3 = _mm_unpackhi_epi8(v2, v3); - - u4 = _mm_unpacklo_epi8(u0, u1); - u5 = _mm_unpacklo_epi8(u2, u3); - u6 = _mm_unpackhi_epi8(u0, u1); - u7 = _mm_unpackhi_epi8(u2, u3); - - u0 = _mm_unpacklo_epi32(u4, u5); - u1 = _mm_unpackhi_epi32(u4, u5); - u2 = _mm_unpacklo_epi32(u6, u7); - u3 = _mm_unpackhi_epi32(u6, u7); - - u4 = _mm_srli_si128(u0, 8); - u5 = _mm_srli_si128(u1, 8); - u6 = _mm_srli_si128(u2, 8); - u7 = _mm_srli_si128(u3, 8); - - _mm_storel_epi64((__m128i *)dst, u0); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7); -} - -static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src, - int src_stride, uint8_t *dst, - int dst_stride) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - __m128i u2 = _mm_loadu_si128((__m128i 
const *)(src + 2 * src_stride)); - __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); - - u0 = _mm_mulhrs_epi16(u0, k_256); - u1 = _mm_mulhrs_epi16(u1, k_256); - u2 = _mm_mulhrs_epi16(u2, k_256); - u3 = _mm_mulhrs_epi16(u3, k_256); - u4 = _mm_mulhrs_epi16(u4, k_256); - u5 = _mm_mulhrs_epi16(u5, k_256); - u6 = _mm_mulhrs_epi16(u6, k_256); - u7 = _mm_mulhrs_epi16(u7, k_256); - - v0 = _mm_packus_epi16(u0, u1); - v1 = _mm_packus_epi16(u2, u3); - v2 = _mm_packus_epi16(u4, u5); - v3 = _mm_packus_epi16(u6, u7); - - u0 = _mm_unpacklo_epi8(v0, v1); - u1 = _mm_unpackhi_epi8(v0, v1); - u2 = _mm_unpacklo_epi8(v2, v3); - u3 = _mm_unpackhi_epi8(v2, v3); - - u4 = _mm_unpacklo_epi8(u0, u1); - u5 = _mm_unpacklo_epi8(u2, u3); - u6 = _mm_unpackhi_epi8(u0, u1); - u7 = _mm_unpackhi_epi8(u2, u3); - - u0 = _mm_unpacklo_epi32(u4, u5); - u1 = _mm_unpackhi_epi32(u4, u5); - u2 = _mm_unpacklo_epi32(u6, u7); - u3 = _mm_unpackhi_epi32(u6, u7); - - u4 = _mm_srli_si128(u0, 8); - u5 = _mm_srli_si128(u1, 8); - u6 = _mm_srli_si128(u2, 8); - u7 = _mm_srli_si128(u3, 8); - - v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride)); - v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride)); - v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); - v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); - v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride)); - v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride)); - v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride)); - v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride)); - - u0 = _mm_unpacklo_epi8(u0, zero); - u1 = _mm_unpacklo_epi8(u1, zero); - u2 = _mm_unpacklo_epi8(u2, zero); - u3 
= _mm_unpacklo_epi8(u3, zero); - u4 = _mm_unpacklo_epi8(u4, zero); - u5 = _mm_unpacklo_epi8(u5, zero); - u6 = _mm_unpacklo_epi8(u6, zero); - u7 = _mm_unpacklo_epi8(u7, zero); - - v0 = _mm_unpacklo_epi8(v0, zero); - v1 = _mm_unpacklo_epi8(v1, zero); - v2 = _mm_unpacklo_epi8(v2, zero); - v3 = _mm_unpacklo_epi8(v3, zero); - v4 = _mm_unpacklo_epi8(v4, zero); - v5 = _mm_unpacklo_epi8(v5, zero); - v6 = _mm_unpacklo_epi8(v6, zero); - v7 = _mm_unpacklo_epi8(v7, zero); - - v0 = _mm_adds_epi16(u0, v0); - v1 = _mm_adds_epi16(u4, v1); - v2 = _mm_adds_epi16(u1, v2); - v3 = _mm_adds_epi16(u5, v3); - v4 = _mm_adds_epi16(u2, v4); - v5 = _mm_adds_epi16(u6, v5); - v6 = _mm_adds_epi16(u3, v6); - v7 = _mm_adds_epi16(u7, v7); - - v0 = _mm_adds_epi16(v0, one); - v1 = _mm_adds_epi16(v1, one); - v2 = _mm_adds_epi16(v2, one); - v3 = _mm_adds_epi16(v3, one); - v4 = _mm_adds_epi16(v4, one); - v5 = _mm_adds_epi16(v5, one); - v6 = _mm_adds_epi16(v6, one); - v7 = _mm_adds_epi16(v7, one); - - v0 = _mm_srai_epi16(v0, 1); - v1 = _mm_srai_epi16(v1, 1); - v2 = _mm_srai_epi16(v2, 1); - v3 = _mm_srai_epi16(v3, 1); - v4 = _mm_srai_epi16(v4, 1); - v5 = _mm_srai_epi16(v5, 1); - v6 = _mm_srai_epi16(v6, 1); - v7 = _mm_srai_epi16(v7, 1); - - u0 = _mm_packus_epi16(v0, v1); - u1 = _mm_packus_epi16(v2, v3); - u2 = _mm_packus_epi16(v4, v5); - u3 = _mm_packus_epi16(v6, v7); - - u4 = _mm_srli_si128(u0, 8); - u5 = _mm_srli_si128(u1, 8); - u6 = _mm_srli_si128(u2, 8); - u7 = _mm_srli_si128(u3, 8); - - _mm_storel_epi64((__m128i *)dst, u0); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7); -} - -static transpose_to_dst_t trans8x8Tab[2] = { 
transpose8x8_direct_to_dst, - transpose8x8_accumu_to_dst }; - -static INLINE void transpose_8x16(const __m128i *in, __m128i *out) { - __m128i t0, t1, t2, t3, u0, u1; - - t0 = _mm_unpacklo_epi16(in[0], in[1]); - t1 = _mm_unpacklo_epi16(in[2], in[3]); - t2 = _mm_unpacklo_epi16(in[4], in[5]); - t3 = _mm_unpacklo_epi16(in[6], in[7]); - - u0 = _mm_unpacklo_epi32(t0, t1); - u1 = _mm_unpacklo_epi32(t2, t3); - - out[0] = _mm_unpacklo_epi64(u0, u1); - out[1] = _mm_unpackhi_epi64(u0, u1); - - u0 = _mm_unpackhi_epi32(t0, t1); - u1 = _mm_unpackhi_epi32(t2, t3); - - out[2] = _mm_unpacklo_epi64(u0, u1); - out[3] = _mm_unpackhi_epi64(u0, u1); - - t0 = _mm_unpackhi_epi16(in[0], in[1]); - t1 = _mm_unpackhi_epi16(in[2], in[3]); - t2 = _mm_unpackhi_epi16(in[4], in[5]); - t3 = _mm_unpackhi_epi16(in[6], in[7]); - - u0 = _mm_unpacklo_epi32(t0, t1); - u1 = _mm_unpacklo_epi32(t2, t3); - - out[4] = _mm_unpacklo_epi64(u0, u1); - out[5] = _mm_unpackhi_epi64(u0, u1); - - // Ignore out[6] and out[7] - // they're zero vectors. -} - -static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - __m128i *f, int tapsNum, uint16_t *buf) { - __m128i s[8], t[6]; - __m128i min_x2x3, max_x2x3; - __m128i temp; - - assert(tapsNum == 10 || tapsNum == 12); - if (tapsNum == 10) { - src_ptr -= 1; - } - s[0] = _mm_loadu_si128((const __m128i *)src_ptr); - s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // TRANSPOSE... 
- // Vecotor represents column pixel pairs instead of a row - transpose_8x16(s, t); - - // multiply 2 adjacent elements with the filter and add the result - s[0] = _mm_maddubs_epi16(t[0], f[0]); - s[1] = _mm_maddubs_epi16(t[1], f[1]); - s[2] = _mm_maddubs_epi16(t[2], f[2]); - s[3] = _mm_maddubs_epi16(t[3], f[3]); - s[4] = _mm_maddubs_epi16(t[4], f[4]); - s[5] = _mm_maddubs_epi16(t[5], f[5]); - - // add and saturate the results together - min_x2x3 = _mm_min_epi16(s[2], s[3]); - max_x2x3 = _mm_max_epi16(s[2], s[3]); - temp = _mm_adds_epi16(s[0], s[1]); - temp = _mm_adds_epi16(temp, s[5]); - temp = _mm_adds_epi16(temp, s[4]); - - temp = _mm_adds_epi16(temp, min_x2x3); - temp = _mm_adds_epi16(temp, max_x2x3); - - _mm_storeu_si128((__m128i *)buf, temp); -} - -// Vertical 4-pixel parallel -static INLINE void transpose4x4_direct_to_dst(const uint16_t *src, - int src_stride, uint8_t *dst, - int dst_stride) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - __m128i v0, v1, v2, v3; - - // TODO(luoyi): two loads, 8 elements per load (two bytes per element) - __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); - __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); - __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); - __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); - - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpacklo_epi16(u2, u3); - - v2 = _mm_unpacklo_epi32(v0, v1); - v3 = _mm_unpackhi_epi32(v0, v1); - - u0 = _mm_mulhrs_epi16(v2, k_256); - u1 = _mm_mulhrs_epi16(v3, k_256); - - u0 = _mm_packus_epi16(u0, u1); - u1 = _mm_srli_si128(u0, 4); - u2 = _mm_srli_si128(u0, 8); - u3 = _mm_srli_si128(u0, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(u0); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3); -} - -static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src, - int 
src_stride, uint8_t *dst, - int dst_stride) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - - __m128i v0, v1, v2, v3; - - __m128i u0 = _mm_loadl_epi64((__m128i const *)(src)); - __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride)); - __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); - __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); - - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpacklo_epi16(u2, u3); - - v2 = _mm_unpacklo_epi32(v0, v1); - v3 = _mm_unpackhi_epi32(v0, v1); - - u0 = _mm_mulhrs_epi16(v2, k_256); - u1 = _mm_mulhrs_epi16(v3, k_256); - - u2 = _mm_packus_epi16(u0, u1); - u0 = _mm_unpacklo_epi8(u2, zero); - u1 = _mm_unpackhi_epi8(u2, zero); - - // load pixel values - v0 = _mm_loadl_epi64((__m128i const *)(dst)); - v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); - v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); - v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); - - v0 = _mm_unpacklo_epi8(v0, zero); - v1 = _mm_unpacklo_epi8(v1, zero); - v2 = _mm_unpacklo_epi8(v2, zero); - v3 = _mm_unpacklo_epi8(v3, zero); - - v0 = _mm_unpacklo_epi64(v0, v1); - v1 = _mm_unpacklo_epi64(v2, v3); - - u0 = _mm_adds_epi16(u0, v0); - u1 = _mm_adds_epi16(u1, v1); - - u0 = _mm_adds_epi16(u0, one); - u1 = _mm_adds_epi16(u1, one); - - u0 = _mm_srai_epi16(u0, 1); - u1 = _mm_srai_epi16(u1, 1); - - // saturation and pack to pixels - u0 = _mm_packus_epi16(u0, u1); - u1 = _mm_srli_si128(u0, 4); - u2 = _mm_srli_si128(u0, 8); - u3 = _mm_srli_si128(u0, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(u0); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3); -} - -static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst, - transpose4x4_accumu_to_dst }; - -static void 
filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - __m128i *f, int tapsNum, uint16_t *buf) { - __m128i A, B, C, D; - __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa; - __m128i x0, x1, x2, x3, x4, x5; - __m128i min_x2x3, max_x2x3, temp; - - assert(tapsNum == 10 || tapsNum == 12); - if (tapsNum == 10) { - src_ptr -= 1; - } - A = _mm_loadu_si128((const __m128i *)src_ptr); - B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - - // TRANSPOSE... - // Vecotor represents column pixel pairs instead of a row - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - s7s6 = _mm_srli_si128(s5s4, 8); - - tr0_0 = _mm_unpackhi_epi16(A, B); - tr0_1 = _mm_unpackhi_epi16(C, D); - s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1); - sbsa = _mm_srli_si128(s9s8, 8); - - // multiply 2 adjacent elements with the filter and add the result - x0 = _mm_maddubs_epi16(s1s0, f[0]); - x1 = _mm_maddubs_epi16(s3s2, f[1]); - x2 = _mm_maddubs_epi16(s5s4, f[2]); - x3 = _mm_maddubs_epi16(s7s6, f[3]); - x4 = _mm_maddubs_epi16(s9s8, f[4]); - x5 = _mm_maddubs_epi16(sbsa, f[5]); - // add and saturate the results together - min_x2x3 = _mm_min_epi16(x2, x3); - max_x2x3 = _mm_max_epi16(x2, x3); - temp = _mm_adds_epi16(x0, x1); - temp = _mm_adds_epi16(temp, x5); - temp = _mm_adds_epi16(temp, x4); - - temp = _mm_adds_epi16(temp, min_x2x3); - temp = _mm_adds_epi16(temp, max_x2x3); - _mm_storel_epi64((__m128i *)buf, temp); -} - -// Note: -// This function 
assumes: -// (1) 10/12-taps filters -// (2) x_step_q4 = 16 then filter is fixed at the call - -void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]); - __m128i verf[6]; - __m128i horf[2]; - SubpelFilterCoeffs hCoeffs, vCoeffs; - assert(conv_params->do_average == 0 || conv_params->do_average == 1); - const uint8_t *src_ptr; - store_pixel_t store2p = store2pixelTab[conv_params->do_average]; - store_pixel_t store4p = store4pixelTab[conv_params->do_average]; - transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average]; - transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average]; - - const int tapsNum = filter_params.taps; - int block_height, block_residu; - int i, col, count; - (void)x_step_q4; - - if (0 == subpel_x_q4 || 16 != x_step_q4) { - av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); - return; - } - - hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1); - vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); - - if (!hCoeffs || !vCoeffs) { - av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - horf[0] = *((const __m128i *)(hCoeffs)); - horf[1] = *((const __m128i *)(hCoeffs + 1)); - - count = 0; - - // here tapsNum is filter size - src -= (tapsNum >> 1) - 1; - src_ptr = src; - if (w > WIDTH_BOUND && h > HEIGHT_BOUND) { - // 8-pixels parallel - block_height = h >> 
3; - block_residu = h & 7; - - do { - for (col = 0; col < w; col += 8) { - for (i = 0; i < 8; ++i) { - filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum, - temp + (i * 8)); - src_ptr += 1; - } - transpose_8x8(temp, 8, dst + col, dst_stride); - } - count++; - src_ptr = src + count * src_stride * 8; - dst += dst_stride * 8; - } while (count < block_height); - - for (i = 0; i < block_residu; ++i) { - filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); - src_ptr += src_stride; - dst += dst_stride; - } - } else { - if (w > 2) { - // 4-pixels parallel - block_height = h >> 2; - block_residu = h & 3; - - do { - for (col = 0; col < w; col += 4) { - for (i = 0; i < 4; ++i) { - filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum, - temp + (i * 4)); - src_ptr += 1; - } - transpose_4x4(temp, 4, dst + col, dst_stride); - } - count++; - src_ptr = src + count * src_stride * 4; - dst += dst_stride * 4; - } while (count < block_height); - - for (i = 0; i < block_residu; ++i) { - filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); - src_ptr += src_stride; - dst += dst_stride; - } - } else { - for (i = 0; i < h; i++) { - filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst); - src_ptr += src_stride; - dst += dst_stride; - } - } - } -} - -// Vertical convolution filtering -static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) { - __m128i u = _mm_packus_epi16(*x, *x); - _mm_storel_epi64((__m128i *)dst, u); -} - -static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) { - __m128i y = accumulate_store(x, dst); - _mm_storel_epi64((__m128i *)dst, y); -} - -static store_pixel_t store8pixelTab[2] = { store_8_pixel_only, - accumulate_store_8_pixel }; - -static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride, - int tapsNum, __m128i *f) { - __m128i s[12]; - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i zero = _mm_setzero_si128(); - __m128i min_x2x3, max_x2x3, sum; - int i = 0; - int r = 0; - - 
if (10 == tapsNum) { - i += 1; - s[0] = zero; - } - while (i < 12) { - s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); - i += 1; - r += 1; - } - - s[0] = _mm_unpacklo_epi8(s[0], s[1]); - s[2] = _mm_unpacklo_epi8(s[2], s[3]); - s[4] = _mm_unpacklo_epi8(s[4], s[5]); - s[6] = _mm_unpacklo_epi8(s[6], s[7]); - s[8] = _mm_unpacklo_epi8(s[8], s[9]); - s[10] = _mm_unpacklo_epi8(s[10], s[11]); - - s[0] = _mm_maddubs_epi16(s[0], f[0]); - s[2] = _mm_maddubs_epi16(s[2], f[1]); - s[4] = _mm_maddubs_epi16(s[4], f[2]); - s[6] = _mm_maddubs_epi16(s[6], f[3]); - s[8] = _mm_maddubs_epi16(s[8], f[4]); - s[10] = _mm_maddubs_epi16(s[10], f[5]); - - min_x2x3 = _mm_min_epi16(s[4], s[6]); - max_x2x3 = _mm_max_epi16(s[4], s[6]); - sum = _mm_adds_epi16(s[0], s[2]); - sum = _mm_adds_epi16(sum, s[10]); - sum = _mm_adds_epi16(sum, s[8]); - - sum = _mm_adds_epi16(sum, min_x2x3); - sum = _mm_adds_epi16(sum, max_x2x3); - - sum = _mm_mulhrs_epi16(sum, k_256); - sum = _mm_packus_epi16(sum, sum); - sum = _mm_unpacklo_epi8(sum, zero); - return sum; -} - -static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride, - __m128i *f, int tapsNum, - store_pixel_t store_func, - uint8_t *dst) { - __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f); - store_func(&sum, dst); -} - -static void filter_vert_compute_small(const uint8_t *src, int src_stride, - __m128i *f, int tapsNum, - store_pixel_t store_func, int h, - uint8_t *dst, int dst_stride) { - int rowIndex = 0; - do { - filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func, - dst); - rowIndex++; - src += src_stride; - dst += dst_stride; - } while (rowIndex < h); -} - -static void filter_vert_compute_large(const uint8_t *src, int src_stride, - __m128i *f, int tapsNum, - store_pixel_t store_func, int w, int h, - uint8_t *dst, int dst_stride) { - int col; - int rowIndex = 0; - const uint8_t *src_ptr = src; - uint8_t *dst_ptr = dst; - - do { - for (col = 0; col < w; col += 8) { - 
filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum, - store_func, dst_ptr); - src_ptr += 8; - dst_ptr += 8; - } - rowIndex++; - src_ptr = src + rowIndex * src_stride; - dst_ptr = dst + rowIndex * dst_stride; - } while (rowIndex < h); -} - -void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - __m128i verf[6]; - SubpelFilterCoeffs vCoeffs; - const uint8_t *src_ptr; - assert(conv_params->do_average == 0 || conv_params->do_average == 1); - uint8_t *dst_ptr = dst; - store_pixel_t store2p = store2pixelTab[conv_params->do_average]; - store_pixel_t store4p = store4pixelTab[conv_params->do_average]; - store_pixel_t store8p = store8pixelTab[conv_params->do_average]; - const int tapsNum = filter_params.taps; - - if (0 == subpel_y_q4 || 16 != y_step_q4) { - av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_y_q4, y_step_q4, conv_params); - return; - } - - vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); - - if (!vCoeffs) { - av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_y_q4, y_step_q4, conv_params); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - src -= src_stride * ((tapsNum >> 1) - 1); - src_ptr = src; - - if (w > 4) { - filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h, - dst_ptr, dst_stride); - } else if (4 == w) { - filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h, - dst_ptr, dst_stride); - } else if (2 == w) { - filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h, - 
dst_ptr, dst_stride); - } else { - assert(0); - } -} - -static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps, - int8_t (*simd_horiz_filter)[2][16]) { - int shift; - int offset = (12 - taps) / 2; - const int16_t *filter_row; - for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { - int i; - filter_row = filter_ptr + shift * taps; - for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0; - - for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0; - - for (i = 0; i < taps; ++i) { - simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i]; - simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i]; - } - - for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0; - - for (i = offset + 2 + taps; i < 16; ++i) - simd_horiz_filter[shift - 1][1][i] = 0; - } -} - -static void init_simd_vert_filter(const int16_t *filter_ptr, int taps, - int8_t (*simd_vert_filter)[6][16]) { - int shift; - int offset = (12 - taps) / 2; - const int16_t *filter_row; - for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { - int i; - filter_row = filter_ptr + shift * taps; - for (i = 0; i < 6; ++i) { - int j; - for (j = 0; j < 16; ++j) { - int c = i * 2 + (j % 2) - offset; - if (c >= 0 && c < taps) - simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c]; - else - simd_vert_filter[shift - 1][i][j] = 0; - } - } - } -} - -typedef struct SimdFilter { - InterpFilter interp_filter; - int8_t (*simd_horiz_filter)[2][16]; - int8_t (*simd_vert_filter)[6][16]; -} SimdFilter; - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER -#define MULTITAP_FILTER_NUM 1 -SimdFilter simd_filters[MULTITAP_FILTER_NUM] = { - { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0], - &sub_pel_filters_12sharp_ver_signal_dir[0] }, -}; -#endif - -#if USE_TEMPORALFILTER_12TAP -SimdFilter temporal_simd_filter = { - TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0], - &sub_pel_filters_temporalfilter_12_ver_signal_dir[0] -}; 
-#endif - -void av1_lowbd_convolve_init_ssse3(void) { -#if USE_TEMPORALFILTER_12TAP - { - InterpFilterParams filter_params = - av1_get_interp_filter_params(temporal_simd_filter.interp_filter); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_horiz_filter(filter_ptr, taps, - temporal_simd_filter.simd_horiz_filter); - init_simd_vert_filter(filter_ptr, taps, - temporal_simd_filter.simd_vert_filter); - } -#endif -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - int i; - for (i = 0; i < MULTITAP_FILTER_NUM; ++i) { - InterpFilter interp_filter = simd_filters[i].interp_filter; - InterpFilterParams filter_params = - av1_get_interp_filter_params(interp_filter); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_horiz_filter(filter_ptr, taps, - simd_filters[i].simd_horiz_filter); - init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter); - } - } -#endif - return; -} diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c deleted file mode 100644 index 97d2e74b1..000000000 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c +++ /dev/null @@ -1,839 +0,0 @@ -#include "av1/common/x86/av1_txfm1d_sse4.h" - -void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 32; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[32]; - __m128i buf1[32]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - int j; - for (j = 0; j < 32; ++j) { - buf0[j] = input[j * col_num + col]; - } - - // stage 1 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[31]); - buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[30]); - buf1[30] = _mm_sub_epi32(buf0[1], 
buf0[30]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[29]); - buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[28]); - buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[27]); - buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[26]); - buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]); - buf1[6] = _mm_add_epi32(buf0[6], buf0[25]); - buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[24]); - buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[23]); - buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[22]); - buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]); - buf1[10] = _mm_add_epi32(buf0[10], buf0[21]); - buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]); - buf1[11] = _mm_add_epi32(buf0[11], buf0[20]); - buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]); - buf1[12] = _mm_add_epi32(buf0[12], buf0[19]); - buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]); - buf1[13] = _mm_add_epi32(buf0[13], buf0[18]); - buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]); - buf1[14] = _mm_add_epi32(buf0[14], buf0[17]); - buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]); - buf1[15] = _mm_add_epi32(buf0[15], buf0[16]); - buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]); - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); - buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); - buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); - buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); - buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); - buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); - buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); - buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); - buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); - buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); - buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); - buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); - buf0[6] = 
_mm_add_epi32(buf1[6], buf1[9]); - buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); - buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); - buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); - buf0[16] = buf1[16]; - buf0[17] = buf1[17]; - buf0[18] = buf1[18]; - buf0[19] = buf1[19]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], - buf0[25], bit); - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], - buf0[24], bit); - buf0[28] = buf1[28]; - buf0[29] = buf1[29]; - buf0[30] = buf1[30]; - buf0[31] = buf1[31]; - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); - buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); - buf1[8] = buf0[8]; - buf1[9] = buf0[9]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], - buf1[13], bit); - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], - buf1[12], bit); - buf1[14] = buf0[14]; - buf1[15] = buf0[15]; - buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); - buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); - buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); - buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); - buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); - buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); - buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); - buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); - buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); - buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); - buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); - buf1[30] = 
_mm_add_epi32(buf0[30], buf0[25]); - buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); - buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); - buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); - buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); - buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); - buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); - buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); - buf0[4] = buf1[4]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], - buf0[6], bit); - buf0[7] = buf1[7]; - buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); - buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); - buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); - buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); - buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); - buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); - buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); - buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); - buf0[16] = buf1[16]; - buf0[17] = buf1[17]; - btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], - buf0[29], bit); - btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], - buf0[28], bit); - btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - buf0[22] = buf1[22]; - buf0[23] = buf1[23]; - buf0[24] = buf1[24]; - buf0[25] = buf1[25]; - buf0[30] = buf1[30]; - buf0[31] = buf1[31]; - - // stage 5 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], - buf1[1], bit); - btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], - buf1[3], bit); - buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); - buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); - 
buf1[8] = buf0[8]; - btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], - buf1[14], bit); - btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], - buf1[13], bit); - buf1[11] = buf0[11]; - buf1[12] = buf0[12]; - buf1[15] = buf0[15]; - buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); - buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); - buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); - buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); - buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); - buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); - buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); - buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); - buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); - buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); - buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); - buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); - buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); - buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); - buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); - buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); - - // stage 6 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], - bit); - btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], - buf0[6], bit); - buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); - buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); - buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); - buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); - buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); - buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); - buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); - buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); - buf0[16] = buf1[16]; - btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], - buf0[30], bit); - btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], - buf0[29], bit); - buf0[19] = buf1[19]; - 
buf0[20] = buf1[20]; - btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], - buf0[25], bit); - buf0[23] = buf1[23]; - buf0[24] = buf1[24]; - buf0[27] = buf1[27]; - buf0[28] = buf1[28]; - buf0[31] = buf1[31]; - - // stage 7 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf1[0] = buf0[0]; - buf1[1] = buf0[1]; - buf1[2] = buf0[2]; - buf1[3] = buf0[3]; - buf1[4] = buf0[4]; - buf1[5] = buf0[5]; - buf1[6] = buf0[6]; - buf1[7] = buf0[7]; - btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], - buf1[15], bit); - btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], - buf1[14], bit); - btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], - buf1[13], bit); - btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], - buf1[12], bit); - buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); - buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); - buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); - buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); - buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); - buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); - buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); - buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); - buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); - buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); - buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); - buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); - buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); - buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); - buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); - buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); - - // stage 8 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - buf0[6] = buf1[6]; - buf0[7] = buf1[7]; - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; 
- buf0[10] = buf1[10]; - buf0[11] = buf1[11]; - buf0[12] = buf1[12]; - buf0[13] = buf1[13]; - buf0[14] = buf1[14]; - buf0[15] = buf1[15]; - btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], - buf0[31], bit); - btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], - buf0[30], bit); - btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], - buf0[29], bit); - btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], - buf0[28], bit); - btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], - buf0[25], bit); - btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], - buf0[24], bit); - - // stage 9 - stage_idx++; - buf1[0] = buf0[0]; - buf1[1] = buf0[16]; - buf1[2] = buf0[8]; - buf1[3] = buf0[24]; - buf1[4] = buf0[4]; - buf1[5] = buf0[20]; - buf1[6] = buf0[12]; - buf1[7] = buf0[28]; - buf1[8] = buf0[2]; - buf1[9] = buf0[18]; - buf1[10] = buf0[10]; - buf1[11] = buf0[26]; - buf1[12] = buf0[6]; - buf1[13] = buf0[22]; - buf1[14] = buf0[14]; - buf1[15] = buf0[30]; - buf1[16] = buf0[1]; - buf1[17] = buf0[17]; - buf1[18] = buf0[9]; - buf1[19] = buf0[25]; - buf1[20] = buf0[5]; - buf1[21] = buf0[21]; - buf1[22] = buf0[13]; - buf1[23] = buf0[29]; - buf1[24] = buf0[3]; - buf1[25] = buf0[19]; - buf1[26] = buf0[11]; - buf1[27] = buf0[27]; - buf1[28] = buf0[7]; - buf1[29] = buf0[23]; - buf1[30] = buf0[15]; - buf1[31] = buf0[31]; - - for (j = 0; j < 32; ++j) { - output[j * col_num + col] = buf1[j]; - } - } -} - -void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 4; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[4]; - __m128i buf1[4]; - int col_num = txfm_size / num_per_128; - int bit; - int 
col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - int j; - for (j = 0; j < 4; ++j) { - buf0[j] = input[j * col_num + col]; - } - - // stage 1 - stage_idx++; - buf1[0] = buf0[3]; - buf1[1] = buf0[0]; - buf1[2] = buf0[1]; - buf1[3] = buf0[2]; - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - - // stage 3 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - - // stage 5 - stage_idx++; - buf1[0] = buf0[0]; - buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); - buf1[2] = buf0[3]; - buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); - - for (j = 0; j < 4; ++j) { - output[j * col_num + col] = buf1[j]; - } - } -} - -void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 32; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[32]; - __m128i buf1[32]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - int j; - for (j = 0; j < 32; ++j) { - buf0[j] = input[j * col_num + col]; - } - - // stage 1 - stage_idx++; - buf1[0] = buf0[31]; - buf1[1] = buf0[0]; - buf1[2] = buf0[29]; - buf1[3] = buf0[2]; - buf1[4] = buf0[27]; - buf1[5] = buf0[4]; - buf1[6] = buf0[25]; - buf1[7] = buf0[6]; - buf1[8] = buf0[23]; - buf1[9] = buf0[8]; - buf1[10] = buf0[21]; 
- buf1[11] = buf0[10]; - buf1[12] = buf0[19]; - buf1[13] = buf0[12]; - buf1[14] = buf0[17]; - buf1[15] = buf0[14]; - buf1[16] = buf0[15]; - buf1[17] = buf0[16]; - buf1[18] = buf0[13]; - buf1[19] = buf0[18]; - buf1[20] = buf0[11]; - buf1[21] = buf0[20]; - buf1[22] = buf0[9]; - buf1[23] = buf0[22]; - buf1[24] = buf0[7]; - buf1[25] = buf0[24]; - buf1[26] = buf0[5]; - buf1[27] = buf0[26]; - buf1[28] = buf0[3]; - buf1[29] = buf0[28]; - buf1[30] = buf0[1]; - buf1[31] = buf0[30]; - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], - bit); - btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], - buf0[9], bit); - btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], - buf0[17], bit); - btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], - buf0[19], bit); - btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], - 
buf0[31], bit); - - // stage 3 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[16]); - buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[17]); - buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[18]); - buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[19]); - buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[20]); - buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[21]); - buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]); - buf1[6] = _mm_add_epi32(buf0[6], buf0[22]); - buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[23]); - buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[24]); - buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[25]); - buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]); - buf1[10] = _mm_add_epi32(buf0[10], buf0[26]); - buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]); - buf1[11] = _mm_add_epi32(buf0[11], buf0[27]); - buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]); - buf1[12] = _mm_add_epi32(buf0[12], buf0[28]); - buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]); - buf1[13] = _mm_add_epi32(buf0[13], buf0[29]); - buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]); - buf1[14] = _mm_add_epi32(buf0[14], buf0[30]); - buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]); - buf1[15] = _mm_add_epi32(buf0[15], buf0[31]); - buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - buf0[6] = buf1[6]; - buf0[7] = buf1[7]; - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - buf0[10] = buf1[10]; - buf0[11] = buf1[11]; - buf0[12] = buf1[12]; - buf0[13] = buf1[13]; - buf0[14] = buf1[14]; - buf0[15] = buf1[15]; - 
btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], - buf0[17], bit); - btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], - buf0[19], bit); - btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], - buf0[31], bit); - - // stage 5 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[8]); - buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[9]); - buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[12]); - buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[13]); - buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]); - buf1[6] = _mm_add_epi32(buf0[6], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]); - buf1[16] = _mm_add_epi32(buf0[16], buf0[24]); - buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]); - buf1[17] = _mm_add_epi32(buf0[17], buf0[25]); - buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]); - buf1[18] = _mm_add_epi32(buf0[18], buf0[26]); - buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]); - buf1[19] = _mm_add_epi32(buf0[19], buf0[27]); - buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]); - buf1[20] = _mm_add_epi32(buf0[20], buf0[28]); - buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]); - buf1[21] = 
_mm_add_epi32(buf0[21], buf0[29]); - buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]); - buf1[22] = _mm_add_epi32(buf0[22], buf0[30]); - buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]); - buf1[23] = _mm_add_epi32(buf0[23], buf0[31]); - buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]); - - // stage 6 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - buf0[6] = buf1[6]; - buf0[7] = buf1[7]; - btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - buf0[16] = buf1[16]; - buf0[17] = buf1[17]; - buf0[18] = buf1[18]; - buf0[19] = buf1[19]; - buf0[20] = buf1[20]; - buf0[21] = buf1[21]; - buf0[22] = buf1[22]; - buf0[23] = buf1[23]; - btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], - buf0[31], bit); - - // stage 7 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); - buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[12]); - buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[13]); - buf1[13] = 
_mm_sub_epi32(buf0[9], buf0[13]); - buf1[10] = _mm_add_epi32(buf0[10], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]); - buf1[11] = _mm_add_epi32(buf0[11], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]); - buf1[16] = _mm_add_epi32(buf0[16], buf0[20]); - buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]); - buf1[17] = _mm_add_epi32(buf0[17], buf0[21]); - buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]); - buf1[18] = _mm_add_epi32(buf0[18], buf0[22]); - buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]); - buf1[19] = _mm_add_epi32(buf0[19], buf0[23]); - buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]); - buf1[24] = _mm_add_epi32(buf0[24], buf0[28]); - buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]); - buf1[25] = _mm_add_epi32(buf0[25], buf0[29]); - buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]); - buf1[26] = _mm_add_epi32(buf0[26], buf0[30]); - buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]); - buf1[27] = _mm_add_epi32(buf0[27], buf0[31]); - buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]); - - // stage 8 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], - buf0[5], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - buf0[10] = buf1[10]; - buf0[11] = buf1[11]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - buf0[16] = buf1[16]; - buf0[17] = buf1[17]; - buf0[18] = buf1[18]; - buf0[19] = buf1[19]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - buf0[24] = buf1[24]; - buf0[25] = buf1[25]; - buf0[26] = buf1[26]; - buf0[27] = buf1[27]; - 
btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], - buf0[31], bit); - - // stage 9 - stage_idx++; - buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]); - buf1[12] = _mm_add_epi32(buf0[12], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]); - buf1[13] = _mm_add_epi32(buf0[13], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]); - buf1[16] = _mm_add_epi32(buf0[16], buf0[18]); - buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]); - buf1[17] = _mm_add_epi32(buf0[17], buf0[19]); - buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]); - buf1[20] = _mm_add_epi32(buf0[20], buf0[22]); - buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]); - buf1[21] = _mm_add_epi32(buf0[21], buf0[23]); - buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]); - buf1[24] = _mm_add_epi32(buf0[24], buf0[26]); - buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]); - buf1[25] = _mm_add_epi32(buf0[25], buf0[27]); - buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]); - buf1[28] = _mm_add_epi32(buf0[28], buf0[30]); - buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]); - buf1[29] = _mm_add_epi32(buf0[29], buf0[31]); - buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]); - - // stage 10 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr(bit); - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - 
btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - buf0[12] = buf1[12]; - buf0[13] = buf1[13]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - buf0[16] = buf1[16]; - buf0[17] = buf1[17]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], - buf0[19], bit); - buf0[20] = buf1[20]; - buf0[21] = buf1[21]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - buf0[24] = buf1[24]; - buf0[25] = buf1[25]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - buf0[28] = buf1[28]; - buf0[29] = buf1[29]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], - buf0[31], bit); - - // stage 11 - stage_idx++; - buf1[0] = buf0[0]; - buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]); - buf1[2] = buf0[24]; - buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]); - buf1[4] = buf0[12]; - buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]); - buf1[6] = buf0[20]; - buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); - buf1[8] = buf0[6]; - buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]); - buf1[10] = buf0[30]; - buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]); - buf1[12] = buf0[10]; - buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]); - buf1[14] = buf0[18]; - buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); - buf1[16] = buf0[3]; - buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]); - buf1[18] = buf0[27]; - buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]); - buf1[20] = buf0[15]; - buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]); - buf1[22] = buf0[23]; - buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); - buf1[24] = buf0[5]; - buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]); - buf1[26] = buf0[29]; - buf1[27] = 
_mm_sub_epi32(_mm_set1_epi32(0), buf0[13]); - buf1[28] = buf0[9]; - buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]); - buf1[30] = buf0[17]; - buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); - - for (j = 0; j < 32; ++j) { - output[j * col_num + col] = buf1[j]; - } - } -} diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c deleted file mode 100644 index 58ede028a..000000000 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "./av1_rtcd.h" -#include "av1/common/enums.h" -#include "av1/common/av1_txfm.h" -#include "av1/common/x86/av1_txfm1d_sse4.h" - -static INLINE void int16_array_with_stride_to_int32_array_without_stride( - const int16_t *input, int stride, int32_t *output, int txfm1d_size) { - int r, c; - for (r = 0; r < txfm1d_size; r++) { - for (c = 0; c < txfm1d_size; c++) { - output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; - } - } -} - -typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); - -static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { - switch (txfm_type) { - case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break; - case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break; - default: assert(0); - } - return NULL; -} - -static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, - const int stride, - const TXFM_2D_FLIP_CFG *cfg, - int32_t *txfm_buf) { - // TODO(sarahparker) This does not currently support rectangular transforms - // and will break without splitting txfm_size out into row and col size. - // Rectangular transforms use c code only, so it should be ok for now. - // It will be corrected when there are sse implementations for rectangular - // transforms. 
- assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size); - const int txfm_size = cfg->row_cfg->txfm_size; - const int8_t *shift = cfg->row_cfg->shift; - const int8_t *stage_range_col = cfg->col_cfg->stage_range; - const int8_t *stage_range_row = cfg->row_cfg->stage_range; - const int8_t *cos_bit_col = cfg->col_cfg->cos_bit; - const int8_t *cos_bit_row = cfg->row_cfg->cos_bit; - const TxfmFuncSSE2 txfm_func_col = - fwd_txfm_type_to_func(cfg->col_cfg->txfm_type); - const TxfmFuncSSE2 txfm_func_row = - fwd_txfm_type_to_func(cfg->row_cfg->txfm_type); - - __m128i *buf_128 = (__m128i *)txfm_buf; - __m128i *out_128 = (__m128i *)output; - int num_per_128 = 4; - int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; - - int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, - txfm_size); - round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); - txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); - round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); - transpose_32(txfm_size, out_128, buf_128); - txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); - round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); - transpose_32(txfm_size, buf_128, out_128); -} - -void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd) { - DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); - (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); -} diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c index 68461bc36..212d3bd72 100644 --- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c +++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c @@ -12,81 +12,14 @@ #include #include -#include "./av1_rtcd.h" -#include "av1/common/filter.h" - -#if CONFIG_DUAL_FILTER && 
USE_EXTRA_FILTER -DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]); -#endif - -#if USE_TEMPORALFILTER_12TAP -DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]); -#endif +#include "config/av1_rtcd.h" -typedef int16_t (*HbdSubpelFilterCoeffs)[8]; +#include "av1/common/filter.h" typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, int src_stride, uint16_t *dst, int dst_stride, int bd); -static INLINE HbdSubpelFilterCoeffs -hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - if (p.interp_filter == MULTITAP_SHARP) { - return &subpel_filters_sharp[index][0]; - } -#endif -#if USE_TEMPORALFILTER_12TAP - if (p.interp_filter == TEMPORALFILTER_12TAP) { - return &subpel_temporalfilter[index][0]; - } -#endif - (void)p; - (void)index; - return NULL; -} - -static void init_simd_filter(const int16_t *filter_ptr, int taps, - int16_t (*simd_filter)[6][8]) { - int shift; - int offset = (12 - taps) / 2; - for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { - const int16_t *filter_row = filter_ptr + shift * taps; - int i, j; - for (i = 0; i < 12; ++i) { - for (j = 0; j < 4; ++j) { - int r = i / 2; - int c = j * 2 + (i % 2); - if (i - offset >= 0 && i - offset < taps) - simd_filter[shift - 1][r][c] = filter_row[i - offset]; - else - simd_filter[shift - 1][r][c] = 0; - } - } - } -} - -void av1_highbd_convolve_init_sse4_1(void) { -#if USE_TEMPORALFILTER_12TAP - { - InterpFilterParams filter_params = - av1_get_interp_filter_params(TEMPORALFILTER_12TAP); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_filter(filter_ptr, taps, subpel_temporalfilter); - } -#endif -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - InterpFilterParams filter_params = - av1_get_interp_filter_params(MULTITAP_SHARP); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_filter(filter_ptr, taps, 
subpel_filters_sharp); - } -#endif -} - // pixelsNum 0: write all 4 pixels // 1/2/3: residual pixels 1/2/3 static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, @@ -218,138 +151,6 @@ void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, writePixel(u, width, pixelsNum, dst, dst_stride); } -static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 }; - -static INLINE void transpose_pair(__m128i *in, __m128i *out) { - __m128i x0, x1; - - x0 = _mm_unpacklo_epi32(in[0], in[1]); - x1 = _mm_unpacklo_epi32(in[2], in[3]); - - out[0] = _mm_unpacklo_epi64(x0, x1); - out[1] = _mm_unpackhi_epi64(x0, x1); - - x0 = _mm_unpackhi_epi32(in[0], in[1]); - x1 = _mm_unpackhi_epi32(in[2], in[3]); - - out[2] = _mm_unpacklo_epi64(x0, x1); - out[3] = _mm_unpackhi_epi64(x0, x1); - - x0 = _mm_unpacklo_epi32(in[4], in[5]); - x1 = _mm_unpacklo_epi32(in[6], in[7]); - - out[4] = _mm_unpacklo_epi64(x0, x1); - out[5] = _mm_unpackhi_epi64(x0, x1); -} - -static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, - int tapsNum, uint32_t *buf) { - __m128i u[8], v[6]; - - assert(tapsNum == 10 || tapsNum == 12); - if (tapsNum == 10) { - src -= 1; - } - - u[0] = _mm_loadu_si128((__m128i const *)src); - u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); - u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - u[4] = _mm_loadu_si128((__m128i const *)(src + 8)); - u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8)); - u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8)); - u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8)); - - transpose_pair(u, v); - - u[0] = _mm_madd_epi16(v[0], f[0]); - u[1] = _mm_madd_epi16(v[1], f[1]); - u[2] = _mm_madd_epi16(v[2], f[2]); - u[3] = _mm_madd_epi16(v[3], f[3]); - u[4] = _mm_madd_epi16(v[4], f[4]); - u[5] = _mm_madd_epi16(v[5], f[5]); - - u[6] = _mm_min_epi32(u[2], 
u[3]); - u[7] = _mm_max_epi32(u[2], u[3]); - - u[0] = _mm_add_epi32(u[0], u[1]); - u[0] = _mm_add_epi32(u[0], u[5]); - u[0] = _mm_add_epi32(u[0], u[4]); - u[0] = _mm_add_epi32(u[0], u[6]); - u[0] = _mm_add_epi32(u[0], u[7]); - - _mm_storeu_si128((__m128i *)buf, u[0]); -} - -void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - int avg, int bd) { - DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]); - __m128i verf[6]; - HbdSubpelFilterCoeffs vCoeffs; - const uint16_t *srcPtr; - const int tapsNum = filter_params.taps; - int i, col, count, blkResidu, blkHeight; - TransposeSave transSave = transSaveTab[avg]; - (void)x_step_q4; - - if (0 == subpel_x_q4 || 16 != x_step_q4) { - av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - return; - } - - vCoeffs = - hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); - if (!vCoeffs) { - av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - src -= (tapsNum >> 1) - 1; - srcPtr = src; - - count = 0; - blkHeight = h >> 2; - blkResidu = h & 3; - - while (blkHeight != 0) { - for (col = 0; col < w; col += 4) { - for (i = 0; i < 4; ++i) { - highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); - srcPtr += 1; - } - transSave(w, 0, temp, 4, dst + col, dst_stride, bd); - } - count++; - srcPtr = src + count * src_stride * 4; - dst += dst_stride * 4; - blkHeight--; - } - - if (blkResidu == 0) return; - - for (col = 0; col < w; col += 4) { - 
for (i = 0; i < 4; ++i) { - highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); - srcPtr += 1; - } - transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd); - } -} - // Vertical convolutional filter typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); @@ -402,134 +203,3 @@ static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { } WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; - -static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, - const __m128i *f, int taps, - uint16_t *dst, WritePixels saveFunc, - int bd) { - __m128i s[12]; - __m128i zero = _mm_setzero_si128(); - int i = 0; - int r = 0; - - // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case - assert(taps == 10 || taps == 12); - if (10 == taps) { - i += 1; - s[0] = zero; - } - while (i < 12) { - s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); - i += 1; - r += 1; - } - - s[0] = _mm_unpacklo_epi16(s[0], s[1]); - s[2] = _mm_unpacklo_epi16(s[2], s[3]); - s[4] = _mm_unpacklo_epi16(s[4], s[5]); - s[6] = _mm_unpacklo_epi16(s[6], s[7]); - s[8] = _mm_unpacklo_epi16(s[8], s[9]); - s[10] = _mm_unpacklo_epi16(s[10], s[11]); - - s[0] = _mm_madd_epi16(s[0], f[0]); - s[2] = _mm_madd_epi16(s[2], f[1]); - s[4] = _mm_madd_epi16(s[4], f[2]); - s[6] = _mm_madd_epi16(s[6], f[3]); - s[8] = _mm_madd_epi16(s[8], f[4]); - s[10] = _mm_madd_epi16(s[10], f[5]); - - s[1] = _mm_min_epi32(s[4], s[6]); - s[3] = _mm_max_epi32(s[4], s[6]); - - s[0] = _mm_add_epi32(s[0], s[2]); - s[0] = _mm_add_epi32(s[0], s[10]); - s[0] = _mm_add_epi32(s[0], s[8]); - s[0] = _mm_add_epi32(s[0], s[1]); - s[0] = _mm_add_epi32(s[0], s[3]); - - saveFunc(s, bd, dst); -} - -static void highbd_filter_vert_compute_large(const uint16_t *src, - int src_stride, const __m128i *f, - int taps, int w, int h, - uint16_t *dst, int dst_stride, - int avg, int bd) { - int col; - int rowIndex = 0; - const uint16_t *src_ptr = src; - uint16_t *dst_ptr = dst; - 
const int step = 4; - WritePixels write4pixels = write4pixelsTab[avg]; - - do { - for (col = 0; col < w; col += step) { - filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr, - write4pixels, bd); - src_ptr += step; - dst_ptr += step; - } - rowIndex++; - src_ptr = src + rowIndex * src_stride; - dst_ptr = dst + rowIndex * dst_stride; - } while (rowIndex < h); -} - -static void highbd_filter_vert_compute_small(const uint16_t *src, - int src_stride, const __m128i *f, - int taps, int w, int h, - uint16_t *dst, int dst_stride, - int avg, int bd) { - int rowIndex = 0; - WritePixels write2pixels = write2pixelsTab[avg]; - (void)w; - - do { - filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd); - rowIndex++; - src += src_stride; - dst += dst_stride; - } while (rowIndex < h); -} - -void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - int avg, int bd) { - __m128i verf[6]; - HbdSubpelFilterCoeffs vCoeffs; - const int tapsNum = filter_params.taps; - - if (0 == subpel_y_q4 || 16 != y_step_q4) { - av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); - return; - } - - vCoeffs = - hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); - if (!vCoeffs) { - av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - src -= src_stride * ((tapsNum >> 1) - 1); - - if (w > 2) { - highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst, - dst_stride, avg, 
bd); - } else { - highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst, - dst_stride, avg, bd); - } -} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c new file mode 100644 index 000000000..7415c58df --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -0,0 +1,1957 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_inv_txfm_avx2.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" + +static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x1[0], x1[3]); + btf_16_adds_subs_avx2(x1[1], x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]); + + btf_16_adds_subs_avx2(x1[8], x1[11]); + btf_16_adds_subs_avx2(x1[9], x1[10]); + btf_16_subs_adds_avx2(x1[15], x1[12]); + btf_16_subs_adds_avx2(x1[14], x1[13]); +} + +static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = 
pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { + btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]); +} + +static void idct16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], 
cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + + // stage 3 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(x1[8], x1[9]); + btf_16_subs_adds_avx2(x1[11], x1[10]); + btf_16_adds_subs_avx2(x1[12], x1[13]); + btf_16_subs_adds_avx2(x1[15], x1[14]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); + btf_16_adds_subs_avx2(x1[4], x1[5]); + btf_16_subs_adds_avx2(x1[7], x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + + idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); + 
idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[2] = input[4]; + x1[4] = input[2]; + x1[6] = input[6]; + x1[8] = input[1]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); + + // stage 3 + btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(x1[8], x1[9]); + btf_16_subs_adds_avx2(x1[11], x1[10]); + btf_16_adds_subs_avx2(x1[12], x1[13]); + btf_16_subs_adds_avx2(x1[15], x1[14]); + + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); + btf_16_adds_subs_avx2(x1[4], x1[5]); + btf_16_subs_adds_avx2(x1[7], x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + + idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = 
cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x1[2]; + x1[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + + // stage 5 + // stage 6 + output[0] = x1[0]; + output[1] = x1[1]; + output[2] = x1[1]; + output[3] = x1[0]; + output[4] = x1[0]; + output[5] = x1[1]; + output[6] = x1[1]; + output[7] = x1[0]; + output[8] = x1[0]; + output[9] = x1[1]; + output[10] = x1[1]; + output[11] = x1[0]; + output[12] = x1[0]; + output[13] = x1[1]; + output[14] = x1[1]; + output[15] = x1[0]; +} + +static INLINE void iadst16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[8]); + btf_16_adds_subs_avx2(x[1], x[9]); + btf_16_adds_subs_avx2(x[2], x[10]); + btf_16_adds_subs_avx2(x[3], x[11]); + btf_16_adds_subs_avx2(x[4], x[12]); + btf_16_adds_subs_avx2(x[5], x[13]); + btf_16_adds_subs_avx2(x[6], x[14]); + btf_16_adds_subs_avx2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[4]); + btf_16_adds_subs_avx2(x[1], x[5]); + btf_16_adds_subs_avx2(x[2], x[6]); + btf_16_adds_subs_avx2(x[3], x[7]); + 
btf_16_adds_subs_avx2(x[8], x[12]); + btf_16_adds_subs_avx2(x[9], x[13]); + btf_16_adds_subs_avx2(x[10], x[14]); + btf_16_adds_subs_avx2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[2]); + btf_16_adds_subs_avx2(x[1], x[3]); + btf_16_adds_subs_avx2(x[4], x[6]); + btf_16_adds_subs_avx2(x[5], x[7]); + btf_16_adds_subs_avx2(x[8], x[10]); + btf_16_adds_subs_avx2(x[9], x[11]); + btf_16_adds_subs_avx2(x[12], x[14]); + btf_16_adds_subs_avx2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]); +} + +static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { + const __m256i __zero = _mm256_setzero_si256(); + output[0] = x1[0]; + output[1] = _mm256_subs_epi16(__zero, x1[8]); + output[2] = x1[12]; + output[3] = 
_mm256_subs_epi16(__zero, x1[4]); + output[4] = x1[6]; + output[5] = _mm256_subs_epi16(__zero, x1[14]); + output[6] = x1[10]; + output[7] = _mm256_subs_epi16(__zero, x1[2]); + output[8] = x1[3]; + output[9] = _mm256_subs_epi16(__zero, x1[11]); + output[10] = x1[15]; + output[11] = _mm256_subs_epi16(__zero, x1[7]); + output[12] = x1[5]; + output[13] = _mm256_subs_epi16(__zero, x1[13]); + output[14] = x1[9]; + output[15] = _mm256_subs_epi16(__zero, x1[1]); +} + +static void iadst16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = 
input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] = input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = input[14]; + + // stage 2 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + x1[3] = input[2]; + x1[5] = input[4]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[1]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); + btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); + btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); + btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); + btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); + btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); + 
btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + + // stage 3 + x1[8] = x1[0]; + x1[9] = x1[1]; + + // stage 4 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]); + + // stage 5 + x1[4] = x1[0]; + x1[5] = x1[1]; + + x1[12] = x1[8]; + x1[13] = x1[9]; + + // stage 6 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]); + + // stage 7 + x1[2] = x1[0]; + x1[3] = x1[1]; + x1[6] = x1[4]; + x1[7] = x1[5]; + x1[10] = x1[8]; + x1[11] = x1[9]; + x1[14] = x1[12]; + x1[15] = x1[13]; + + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static INLINE void idct32_high16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[16], x[17]); + btf_16_subs_adds_avx2(x[19], x[18]); + btf_16_adds_subs_avx2(x[20], x[21]); + btf_16_subs_adds_avx2(x[23], x[22]); + btf_16_adds_subs_avx2(x[24], x[25]); + btf_16_subs_adds_avx2(x[27], x[26]); + btf_16_adds_subs_avx2(x[28], 
x[29]); + btf_16_subs_adds_avx2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_avx2(x[16], x[19]); + btf_16_adds_subs_avx2(x[17], x[18]); + btf_16_subs_adds_avx2(x[23], x[20]); + btf_16_subs_adds_avx2(x[22], x[21]); + btf_16_adds_subs_avx2(x[24], x[27]); + btf_16_adds_subs_avx2(x[25], x[26]); + btf_16_subs_adds_avx2(x[31], x[28]); + btf_16_subs_adds_avx2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = 
pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[11]); + btf_16_adds_subs_avx2(x[9], x[10]); + btf_16_subs_adds_avx2(x[15], x[12]); + btf_16_subs_adds_avx2(x[14], x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(x[16], x[23]); + btf_16_adds_subs_avx2(x[17], x[22]); + btf_16_adds_subs_avx2(x[18], x[21]); + btf_16_adds_subs_avx2(x[19], x[20]); + btf_16_subs_adds_avx2(x[31], x[24]); + btf_16_subs_adds_avx2(x[30], x[25]); + btf_16_subs_adds_avx2(x[29], x[26]); + btf_16_subs_adds_avx2(x[28], x[27]); +} + +static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], 
x[15]); + btf_16_adds_subs_avx2(x[1], x[14]); + btf_16_adds_subs_avx2(x[2], x[13]); + btf_16_adds_subs_avx2(x[3], x[12]); + btf_16_adds_subs_avx2(x[4], x[11]); + btf_16_adds_subs_avx2(x[5], x[10]); + btf_16_adds_subs_avx2(x[6], x[9]); + btf_16_adds_subs_avx2(x[7], x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + 
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  // (tail of idct32_low1_new_avx2)

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  output[0] = x[0];
  output[31] = x[0];
  output[1] = x[1];
  output[30] = x[1];
  output[2] = x[1];
  output[29] = x[1];
  output[3] = x[0];
  output[28] = x[0];
  output[4] = x[0];
  output[27] = x[0];
  output[5] = x[1];
  output[26] = x[1];
  output[6] = x[1];
  output[25] = x[1];
  output[7] = x[0];
  output[24] = x[0];
  output[8] = x[0];
  output[23] = x[0];
  output[9] = x[1];
  output[22] = x[1];
  output[10] = x[1];
  output[21] = x[1];
  output[11] = x[0];
  output[20] = x[0];
  output[12] = x[0];
  output[19] = x[0];
  output[13] = x[1];
  output[18] = x[1];
  output[14] = x[1];
  output[17] = x[1];
  output[15] = x[0];
  output[16] = x[0];
}

// 32-point inverse transform for inputs where only the first 8 scan-order
// coefficients are nonzero: butterflies whose inputs are known zero are
// replaced by copies / single-input rotations (btf_16_w16_0_avx2), then the
// shared stage helpers take over from stage 4.
static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): "__rounding" is a reserved identifier in standard C.
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);

  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
  idct32_stage9_avx2(output, x);
}

// 32-point inverse transform for inputs where only the first 16 scan-order
// coefficients are nonzero.
static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(x[8], x[9]);
  btf_16_subs_adds_avx2(x[11], x[10]);
btf_16_adds_subs_avx2(x[12], x[13]);
  // (tail of idct32_low16_new_avx2)
  btf_16_subs_adds_avx2(x[15], x[14]);
  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(x[4], x[5]);
  btf_16_subs_adds_avx2(x[7], x[6]);
  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);

  // stage 6 (comment restored to match idct32_new_avx2 below)
  btf_16_adds_subs_avx2(x[0], x[3]);
  btf_16_adds_subs_avx2(x[1], x[2]);
  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);

  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
  idct32_stage9_avx2(output, x);
}

// Full 32-point inverse DCT over all 32 input coefficients. Stages 2-5 use
// paired-coefficient rotations built here; stages 6-9 defer to the shared
// stage helpers.
static void idct32_new_avx2(const __m256i *input, __m256i *output,
                            int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1: bit-reversal style reordering of the 32 input rows.
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);

  // stage 3
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(x1[8], x1[9]);
  btf_16_subs_adds_avx2(x1[11], x1[10]);
  btf_16_adds_subs_avx2(x1[12], x1[13]);
  btf_16_subs_adds_avx2(x1[15], x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
  btf_16_adds_subs_avx2(x1[4], x1[5]);
  btf_16_subs_adds_avx2(x1[7], x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(x1[0], x1[3]);
  btf_16_adds_subs_avx2(x1[1], x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);

  idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
  idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
  idct32_stage9_avx2(output, x1);
}

// Stage-4 helper for the 64-point inverse transform: paired rotations on
// the odd x[33..62] lanes (declarations and body continue below).
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  (void)cos_bit;
const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  // (body of idct64_stage4_high32_avx2, whose signature is just above)
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}

// Stage 5 of the 64-point inverse transform, high 48 lanes: rotations on
// x[17..30] pairs plus add/sub butterflies folding x[32..63] in groups of 4.
static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  btf_16_adds_subs_avx2(x[32], x[35]);
  btf_16_adds_subs_avx2(x[33], x[34]);
  btf_16_subs_adds_avx2(x[39], x[36]);
  btf_16_subs_adds_avx2(x[38], x[37]);
  btf_16_adds_subs_avx2(x[40], x[43]);
  btf_16_adds_subs_avx2(x[41], x[42]);
  btf_16_subs_adds_avx2(x[47], x[44]);
  btf_16_subs_adds_avx2(x[46], x[45]);
  btf_16_adds_subs_avx2(x[48], x[51]);
  btf_16_adds_subs_avx2(x[49], x[50]);
  btf_16_subs_adds_avx2(x[55], x[52]);
  btf_16_subs_adds_avx2(x[54], x[53]);
  btf_16_adds_subs_avx2(x[56], x[59]);
  btf_16_adds_subs_avx2(x[57], x[58]);
  btf_16_subs_adds_avx2(x[63], x[60]);
  btf_16_subs_adds_avx2(x[62], x[61]);
}

// Stage 6, lanes 32..63 only: 8/56 and 24/40 rotations on eight pairs.
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}

// Stage 6, lanes 16..63: butterflies on the x[16..31] ring, then the
// high-32 rotations above. cos_bit is forwarded to the helper here.
static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  btf_16_adds_subs_avx2(x[16], x[19]);
  btf_16_adds_subs_avx2(x[17], x[18]);
  btf_16_subs_adds_avx2(x[23], x[20]);
  btf_16_subs_adds_avx2(x[22], x[21]);
  btf_16_adds_subs_avx2(x[24], x[27]);
  btf_16_adds_subs_avx2(x[25], x[26]);
  btf_16_subs_adds_avx2(x[31], x[28]);
  btf_16_subs_adds_avx2(x[30], x[29]);
  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
}

// Stage 7, lanes 16..63: 16/48 rotations on x[18..21]/x[26..29] and add/sub
// butterflies folding x[32..63] in groups of 8.
static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  btf_16_adds_subs_avx2(x[32], x[39]);
  btf_16_adds_subs_avx2(x[33], x[38]);
  btf_16_adds_subs_avx2(x[34], x[37]);
  btf_16_adds_subs_avx2(x[35], x[36]);
  btf_16_subs_adds_avx2(x[47], x[40]);
  btf_16_subs_adds_avx2(x[46], x[41]);
  btf_16_subs_adds_avx2(x[45], x[42]);
  btf_16_subs_adds_avx2(x[44], x[43]);
  btf_16_adds_subs_avx2(x[48], x[55]);
  btf_16_adds_subs_avx2(x[49], x[54]);
  btf_16_adds_subs_avx2(x[50], x[53]);
  btf_16_adds_subs_avx2(x[51], x[52]);
  btf_16_subs_adds_avx2(x[63], x[56]);
  btf_16_subs_adds_avx2(x[62], x[57]);
  btf_16_subs_adds_avx2(x[61], x[58]);
  btf_16_subs_adds_avx2(x[60], x[59]);
}

// Stage 8, lanes 16..63: fold x[16..31], then 16/48 rotations on eight of
// the x[36..43]/x[52..59] pairs.
static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i __rounding,
                                             int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(x[16], x[23]);
  btf_16_adds_subs_avx2(x[17], x[22]);
  btf_16_adds_subs_avx2(x[18], x[21]);
  btf_16_adds_subs_avx2(x[19], x[20]);
  btf_16_subs_adds_avx2(x[31], x[24]);
  btf_16_subs_adds_avx2(x[30], x[25]);
  btf_16_subs_adds_avx2(x[29], x[26]);
  btf_16_subs_adds_avx2(x[28], x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}

// Stage 9: fold x[0..15], +/-cospi[32] rotations on x[20..27], and add/sub
// butterflies folding x[32..63] in groups of 16 (continues below).
static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i __rounding,
                                      int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(x[0], x[15]);
  btf_16_adds_subs_avx2(x[1], x[14]);
  btf_16_adds_subs_avx2(x[2], x[13]);
  btf_16_adds_subs_avx2(x[3], x[12]);
  btf_16_adds_subs_avx2(x[4], x[11]);
  btf_16_adds_subs_avx2(x[5], x[10]);
  btf_16_adds_subs_avx2(x[6], x[9]);
btf_16_adds_subs_avx2(x[7], x[8]);
  // (tail of idct64_stage9_avx2)
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(x[32], x[47]);
  btf_16_adds_subs_avx2(x[33], x[46]);
  btf_16_adds_subs_avx2(x[34], x[45]);
  btf_16_adds_subs_avx2(x[35], x[44]);
  btf_16_adds_subs_avx2(x[36], x[43]);
  btf_16_adds_subs_avx2(x[37], x[42]);
  btf_16_adds_subs_avx2(x[38], x[41]);
  btf_16_adds_subs_avx2(x[39], x[40]);
  btf_16_subs_adds_avx2(x[63], x[48]);
  btf_16_subs_adds_avx2(x[62], x[49]);
  btf_16_subs_adds_avx2(x[61], x[50]);
  btf_16_subs_adds_avx2(x[60], x[51]);
  btf_16_subs_adds_avx2(x[59], x[52]);
  btf_16_subs_adds_avx2(x[58], x[53]);
  btf_16_subs_adds_avx2(x[57], x[54]);
  btf_16_subs_adds_avx2(x[56], x[55]);
}

// Stage 10: fold x[0..31] with add/sub butterflies and apply +/-cospi[32]
// rotations to the x[40..47]/x[48..55] pairs.
static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i __rounding,
                                       int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(x[0], x[31]);
  btf_16_adds_subs_avx2(x[1], x[30]);
  btf_16_adds_subs_avx2(x[2], x[29]);
  btf_16_adds_subs_avx2(x[3], x[28]);
  btf_16_adds_subs_avx2(x[4], x[27]);
  btf_16_adds_subs_avx2(x[5], x[26]);
  btf_16_adds_subs_avx2(x[6], x[25]);
  btf_16_adds_subs_avx2(x[7], x[24]);
  btf_16_adds_subs_avx2(x[8], x[23]);
  btf_16_adds_subs_avx2(x[9], x[22]);
  btf_16_adds_subs_avx2(x[10], x[21]);
  btf_16_adds_subs_avx2(x[11], x[20]);
  btf_16_adds_subs_avx2(x[12], x[19]);
  btf_16_adds_subs_avx2(x[13], x[18]);
  btf_16_adds_subs_avx2(x[14], x[17]);
  btf_16_adds_subs_avx2(x[15], x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
}

// Final stage (11): emit the 64 outputs as mirrored add/sub pairs
// (output[i], output[63-i]) = butterfly(x[i], x[63-i]).
static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
}

// 64-point inverse transform when only the DC coefficient is nonzero:
// a single cospi[32] rotation of input[0] produces x[0]/x[1], which are
// fanned out to all 64 outputs (continues below).
static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] =
x[1];
  // (tail of idct64_low1_new_avx2 -- completes "output[21] =" above)
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}

// 64-point inverse transform for inputs with only the first 8 scan-order
// coefficients nonzero: zero-input butterflies collapse to copies /
// single-input rotations, then the shared idct64 stage helpers finish.
static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  x[9] = x[9];  // NOTE(review): self-assignment -- a no-op, likely leftover.
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);

  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse transform for inputs with only the first 16 scan-order
// coefficients nonzero (continues below).
static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47],
x[48]);
  // (tail of idct64_low16_new_avx2 -- completes the stage-2 rotation above)

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(x[8], x[11]);
  btf_16_adds_subs_avx2(x[9], x[10]);
  btf_16_subs_adds_avx2(x[15], x[12]);
  btf_16_subs_adds_avx2(x[14], x[13]);
  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(x[0], x[7]);
  btf_16_adds_subs_avx2(x[1], x[6]);
  btf_16_adds_subs_avx2(x[2], x[5]);
  btf_16_adds_subs_avx2(x[3], x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);

  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse transform for inputs with only the first 32 scan-order
// coefficients nonzero (runs past the end of this chunk).
static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(x[32], x[33]);
  btf_16_subs_adds_avx2(x[35], x[34]);
  btf_16_adds_subs_avx2(x[36], x[37]);
  btf_16_subs_adds_avx2(x[39], x[38]);
  btf_16_adds_subs_avx2(x[40], x[41]);
  btf_16_subs_adds_avx2(x[43], x[42]);
  btf_16_adds_subs_avx2(x[44], x[45]);
  btf_16_subs_adds_avx2(x[47], x[46]);
  btf_16_adds_subs_avx2(x[48], x[49]);
  btf_16_subs_adds_avx2(x[51], x[50]);
  btf_16_adds_subs_avx2(x[52], x[53]);
  btf_16_subs_adds_avx2(x[55], x[54]);
  btf_16_adds_subs_avx2(x[56], x[57]);
  btf_16_subs_adds_avx2(x[59], x[58]);
  btf_16_adds_subs_avx2(x[60], x[61]);
  btf_16_subs_adds_avx2(x[63], x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(x[16], x[17]); + btf_16_subs_adds_avx2(x[19], x[18]); + btf_16_adds_subs_avx2(x[20], x[21]); + btf_16_subs_adds_avx2(x[23], x[22]); + btf_16_adds_subs_avx2(x[24], x[25]); + btf_16_subs_adds_avx2(x[27], x[26]); + btf_16_adds_subs_avx2(x[28], x[29]); + btf_16_subs_adds_avx2(x[31], x[30]); + idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[9]); + btf_16_subs_adds_avx2(x[11], x[10]); + btf_16_adds_subs_avx2(x[12], x[13]); + btf_16_subs_adds_avx2(x[15], x[14]); + idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(x[4], x[5]); + btf_16_subs_adds_avx2(x[7], x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(x[0], x[3]); + btf_16_adds_subs_avx2(x[1], x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[11]); + btf_16_adds_subs_avx2(x[9], x[10]); + btf_16_subs_adds_avx2(x[15], x[12]); + btf_16_subs_adds_avx2(x[14], x[13]); + idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + 
idct64_stage9_avx2(x, cospi, __rounding, cos_bit); + idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage11_avx2(output, x); +} + +// 1D functions process 16 pixels at one time. +static const transform_1d_avx2 + lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL }, + { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2, + NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, + idct32_new_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, + idct64_low32_new_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// only process w >= 16 h >= 16 +static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4; + const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = 
lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + (i << 4) * input_stride; + for (int j = 0; j < buf_size_nonzero_w_div16; ++j) { + __m256i *buf0_cur = buf0 + j * 16; + const int32_t *input_cur = input_row + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur, + 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + + __m256i *buf1_cur = buf1 + (i << 4); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_av2(buf0 + 16 * j, temp, 16); + int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); + transpose_16bit_16x16_avx2(temp, buf1_cur + offset); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i *buf1_cur = buf1 + i * txfm_size_row; + col_txfm(buf1_cur, buf1_cur, cos_bit_col); + round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]); + } + for (int i = 0; i < buf_size_w_div16; i++) { + lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, + stride, ud_flip, txfm_size_row); + } +} + +static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, + int stride, int shift, int height, + int 
txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); + const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale_rounding); + hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } else { + const __m256i rect_scale = + _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + src = _mm256_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale_rounding); + hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, + __m256i *buf, int shift, int height, + int txh_idx) { + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); + const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding); + for (int h = 0; h < height; ++h) { + __m256i lo = 
_mm256_unpacklo_epi16(buf[h], one); + __m256i hi = _mm256_unpackhi_epi16(buf[h], one); + lo = _mm256_madd_epi16(lo, scale_coeff); + hi = _mm256_madd_epi16(hi, scale_coeff); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits); + lo = _mm256_add_epi32(lo, shift_rounding); + hi = _mm256_add_epi32(hi, shift_rounding); + lo = _mm256_srai_epi32(lo, -shift); + hi = _mm256_srai_epi32(hi, -shift); + const __m256i x = _mm256_packs_epi32(lo, hi); + write_recon_w16_avx2(x, output); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size, + int32_t eob) { + (void)eob; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m256i buf[32]; + for (int i = 0; i < input_stride; i += 16) { + iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max, + txw_idx, rect_type); + iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); + const int input_stride = 
txfm_size_col_notzero; + const int buf_size_w_div16 = (eobx + 16) >> 4; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i buf0[64]; + iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); + write_recon_w16_avx2(res, output + (i << 4) + j * stride); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + int ud_flip, lr_flip; + 
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 16; + for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { + __m256i *buf0_cur = buf0 + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride, + buf0_cur, 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + __m256i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_av2(buf0 + 16 * j, temp, 16); + transpose_16bit_16x16_avx2(temp, + _buf1 + 16 * (buf_size_w_div16 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); + } + } + for (int j = 0; j < buf_size_w_div16; ++j) { + iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, + buf1 + j * 16, shift[1], 16, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + 
tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_8X8: + case TX_4X8: + case TX_8X4: + case TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: + case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h new file mode 100644 index 000000000..c17f655c5 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define pair_set_w16_epi16(a, b) \ + _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + +#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \ + { \ + __m256i t0 = _mm256_unpacklo_epi16(in0, in1); \ + __m256i t1 = _mm256_unpackhi_epi16(in0, in1); \ + __m256i u0 = _mm256_madd_epi16(t0, w0); \ + __m256i u1 = _mm256_madd_epi16(t1, w0); \ + __m256i v0 = _mm256_madd_epi16(t0, w1); \ + __m256i v1 = _mm256_madd_epi16(t1, w1); \ + \ + __m256i a0 = _mm256_add_epi32(u0, __rounding); \ + __m256i a1 = _mm256_add_epi32(u1, __rounding); \ + __m256i b0 = _mm256_add_epi32(v0, __rounding); \ + __m256i b1 = _mm256_add_epi32(v1, __rounding); \ + \ + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); \ + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); \ + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); \ + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); \ + \ + out0 = _mm256_packs_epi32(c0, c1); \ + out1 = _mm256_packs_epi32(d0, d1); \ + } + +// half input is zero +#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ + { \ + const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ + const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ + const __m256i _in = in; \ + out0 = _mm256_mulhrs_epi16(_in, _w0); \ + out1 = _mm256_mulhrs_epi16(_in, _w1); \ + } + +#define btf_16_adds_subs_avx2(in0, in1) \ + { \ + const __m256i _in0 = in0; \ + const __m256i _in1 = in1; \ + in0 = _mm256_adds_epi16(_in0, _in1); \ + in1 = _mm256_subs_epi16(_in0, _in1); \ + } + +#define btf_16_subs_adds_avx2(in0, in1) \ + { 
\ + const __m256i _in0 = in0; \ + const __m256i _in1 = in1; \ + in1 = _mm256_subs_epi16(_in0, _in1); \ + in0 = _mm256_adds_epi16(_in0, _in1); \ + } + +#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \ + { \ + const __m256i _in0 = in0; \ + const __m256i _in1 = in1; \ + out0 = _mm256_adds_epi16(_in0, _in1); \ + out1 = _mm256_subs_epi16(_in0, _in1); \ + } + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f + // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f + // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f + // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f + // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f + // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f + // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f + // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f + // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // ... 
+ __m256i a[16]; + for (int i = 0; i < 16; i += 2) { + a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); + a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); + } + __m256i b[16]; + for (int i = 0; i < 16; i += 2) { + b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); + b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + } + __m256i c[16]; + for (int i = 0; i < 16; i += 2) { + c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); + c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + } + out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); + out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); + out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); + out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); + + out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); + out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); + out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); + out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); + + out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); + out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); + out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); + out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); + + out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); + out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); + out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); + out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + __m256i scale = _mm256_set1_epi16(1 << (bit + 15)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_mulhrs_epi16(in[i], scale); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, + int size) { + 
const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm256_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { + __m128i pred = _mm_loadu_si128((__m128i const *)(output)); + __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); + __m128i y = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); + _mm_storeu_si128((__m128i *)(output), y); +} + +static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + write_recon_w16_avx2(in[j], output + i * stride); + } +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +#ifdef __cplusplus +} +#endif + +#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c new file mode 100644 index 000000000..dd7cee24c --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -0,0 +1,2917 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" + +// TODO(binpengsmail@gmail.com): replace some for loop with do {} while + +static void idct4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + 
btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 4 + // stage 5 + output[0] = x[0]; + output[7] = x[0]; + output[1] = x[1]; + output[6] = x[1]; + output[2] = x[1]; + output[5] = x[1]; + output[3] = x[0]; + output[4] = x[0]; +} + +void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_sse2(cospi_p32_p32, 
cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], 
x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}

// Stage 5 of the 16-point inverse DCT: butterflies on the even half
// (x[0..3]), the cos(pi/4) rotation of x[5]/x[6], and the first
// recombination of the odd half (x[8..15]).
// NOTE(review): __rounding and cos_bit appear unused here, but the
// btf_16_sse2 macro presumably expands to code referencing them from the
// enclosing scope -- confirm against the butterfly macro definitions.
static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
}

// Stage 6 of the 16-point inverse DCT: merge the two even quarters and
// rotate x[10]/x[13] and x[11]/x[12] by cos(pi/4).
static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
}

// Final stage of the 16-point inverse DCT: mirrored add/sub pairs
// x[i] +/- x[15-i] written straight to the output array.
static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
}

// DC-only 16-point inverse DCT: with only input[0] nonzero, stages 2-7
// collapse to a single cos(pi/4) scaling producing two values x[0]/x[1],
// replicated across the 16 outputs in the fixed pattern below.
static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 5
  // stage 6
  // stage 7
  output[0] = x[0];
  output[15] = x[0];
  output[1] = x[1];
  output[14] = x[1];
  output[2] = x[1];
  output[13] = x[1];
  output[3] = x[0];
  output[12] = x[0];
  output[4] = x[0];
  output[11] = x[0];
  output[5] = x[1];
  output[10] = x[1];
  output[6] = x[1];
  output[9] = x[1];
  output[7] = x[0];
  output[8] = x[0];
}

// 16-point inverse DCT specialized for inputs whose only nonzero
// coefficients sit in the top-left 8 positions per row; the btf_16_ssse3
// single-input butterflies replace full two-input rotations for the
// coefficients known to be zero.
static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7 are shared with the full-input path
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}

// Full 16-point inverse DCT over 8 lanes of 16-bit coefficients.
// input[] is read in the bit-reversed order required by the butterfly
// network (x[8]=input[1], x[4]=input[2], ...).
void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}

// 16-point inverse DCT variant for 4-wide blocks: identical network, but
// the rotations use btf_16_4p_sse2 (4-pixel form), so stages 5 and 6 are
// written out inline instead of reusing the 8-lane stage helpers.
void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7
  idct16_stage7_sse2(output, x);
}

// Stage 3 of the 32-point inverse DCT, high half only: add/sub butterflies
// across the 16 odd-indexed coefficients x[16..31].
static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
}

// Stage 4 of the 32-point inverse DCT, high half: four rotations on the
// x[16..31] group.  __rounding/cos_bit are consumed by the btf_16_sse2
// macro expansion (see note on idct16_stage5_sse2).
static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
}

// Stage 5 of the 32-point inverse DCT for everything above x[7]:
// rotations on x[9]/x[14] and x[10]/x[13], plus butterflies over x[16..31].
static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
}

// Stage 6 of the 32-point inverse DCT for everything above x[3]:
// cos(pi/4) rotation of x[5]/x[6], butterflies on x[8..15], and four
// rotations on the x[16..31] group.
static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}

// Stage 7 of the 32-point inverse DCT: butterflies over x[0..7],
// cos(pi/4) rotations on x[10..13], butterflies over x[16..31].
static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}

// Stage 8 of the 32-point inverse DCT: butterflies x[i] +/- x[15-i] on the
// low half and cos(pi/4) rotations on x[20..27].
static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
}

// Final stage of the 32-point inverse DCT: mirrored add/sub pairs
// x[i] +/- x[31-i] written to the output array.
static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
}

// DC-only 32-point inverse DCT: a single cos(pi/4) scaling of input[0]
// yields x[0]/x[1], replicated over the 32 outputs in the fixed pattern.
static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  output[0] = x[0];
  output[31] = x[0];
  output[1] = x[1];
  output[30] = x[1];
  output[2] = x[1];
  output[29] = x[1];
  output[3] = x[0];
  output[28] = x[0];
  output[4] = x[0];
  output[27] = x[0];
  output[5] = x[1];
  output[26] = x[1];
  output[6] = x[1];
  output[25] = x[1];
  output[7] = x[0];
  output[24] = x[0];
  output[8] = x[0];
  output[23] = x[0];
  output[9] = x[1];
  output[22] = x[1];
  output[10] = x[1];
  output[21] = x[1];
  output[11] = x[0];
  output[20] = x[0];
  output[12] = x[0];
  output[19] = x[0];
  output[13] = x[1];
  output[18] = x[1];
  output[14] = x[1];
  output[17] = x[1];
  output[15] = x[0];
  output[16] = x[0];
}

// 32-point inverse DCT for inputs with only the first 8 coefficients
// nonzero.  Zero-input butterflies degenerate to plain copies
// (e.g. x[17] = x[16]) before handing off to the shared stage helpers.
static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9 are shared with the full-input path
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

// 32-point inverse DCT for inputs with only the first 16 coefficients
// nonzero; single-input btf_16_ssse3 butterflies replace the full
// rotations in stages 2-5.
static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9 are shared with the full-input path
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

// Full 32-point inverse DCT over 8 lanes of 16-bit coefficients.
// input[] is read in the bit-reversed order required by the butterfly
// network; stages 3-9 reuse the shared stage helpers above.
static void idct32_new_sse2(const __m128i *input, __m128i *output,
                            int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[1] = input[16];
  x[2] = input[8];
  x[3] = input[24];
  x[4] = input[4];
  x[5] = input[20];
  x[6] = input[12];
  x[7] = input[28];
  x[8] = input[2];
  x[9] = input[18];
  x[10] = input[10];
  x[11] = input[26];
  x[12] = input[6];
  x[13] = input[22];
  x[14] = input[14];
  x[15] = input[30];
  x[16] = input[1];
  x[17] = input[17];
  x[18] = input[9];
  x[19] = input[25];
  x[20] = input[5];
  x[21] = input[21];
  x[22] = input[13];
  x[23] = input[29];
  x[24] = input[3];
  x[25] = input[19];
  x[26] = input[11];
  x[27] = input[27];
  x[28] = input[7];
  x[29] = input[23];
  x[30] = input[15];
  x[31] = input[31];

  // stage 2
  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);

  // stage 3
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  // NOTE(review): siblings use btf_16_subs_adds_sse2 for the x[7]/x[6]
  // pair here; both macro variants store sum->first arg, diff->second
  // arg, so the result is the same -- presumably a scheduling choice.
  btf_16_adds_subs_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stage 7~8
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

// Stage 4 of the 64-point inverse DCT, top 32 coefficients: eight
// rotations on the x[32..63] group.  __rounding/cos_bit feed the
// btf_16_sse2 macro expansion (see note on idct16_stage5_sse2).
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}

// Stage 5 of the 64-point inverse DCT for everything above x[15]:
// rotations on the x[16..31] group plus butterflies across x[32..63].
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  btf_16_adds_subs_sse2(x[32], x[35]);
  btf_16_adds_subs_sse2(x[33], x[34]);
  btf_16_subs_adds_sse2(x[39], x[36]);
  btf_16_subs_adds_sse2(x[38], x[37]);
  btf_16_adds_subs_sse2(x[40], x[43]);
  btf_16_adds_subs_sse2(x[41], x[42]);
  btf_16_subs_adds_sse2(x[47], x[44]);
  btf_16_subs_adds_sse2(x[46], x[45]);
  btf_16_adds_subs_sse2(x[48], x[51]);
  btf_16_adds_subs_sse2(x[49], x[50]);
  btf_16_subs_adds_sse2(x[55], x[52]);
  btf_16_subs_adds_sse2(x[54], x[53]);
  btf_16_adds_subs_sse2(x[56], x[59]);
  btf_16_adds_subs_sse2(x[57], x[58]);
  btf_16_subs_adds_sse2(x[63], x[60]);
  btf_16_subs_adds_sse2(x[62], x[61]);
}

// Stage 6 of the 64-point inverse DCT, top 32 coefficients: paired
// rotations on the x[34..61] sub-groups.
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}

// Stage 6 of the 64-point inverse DCT for everything above x[15]:
// butterflies on x[16..31], then the high-32 rotations above.
static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}

// Stage 7 of the 64-point inverse DCT for everything above x[15]:
// rotations on x[18..27] and butterflies across x[32..63].
static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  btf_16_adds_subs_sse2(x[32], x[39]);
  btf_16_adds_subs_sse2(x[33], x[38]);
  btf_16_adds_subs_sse2(x[34], x[37]);
  btf_16_adds_subs_sse2(x[35], x[36]);
  btf_16_subs_adds_sse2(x[47], x[40]);
  btf_16_subs_adds_sse2(x[46], x[41]);
  btf_16_subs_adds_sse2(x[45], x[42]);
  btf_16_subs_adds_sse2(x[44], x[43]);
  btf_16_adds_subs_sse2(x[48], x[55]);
  btf_16_adds_subs_sse2(x[49], x[54]);
  btf_16_adds_subs_sse2(x[50], x[53]);
  btf_16_adds_subs_sse2(x[51], x[52]);
  btf_16_subs_adds_sse2(x[63], x[56]);
  btf_16_subs_adds_sse2(x[62], x[57]);
  btf_16_subs_adds_sse2(x[61], x[58]);
  btf_16_subs_adds_sse2(x[60], x[59]);
}

// Stage 8 of the 64-point inverse DCT for everything above x[15]:
// butterflies on x[16..31] and rotations on x[36..55].
static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}

// Stage 9 of the 64-point inverse DCT: butterflies x[i] +/- x[15-i],
// cos(pi/4) rotations on x[20..27], butterflies across x[32..63].
static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[47]);
  btf_16_adds_subs_sse2(x[33], x[46]);
  btf_16_adds_subs_sse2(x[34], x[45]);
  btf_16_adds_subs_sse2(x[35], x[44]);
  btf_16_adds_subs_sse2(x[36], x[43]);
  btf_16_adds_subs_sse2(x[37], x[42]);
  btf_16_adds_subs_sse2(x[38], x[41]);
  btf_16_adds_subs_sse2(x[39], x[40]);
  btf_16_subs_adds_sse2(x[63], x[48]);
  btf_16_subs_adds_sse2(x[62], x[49]);
  btf_16_subs_adds_sse2(x[61], x[50]);
  btf_16_subs_adds_sse2(x[60], x[51]);
  btf_16_subs_adds_sse2(x[59], x[52]);
  btf_16_subs_adds_sse2(x[58], x[53]);
  btf_16_subs_adds_sse2(x[57], x[54]);
  btf_16_subs_adds_sse2(x[56], x[55]);
}

// Stage 10 of the 64-point inverse DCT: butterflies x[i] +/- x[31-i] and
// cos(pi/4) rotations on x[40..55].
static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
                                       const __m128i __rounding,
                                       int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[31]);
  btf_16_adds_subs_sse2(x[1], x[30]);
  btf_16_adds_subs_sse2(x[2], x[29]);
  btf_16_adds_subs_sse2(x[3], x[28]);
  btf_16_adds_subs_sse2(x[4], x[27]);
  btf_16_adds_subs_sse2(x[5], x[26]);
  btf_16_adds_subs_sse2(x[6], x[25]);
  btf_16_adds_subs_sse2(x[7], x[24]);
  btf_16_adds_subs_sse2(x[8], x[23]);
  btf_16_adds_subs_sse2(x[9], x[22]);
  btf_16_adds_subs_sse2(x[10], x[21]);
  btf_16_adds_subs_sse2(x[11], x[20]);
  btf_16_adds_subs_sse2(x[12], x[19]);
  btf_16_adds_subs_sse2(x[13], x[18]);
  btf_16_adds_subs_sse2(x[14], x[17]);
  btf_16_adds_subs_sse2(x[15], x[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
}

// Final stage of the 64-point inverse DCT: mirrored add/sub pairs
// x[i] +/- x[63-i] written to the output array.
static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
  btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
  btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
  btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
  btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
  btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
  btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
  btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
  btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
  btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
  btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
  btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
  btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
  btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
  btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
  btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
  btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
  btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
  btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
  btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
  btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
  btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
  btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
  btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
  btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
  btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
  btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
  btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
  btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
  btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
  btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
  btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
  btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
}

// DC-only 64-point inverse DCT: a single cos(pi/4) scaling of input[0]
// yields x[0]/x[1], fanned out across the 64 outputs.
// (Definition continues past the end of this chunk.)
static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m128i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] =
x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_m12_m52 = 
pair_set_epi16(-cospi[12], -cospi[52]); + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = 
x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + x[9] = x[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = 
input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + 
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = 
input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[33]); + btf_16_subs_adds_sse2(x[35], x[34]); + btf_16_adds_subs_sse2(x[36], x[37]); + btf_16_subs_adds_sse2(x[39], x[38]); + btf_16_adds_subs_sse2(x[40], x[41]); + btf_16_subs_adds_sse2(x[43], x[42]); + btf_16_adds_subs_sse2(x[44], x[45]); + btf_16_subs_adds_sse2(x[47], x[46]); + btf_16_adds_subs_sse2(x[48], x[49]); + btf_16_subs_adds_sse2(x[51], x[50]); + btf_16_adds_subs_sse2(x[52], x[53]); + btf_16_subs_adds_sse2(x[55], x[54]); + 
btf_16_adds_subs_sse2(x[56], x[57]); + btf_16_subs_adds_sse2(x[59], x[58]); + btf_16_adds_subs_sse2(x[60], x[61]); + btf_16_subs_adds_sse2(x[63], x[62]); + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + 
btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[9] = _mm_madd_epi16(u[1], 
sinpi_p03_m03); + x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3 + x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); + x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); + x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); + + __m128i x2[8]; + x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[5]); + x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 + x2[3] = _mm_add_epi32(x1[3], x1[7]); + x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 + x2[5] = _mm_add_epi32(x1[9], x1[11]); + x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1 + x2[7] = _mm_add_epi32(x1[13], x1[15]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); + __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + out1 = _mm_srai_epi32(out1, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out1); + } +} + +// TODO(binpengsmail@gmail.com): +// To explore the reuse of VP9 versions of corresponding SSE2 functions and +// evaluate whether there is a possibility for further speedup. 
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 + x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 + x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, 
out0); + } +} + +static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); + + // stage 3 + x[4] = x[0]; + x[5] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + + // stage 5 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], 
-cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const 
int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], 
x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static INLINE void iadst16_stage3_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[8]); + btf_16_adds_subs_sse2(x[1], x[9]); + btf_16_adds_subs_sse2(x[2], x[10]); + btf_16_adds_subs_sse2(x[3], x[11]); + btf_16_adds_subs_sse2(x[4], x[12]); + btf_16_adds_subs_sse2(x[5], x[13]); + btf_16_adds_subs_sse2(x[6], x[14]); + btf_16_adds_subs_sse2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + btf_16_adds_subs_sse2(x[8], x[12]); + btf_16_adds_subs_sse2(x[9], x[13]); + btf_16_adds_subs_sse2(x[10], x[14]); + 
btf_16_adds_subs_sse2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + btf_16_adds_subs_sse2(x[8], x[10]); + btf_16_adds_subs_sse2(x[9], x[11]); + btf_16_adds_subs_sse2(x[12], x[14]); + btf_16_adds_subs_sse2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { + const __m128i __zero = _mm_setzero_si128(); + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[8]); + output[2] = x[12]; + output[3] = _mm_subs_epi16(__zero, x[4]); + output[4] = x[6]; + output[5] = _mm_subs_epi16(__zero, x[14]); + output[6] = x[10]; + output[7] = _mm_subs_epi16(__zero, x[2]); + output[8] = x[3]; + output[9] = 
_mm_subs_epi16(__zero, x[11]); + output[10] = x[15]; + output[11] = _mm_subs_epi16(__zero, x[7]); + output[12] = x[5]; + output[13] = _mm_subs_epi16(__zero, x[13]); + output[14] = x[9]; + output[15] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + + // stage 3 + x[8] = x[0]; + x[9] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + + // stage 5 + x[4] = x[0]; + x[5] = x[1]; + x[12] = x[8]; + x[13] = x[9]; + + // stage 6 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + + // stage 7 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + x[10] = x[8]; + x[11] = x[9]; + x[14] = x[12]; + x[15] = x[13]; + + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + x[3] = input[2]; + x[5] = input[4]; + x[7] = input[6]; + x[8] = input[7]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[1]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + btf_16_ssse3(cospi[54], -cospi[10], 
x[3], x[2], x[3]); + btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); + btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); + btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); + btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); + btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); + btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} +void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], 
-cospi[58]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3~9 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = 
pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p26_p38, 
cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + + // stage 4 + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + + // stage 5 + iadst16_stage5_ssse3(x); + + // stage 6 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + + // stage 7 + iadst16_stage7_ssse3(x); + + // stage 8 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); + + // stage 9 + iadst16_stage9_ssse3(output, x); +} + +static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 4; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + output[i] = _mm_adds_epi16(x, input[i]); + } +} + +static void iidentity8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) { + 
output[i] = _mm_adds_epi16(input[i], input[i]); + } +} + +static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 16; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + __m128i srcx2 = _mm_adds_epi16(input[i], input[i]); + output[i] = _mm_adds_epi16(x, srcx2); + } +} + +static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, + __m128i res) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); + return _mm_packus_epi16(x0, x0); +} + +static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride))); + __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); + u = _mm_packus_epi16(u, zero); + *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u); + } +} + +static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]); + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +// 1D functions process process 8 pixels at one time. 
+static const transform_1d_ssse3 + lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 }, + { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, + { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 }, + { idct32_new_sse2, NULL, NULL }, + { idct64_low32_new_ssse3, NULL, NULL }, + }; + +// functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_1d_ssse3 + lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4_new_sse2, idct4_new_sse2, NULL, NULL }, + { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL }, + { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL }, + }, + { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL }, + { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL }, + { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } }, + { + { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, + NULL }, + { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, + NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, + idct32_new_sse2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, + idct64_low32_new_ssse3 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// 1D functions process process 4 pixels at one time. 
+// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 }, + { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 }, + { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 }, + { NULL, NULL, NULL }, + { NULL, NULL, NULL }, + }; + +static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); + const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m128i src = load_32bit_to_16bit(input_row); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } else { + const __m128i rect_scale = + _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m128i src = load_32bit_to_16bit(input_row); + src = _mm_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, + __m128i *buf, int shift, int 
height, + int txh_idx) { + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); + const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); + const __m128i zero = _mm_setzero_si128(); + for (int h = 0; h < height; ++h) { + __m128i lo = _mm_unpacklo_epi16(buf[h], one); + __m128i hi = _mm_unpackhi_epi16(buf[h], one); + lo = _mm_madd_epi16(lo, scale_coeff); + hi = _mm_madd_epi16(hi, scale_coeff); + lo = _mm_srai_epi32(lo, NewSqrt2Bits); + hi = _mm_srai_epi32(hi, NewSqrt2Bits); + lo = _mm_add_epi32(lo, shift_rounding); + hi = _mm_add_epi32(hi, shift_rounding); + lo = _mm_srai_epi32(lo, -shift); + hi = _mm_srai_epi32(hi, -shift); + __m128i x = _mm_packs_epi32(lo, hi); + + const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); + x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); + const __m128i u = _mm_packus_epi16(x, x); + _mm_storel_epi64((__m128i *)(output), u); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size) { + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m128i buf[32]; + + for (int i = 0; i < (input_stride >> 3); ++i) { + iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max, + txw_idx, rect_type); + iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, 
uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size_, int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x4(buf, buf); + row_txfm(buf, buf, cos_bit_row); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, + __m128i res0, __m128i res1) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_unpacklo_epi8(pred, zero); + __m128i x1 = _mm_unpackhi_epi8(pred, zero); + x0 = _mm_adds_epi16(res0, x0); + x1 = _mm_adds_epi16(res1, x1); + return _mm_packus_epi16(x0, x1); +} + +static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, + int size) { + const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = 
input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, + _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col); + round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + 
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + assert(fun_idx < 5); + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i buf0[64]; + iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + uint8_t *out = output + 8 * i; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); + __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); + const __m128i u = lowbd_get_recon_8x8_sse2(v, res); + _mm_storel_epi64((__m128i *)(out), u); + out += stride; + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_ssse3 row_txfm = + 
lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); + } + } + + for (int j = 0; j < buf_size_w_div8; ++j) { + iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride, + buf1 + j * 8, shift[1], 8, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case DCT_DCT: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + 
lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size_, int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x8(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size_, int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; 
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_8x4(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size_, int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = 
input + i * txfm_size_col * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, + row_one_loop); + transpose_16bit_4x8(buf_cur, buf_cur); + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size_, int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + for (int i = 0; i < buf_size_w_div8; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, + txfm_size_row); + transpose_16bit_8x4(buf_cur, buf_cur); + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + if (lr_flip) { + __m128i 
temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col); + round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + default: + lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h new file mode 100644 index 000000000..dc9be25d2 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -0,0 
+1,236 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ + +#include // SSE2 +#include // SSSE3 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define btf_16_ssse3(w0, w1, in, out0, out1) \ + do { \ + const __m128i _w0 = _mm_set1_epi16(w0 * 8); \ + const __m128i _w1 = _mm_set1_epi16(w1 * 8); \ + const __m128i _in = in; \ + out0 = _mm_mulhrs_epi16(_in, _w0); \ + out1 = _mm_mulhrs_epi16(_in, _w1); \ + } while (0) + +#define btf_16_adds_subs_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in0 = _mm_adds_epi16(_in0, _in1); \ + in1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_subs_adds_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in1 = _mm_subs_epi16(_in0, _in1); \ + in0 = _mm_adds_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + out0 = _mm_adds_epi16(_in0, _in1); \ + out1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { + if (bit < 0) { + const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); + for (int i = 0; i < size; ++i) { + 
in[i] = _mm_mulhrs_epi16(in[i], scale); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; + +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 
0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int 
tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? 
eoby_max : eob_fill[eob]; +} + +typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h deleted file mode 100644 index fd0a6ed2c..000000000 --- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef AV1_TXMF1D_SSE2_H_ -#define AV1_TXMF1D_SSE2_H_ - -#include -#include "av1/common/av1_txfm.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); - -void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); - -void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_idct8_new_sse4_1(const 
__m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); - -void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); -void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range); - -static INLINE void transpose_32_4x4(int stride, const __m128i *input, - __m128i *output) { - __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); - __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); - __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); - __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); - - output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); - output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); - output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); - output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); -} - -// the entire input block can be represent by a grid of 4x4 blocks -// each 4x4 blocks can be represent by 4 vertical __m128i -// we first transpose each 4x4 block internally -// then transpose the grid -static INLINE void transpose_32(int txfm_size, const __m128i *input, - __m128i *output) { - const int num_per_128 = 4; - const int row_size = txfm_size; - const int col_size = txfm_size / num_per_128; - int r, c; - - // transpose 
each 4x4 block internally - for (r = 0; r < row_size; r += 4) { - for (c = 0; c < col_size; c++) { - transpose_32_4x4(col_size, &input[r * col_size + c], - &output[c * 4 * col_size + r / 4]); - } - } -} - -static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) { - __m128i tmp, round; - round = _mm_set1_epi32(1 << (bit - 1)); - tmp = _mm_add_epi32(vec, round); - return _mm_srai_epi32(tmp, bit); -} - -static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output, - const int size, const int bit) { - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - output[i] = round_shift_32_sse4_1(input[i], bit); - } - } else { - int i; - for (i = 0; i < size; i++) { - output[i] = _mm_slli_epi32(input[i], -bit); - } - } -} - -// out0 = in0*w0 + in1*w1 -// out1 = -in1*w0 + in0*w1 -#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ - do { \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ - ww0 = _mm_set1_epi32(w0); \ - ww1 = _mm_set1_epi32(w1); \ - in0_w0 = _mm_mullo_epi32(in0, ww0); \ - in1_w1 = _mm_mullo_epi32(in1, ww1); \ - out0 = _mm_add_epi32(in0_w0, in1_w1); \ - out0 = round_shift_32_sse4_1(out0, bit); \ - in0_w1 = _mm_mullo_epi32(in0, ww1); \ - in1_w0 = _mm_mullo_epi32(in1, ww0); \ - out1 = _mm_sub_epi32(in0_w1, in1_w0); \ - out1 = round_shift_32_sse4_1(out1, bit); \ - } while (0) - -// out0 = in0*w0 + in1*w1 -// out1 = in1*w0 - in0*w1 -#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ - do { \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ - ww0 = _mm_set1_epi32(w0); \ - ww1 = _mm_set1_epi32(w1); \ - in0_w0 = _mm_mullo_epi32(in0, ww0); \ - in1_w1 = _mm_mullo_epi32(in1, ww1); \ - out0 = _mm_add_epi32(in0_w0, in1_w1); \ - out0 = round_shift_32_sse4_1(out0, bit); \ - in0_w1 = _mm_mullo_epi32(in0, ww1); \ - in1_w0 = _mm_mullo_epi32(in1, ww0); \ - out1 = _mm_sub_epi32(in1_w0, in0_w1); \ - out1 = round_shift_32_sse4_1(out1, bit); \ - } while (0) - -#ifdef __cplusplus -} -#endif - -#endif // 
AV1_TXMF1D_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h new file mode 100644 index 000000000..721cfe059 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_ + +#include // SSE2 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void btf_16_w4_sse2( + const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, + const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, + __m128i *const out0, __m128i *const out1) { + const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i u0 = _mm_madd_epi16(t0, *w0); + const __m128i v0 = _mm_madd_epi16(t0, *w1); + const __m128i a0 = _mm_add_epi32(u0, __rounding); + const __m128i b0 = _mm_add_epi32(v0, __rounding); + const __m128i c0 = _mm_srai_epi32(a0, cos_bit); + const __m128i d0 = _mm_srai_epi32(b0, cos_bit); + + *out0 = _mm_packs_epi32(c0, c0); + *out1 = _mm_packs_epi32(d0, c0); +} + +#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ + { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i v0 = 
_mm_madd_epi16(t0, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c0); \ + out1 = _mm_packs_epi32(d0, d0); \ + } + +#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ + { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i u1 = _mm_madd_epi16(t1, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + __m128i v1 = _mm_madd_epi16(t1, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i a1 = _mm_add_epi32(u1, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + __m128i b1 = _mm_add_epi32(v1, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c1); \ + out1 = _mm_packs_epi32(d0, d1); \ + } + +static INLINE __m128i load_16bit_to_16bit(const int16_t *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + +// Store 4 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + _mm_store_si128((__m128i *)b, a_1); +} + +// Store 8 16 bit values. Sign extend the values. 
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_hi = _mm_unpackhi_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + const __m128i a_2 = _mm_srai_epi32(a_hi, 16); + _mm_store_si128((__m128i *)b, a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} + +static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { + const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m128i b = _mm_madd_epi16(a, scale_rounding); + return _mm_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); +} + +static INLINE void store_rect_16bit_to_32bit(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i a_hi = _mm_unpackhi_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); + _mm_store_si128((__m128i *)(b + 4), b_hi); +} + +static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + 
out[i] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, + uint16_t 
*out, + const int stride) { + for (int i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_adds_epi16(in[i], rounding); + in[i] = _mm_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const 
int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +typedef struct { + transform_1d_sse2 col, row; // vertical and horizontal +} transform_2d_sse2; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c new file mode 100644 index 000000000..cccc62f03 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c @@ -0,0 +1,10 @@ +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { + __m128i *const vec = (__m128i *)arr; + const int vec_size = size >> 2; + av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); +} diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h new file mode 100644 index 000000000..faf7251fa --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -0,0 +1,60 @@ +#ifndef AV1_TXFM_SSE4_H_ +#define AV1_TXFM_SSE4_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { + __m128i tmp, round; + round = _mm_set1_epi32(1 << (bit - 1)); + tmp = _mm_add_epi32(vec, round); + return _mm_srai_epi32(tmp, bit); +} + +static INLINE void 
av1_round_shift_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit) { + const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = _mm_slli_epi32(input[i], -bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c new file mode 100644 index 000000000..a8bfdcce6 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_avx2.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                            \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                      \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                      \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                       \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(     \
+      TX_SIZE tx_size) {                                                    \
+    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {      \
+      subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                       \
+      subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                       \
+      subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                     \
+      subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                     \
+      cfl_subsample_##bd##_null,            /* 64x64 (invalid CFL size) */  \
+      subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                       \
+      subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                       \
+      subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                      \
+      subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                      \
+      subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                     \
+      subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                     \
+      cfl_subsample_##bd##_null,            /* 32x64 (invalid CFL size) */  \
+      cfl_subsample_##bd##_null,            /* 64x32 (invalid CFL size) */  \
+      subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16 */                      \
+      subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4 */                      \
+      subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32 */                      \
+      subsample_##bd##_##sub##_32x8_avx2,   /* 32x8 */                      \
+      cfl_subsample_##bd##_null,            /* 16x64 (invalid CFL size) */  \
+      cfl_subsample_##bd##_null,            /* 64x16 (invalid CFL size) */  \
+    };                                                                      \
+    return subfn_##sub[tx_size];                                            \
+  }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;                               // Forever 32
+  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
+  const int luma_stride = input_stride << 1;
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
+
+    _mm256_storeu_si256(row, sum_16x16);
+
+    input += luma_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;                                // Forever 32
+  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+    _mm256_storeu_si256(row, top_16x16);
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on blocks of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  const __m256i zeros = _mm256_setzero_si256();
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
+
+    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
+    row_lo = _mm256_slli_epi16(row_lo, 3);
+    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
+    row_hi = _mm256_slli_epi16(row_hi, 3);
+
+    _mm256_storeu_si256(row, row_lo);
+    _mm256_storeu_si256(row + 1, row_hi);
+
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  const int luma_stride = input_stride << 1;
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+    __m256i sum = _mm256_add_epi16(top, bot);
+
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
+
+    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
+    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+    hsum = _mm256_add_epi16(hsum, hsum);
+
+    _mm256_storeu_si256(row, hsum);
+
+    input += luma_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    __m256i hsum = _mm256_hadd_epi16(top, top_1);
+    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+    hsum = _mm256_slli_epi16(hsum, 2);
+
+    _mm256_storeu_si256(row, hsum);
+
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
+    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+                                        __m256i alpha_sign, __m256i dc_q0) {
+  __m256i ac_q3 = _mm256_loadu_si256(input);
+  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+  __m256i scaled_luma_q0 =
+      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  (void)width;
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+  do {
+    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+    res = _mm256_packus_epi16(res, next);
+    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_storeu_si256((__m256i *)dst, res);
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd);
+CFL_PREDICT_X(avx2, 32, 16, lbd);
+CFL_PREDICT_X(avx2, 32, 32, lbd);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+    predict_lbd_4x4_ssse3,   /* 4x4 */
+    predict_lbd_8x8_ssse3,   /* 8x8 */
+    predict_lbd_16x16_ssse3, /* 16x16 */
+    predict_lbd_32x32_avx2,  /* 32x32 */
+    cfl_predict_lbd_null,    /* 64x64 (invalid CFL size) */
+    predict_lbd_4x8_ssse3,   /* 4x8 */
+    predict_lbd_8x4_ssse3,   /* 8x4 */
+    predict_lbd_8x16_ssse3,  /* 8x16 */
+    predict_lbd_16x8_ssse3,  /* 16x8 */
+    predict_lbd_16x32_ssse3, /* 16x32 */
+    predict_lbd_32x16_avx2,  /* 32x16 */
+    cfl_predict_lbd_null,    /* 32x64 (invalid CFL size) */
+    cfl_predict_lbd_null,    /* 64x32 (invalid CFL size) */
+    predict_lbd_4x16_ssse3,  /* 4x16 */
+    predict_lbd_16x4_ssse3,  /* 16x4 */
+    predict_lbd_8x32_ssse3,  /* 8x32 */
+    predict_lbd_32x8_avx2,   /* 32x8 */
+    cfl_predict_lbd_null,    /* 16x64 (invalid CFL size) */
+    cfl_predict_lbd_null,    /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+  // function pointer array out of bounds.
+  return pred[tx_size % TX_SIZES_ALL];
+}
+
+static __m256i highbd_max_epi16(int bd) {
+  const __m256i neg_one = _mm256_set1_epi16(-1);
+  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
+}
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  // Use SSSE3 version for smaller widths
+  assert(width == 16 || width == 32);
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+  const __m256i max = highbd_max_epi16(bd);
+
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    _mm256_storeu_si256((__m256i *)dst,
+                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+    if (width == 32) {
+      const __m256i res_1 =
+          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm256_storeu_si256(
+          (__m256i *)(dst + 16),
+          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+    }
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+    predict_hbd_4x4_ssse3,  /* 4x4 */
+    predict_hbd_8x8_ssse3,  /* 8x8 */
+    predict_hbd_16x16_avx2, /* 16x16 */
+    predict_hbd_32x32_avx2, /* 32x32 */
+    cfl_predict_hbd_null,   /* 64x64 (invalid CFL size) */
+    predict_hbd_4x8_ssse3,  /* 4x8 */
+    predict_hbd_8x4_ssse3,  /* 8x4 */
+    predict_hbd_8x16_ssse3, /* 8x16 */
+    predict_hbd_16x8_avx2,  /* 16x8 */
+    predict_hbd_16x32_avx2, /* 16x32 */
+    predict_hbd_32x16_avx2, /* 32x16 */
+    cfl_predict_hbd_null,   /* 32x64 (invalid CFL size) */
+    cfl_predict_hbd_null,   /* 64x32 (invalid CFL size) */
+    predict_hbd_4x16_ssse3, /* 4x16 */
+    predict_hbd_16x4_avx2,  /* 16x4 */
+    predict_hbd_8x32_ssse3, /* 8x32 */
+    predict_hbd_32x8_avx2,  /* 32x8 */
+    cfl_predict_hbd_null,   /* 16x64 (invalid CFL size) */
+    cfl_predict_hbd_null,   /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+  // function pointer array out of bounds.
+  return pred[tx_size % TX_SIZES_ALL];
+}
+
+// Returns a vector where all the (32-bits) elements are the sum of all the
+// lanes in a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+  // Given that a == [A, B, C, D, E, F, G, H]
+  a = _mm256_hadd_epi32(a, a);
+  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
+  // a == [A', C', A', C', E', G', E', G']
+  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  // a == [A', C', E', G', A', C', E', G']
+  a = _mm256_hadd_epi32(a, a);
+  // Given that A'' == A' + C' and E'' == E' + G'
+  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
+  return _mm256_hadd_epi32(a, a);
+  // Given that A''' == A'' + E''
+  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
+                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
+}
+
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  // Use SSE2 version for smaller widths
+  assert(width == 16 || width == 32);
+
+  const __m256i *src = (__m256i *)src_ptr;
+  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+  // To maximize usage of the AVX2 registers, we sum two rows per loop
+  // iteration
+  const int step = 2 * CFL_BUF_LINE_I256;
+
+  __m256i sum = _mm256_setzero_si256();
+  // For width 32, we use a second sum accumulator to reduce accumulator
+  // dependencies in the loop.
+  __m256i sum2;
+  if (width == 32) sum2 = _mm256_setzero_si256();
+
+  do {
+    // Add top row to the bottom row
+    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+    if (width == 32) { /* Don't worry, this if it gets optimized out. */
+      // Add the second part of the top row to the second part of the bottom row
+      __m256i l1 =
+          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+    }
+    src += step;
+  } while (src < end);
+  // Combine both sum accumulators
+  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+  __m256i fill = fill_sum_epi32(sum);
+
+  __m256i avg_epi16 = _mm256_srli_epi32(
+      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+  // Store and subtract loop
+  src = (__m256i *)src_ptr;
+  __m256i *dst = (__m256i *)dst_ptr;
+  do {
+    _mm256_storeu_si256(dst,
+                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+    if (width == 32) {
+      _mm256_storeu_si256(
+          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+    }
+    src += CFL_BUF_LINE_I256;
+    dst += CFL_BUF_LINE_I256;
+  } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+    subtract_average_4x4_sse2,   /* 4x4 */
+    subtract_average_8x8_sse2,   /* 8x8 */
+    subtract_average_16x16_avx2, /* 16x16 */
+    subtract_average_32x32_avx2, /* 32x32 */
+    cfl_subtract_average_null,   /* 64x64 (invalid CFL size) */
+    subtract_average_4x8_sse2,   /* 4x8 */
+    subtract_average_8x4_sse2,   /* 8x4 */
+    subtract_average_8x16_sse2,  /* 8x16 */
+    subtract_average_16x8_avx2,  /* 16x8 */
+    subtract_average_16x32_avx2, /* 16x32 */
+    subtract_average_32x16_avx2, /* 32x16 */
+    cfl_subtract_average_null,   /* 32x64 (invalid CFL size) */
+    cfl_subtract_average_null,   /* 64x32 (invalid CFL size) */
+    subtract_average_4x16_sse2,  /* 4x16 */
+    subtract_average_16x4_avx2,  /* 16x4 */
+    subtract_average_8x32_sse2,  /* 8x32 */
+    subtract_average_32x8_avx2,  /* 32x8 */
+    cfl_subtract_average_null,   /* 16x64 (invalid CFL size) */
+    cfl_subtract_average_null,   /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+  // index the function pointer array out of bounds.
+  return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 000000000..7479ac3e1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSE2 version is optimal for width == 4, we reuse them in AVX2
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for width == 8, we reuse them in AVX2
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+
+void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+
+void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                             int dst_stride, int alpha_q3);
+void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                             int dst_stride, int alpha_q3);
+
+void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                             int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                             int dst_stride, int alpha_q3, int bd);
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 000000000..4783fe098
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
+  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+  const __m128i *src = (__m128i *)src_ptr;
+  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
+
+  __m128i sum = zeros;
+  do {
+    __m128i l0;
+    if (width == 4) {
+      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                             _mm_unpacklo_epi16(l1, zeros)));
+    } else {
+      if (width == 8) {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src),
+                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+      } else {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+      }
+      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                             _mm_unpackhi_epi16(l0, zeros)));
+      if (width == 32) {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                               _mm_unpackhi_epi16(l0, zeros)));
+      }
+    }
+    src += step;
+  } while (src < end);
+
+  sum = fill_sum_epi32(sum);
+
+  __m128i avg_epi16 =
+      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+  src = (__m128i *)src_ptr;
+  __m128i *dst = (__m128i *)dst_ptr;
+  do {
+    if (width == 4) {
+      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+    } else {
+      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+      if (width > 8) {
+        _mm_storeu_si128(dst + 1,
+                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+        if (width == 32) {
+          _mm_storeu_si128(dst + 2,
+                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+          _mm_storeu_si128(dst + 3,
+                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+        }
+      }
+    }
+    src += CFL_BUF_LINE_I128;
+    dst += CFL_BUF_LINE_I128;
+  } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 000000000..bbf007295
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+  return _mm_cvtsi32_si128(*((int *)mem_addr));
+}
+
+// Store 32-bit integer from the first element of a into memory.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+  *((int *)mem_addr) = _mm_cvtsi128_si32(a);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i twos = _mm_set1_epi8(2);
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      __m128i top = _mm_loadh_epi32((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storeh_epi32(pred_buf_m128i, sum);
+    } else if (width == 8) {
+      __m128i top = _mm_loadl_epi64((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storel_epi64(pred_buf_m128i, sum);
+    } else {
+      __m128i top = _mm_loadu_si128((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storeu_si128(pred_buf_m128i, sum);
+      if (width == 32) {
+        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        __m128i bot_1 =
+            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+        top_1 = _mm_maddubs_epi16(top_1, twos);
+        bot_1 = _mm_maddubs_epi16(bot_1, twos);
+        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+      }
+    }
+    input += luma_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i fours = _mm_set1_epi8(4);
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+  do {
+    if (width == 4) {
+      __m128i top = _mm_loadh_epi32((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storeh_epi32(pred_buf_m128i, top);
+    } else if (width == 8) {
+      __m128i top = _mm_loadl_epi64((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storel_epi64(pred_buf_m128i, top);
+    } else {
+      __m128i top = _mm_loadu_si128((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storeu_si128(pred_buf_m128i, top);
+      if (width == 32) {
+        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        top_1 = _mm_maddubs_epi16(top_1, fours);
+        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+      }
+    }
+    input += input_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i zeros = _mm_setzero_si128();
+  const int luma_stride = input_stride;
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+  do {
+    if (width == 4) {
+      __m128i row = _mm_loadh_epi32((__m128i *)input);
+      row = _mm_unpacklo_epi8(row, zeros);
+      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
+    } else if (width == 8) {
+      __m128i row = _mm_loadl_epi64((__m128i *)input);
+      row = _mm_unpacklo_epi8(row, zeros);
+      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
+    } else {
+      __m128i row = _mm_loadu_si128((__m128i *)input);
+      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
+      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
+      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+      if (width == 32) {
+        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+      }
+    }
+    input += luma_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */ +static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + sum = _mm_hadd_epi16(sum, sum); + *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + if (width == 8) { + sum = _mm_hadd_epi16(sum, sum); + _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); + _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i bot_2 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i bot_3 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); + const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); + const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); + __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, + _mm_add_epi16(next_sum, next_sum)); + } + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. 
+ * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + if (width == 8) { + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + } + pred_buf_m128i += CFL_BUF_LINE_I128; + input += input_stride; + } while (pred_buf_m128i < end); +} + +static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); + _mm_storel_epi64((__m128i *)pred_buf_q3, row); + } else { + const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); + _mm_storeu_si128((__m128i *)pred_buf_q3, row); + 
if (width >= 16) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + row_1 = _mm_slli_epi16(row_1, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); + if (width == 32) { + __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); + row_2 = _mm_slli_epi16(row_2, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); + __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); + row_3 = _mm_slli_epi16(row_3, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); + } + } + } + input += input_stride; + pred_buf_q3 += CFL_BUF_LINE; + } while (pred_buf_q3 < end); +} + +CFL_GET_SUBSAMPLE_FUNCTION(ssse3) + +static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + __m128i ac_q3 = _mm_loadu_si128(input); + __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) + _mm_storeh_epi32((__m128i *)dst, res); + else + _mm_storel_epi64((__m128i *)dst, res); + } else { + __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)dst, res); + if (width == 32) { + res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + next = predict_unclipped(row + 3, 
alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)(dst + 16), res); + } + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, lbd) + +static INLINE __m128i highbd_max_epi16(int bd) { + const __m128i neg_one = _mm_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + const __m128i max = highbd_max_epi16(bd); + const __m128i zeros = _mm_setzero_si128(); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + res = highbd_clamp_epi16(res, zeros, max); + if (width == 4) { + _mm_storel_epi64((__m128i *)dst, res); + } else { + _mm_storeu_si128((__m128i *)dst, res); + } + if (width >= 16) { + const __m128i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128(((__m128i *)dst) + 1, + highbd_clamp_epi16(res_1, zeros, max)); + } + if (width == 32) { + const __m128i res_2 = + predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 16), + highbd_clamp_epi16(res_2, zeros, max)); + const __m128i res_3 = + predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 24), + highbd_clamp_epi16(res_3, zeros, max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + 
+CFL_PREDICT_FN(ssse3, hbd) diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c new file mode 100644 index 000000000..fd5e90a2e --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + 
assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i 
*)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + // Combine V round and 2F-H-V round into a single rounding + res_a = + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); + res_b = + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + if (w - j > 4) { + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + } else if (w == 4) { + xx_storel_32(p_0, res_0); + xx_storel_32(p_1, res_1); + } else { + *(uint16_t *)p_0 = 
_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); +} + +void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memcpy(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] 
= _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c index e4d352c0e..fc0e65453 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -11,197 +11,20 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_sse2(const uint8_t *src, int 
src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - DECLARE_ALIGNED(16, uint8_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, 
coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_packus_epi16(res, res); - _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 
5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint8_t *data = &im_block[i * im_stride + j]; - const __m128i src_01 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))); - const __m128i src_23 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))); - const __m128i src_45 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))); - const __m128i src_67 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))); - - const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero); - const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero); - const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero); - const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero); - const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero); - const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, 
coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); - } - } - } - } -} -#else -void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { const int bd = 8; DECLARE_ALIGNED(16, int16_t, @@ -211,10 +34,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); + const int bits = + 
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); /* Horizontal filter */ { @@ -237,7 +64,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); for (i = 0; i < im_h; ++i) { @@ -302,10 +129,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + const __m128i sum_round = + _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { @@ -358,24 +189,285 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + 
res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + if (w == 2) { + *(uint16_t *)p = _mm_cvtsi128_si32(res); + } else if (w == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res); + } else { + _mm_storel_epi64(p, res); + } + } + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); +} + +void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + 
if (w == 2) { + do { + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memcpy(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[5] = 
_mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const __m128i zero = _mm_setzero_si128(); + const __m128i left_shift = _mm_cvtsi32_si128(bits); + int i, j; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + 
const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + assert((w % 4) == 0); + + if (!(w % 16)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); + + const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); + const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); + + const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); + const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); + + const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); + const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const); + if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j])); + const __m128i data_ref_0_hi = + _mm_loadu_si128((__m128i *)(&dst[j + 8])); + + const __m128i comp_avg_res_lo = + comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg); + + const __m128i round_result_lo = convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + + const __m128i comp_avg_res_hi = + comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg); + + const __m128i round_result_hi = convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = + _mm_packus_epi16(round_result_lo, round_result_hi); + + _mm_store_si128((__m128i *)(&dst0[j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); + _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i d8 = 
_mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + + const __m128i res = _mm_sll_epi16(d16_0, left_shift); + const __m128i res_unsigned = _mm_add_epi16(res, offset_const); + + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); + else + *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); } } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; } } } -#endif diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c index a0e58716d..6fdfb0954 100644 --- a/third_party/aom/av1/common/x86/convolve_avx2.c +++ b/third_party/aom/av1/common/x86/convolve_avx2.c @@ -11,332 +11,267 @@ #include -#include "aom_dsp/aom_dsp_common.h" -#include "./av1_rtcd.h" - -#if CONFIG_CONVOLVE_ROUND -static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; - -// 16 epi16 pixels -static INLINE void pixel_clamp_avx2(__m256i *u, int bd) { - const __m256i one = _mm256_set1_epi16(1); - const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(*u, max); - clamped = _mm256_andnot_si256(mask, *u); - mask = _mm256_and_si256(mask, max); - clamped = _mm256_or_si256(mask, clamped); - - const __m256i zero = _mm256_setzero_si256(); - mask = _mm256_cmpgt_epi16(clamped, zero); - *u = _mm256_and_si256(clamped, mask); -} - -// 8 epi16 pixels -static INLINE void pixel_clamp_sse2(__m128i *u, int bd) { - const __m128i 
one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i clamped, mask; - - mask = _mm_cmpgt_epi16(*u, max); - clamped = _mm_andnot_si128(mask, *u); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - - const __m128i zero = _mm_setzero_si128(); - mask = _mm_cmpgt_epi16(clamped, zero); - *u = _mm_and_si128(clamped, mask); -} - -// Work on multiple of 32 pixels -static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst, - const __m256i *rnd, int shift, - int num) { - do { - __m256i x0 = _mm256_loadu_si256((const __m256i *)src); - __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); - __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); - __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); - - x0 = _mm256_add_epi32(x0, *rnd); - x1 = _mm256_add_epi32(x1, *rnd); - x2 = _mm256_add_epi32(x2, *rnd); - x3 = _mm256_add_epi32(x3, *rnd); - - x0 = _mm256_srai_epi32(x0, shift); - x1 = _mm256_srai_epi32(x1, shift); - x2 = _mm256_srai_epi32(x2, shift); - x3 = _mm256_srai_epi32(x3, shift); - - x0 = _mm256_packs_epi32(x0, x1); - x2 = _mm256_packs_epi32(x2, x3); - - pixel_clamp_avx2(&x0, 8); - pixel_clamp_avx2(&x2, 8); - - x0 = _mm256_packus_epi16(x0, x2); - x1 = _mm256_loadu_si256((const __m256i *)sindex); - x2 = _mm256_permutevar8x32_epi32(x0, x1); - - _mm256_storeu_si256((__m256i *)dst, x2); - src += 32; - dst += 32; - num--; - } while (num > 0); -} - -static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst, - const __m256i *rnd, int shift) { - __m256i x0 = _mm256_loadu_si256((const __m256i *)src); - __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); - - x0 = _mm256_add_epi32(x0, *rnd); - x1 = _mm256_add_epi32(x1, *rnd); - - x0 = _mm256_srai_epi32(x0, shift); - x1 = _mm256_srai_epi32(x1, shift); - - x0 = _mm256_packs_epi32(x0, x1); - pixel_clamp_avx2(&x0, 8); - - const __m256i x2 = _mm256_packus_epi16(x0, x0); - x1 = _mm256_loadu_si256((const __m256i 
*)sindex); - x0 = _mm256_permutevar8x32_epi32(x2, x1); - - _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0)); -} - -static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst, - const __m256i *rnd, int shift) { - __m256i x0 = _mm256_loadu_si256((const __m256i *)src); - x0 = _mm256_add_epi32(x0, *rnd); - x0 = _mm256_srai_epi32(x0, shift); - - x0 = _mm256_packs_epi32(x0, x0); - pixel_clamp_avx2(&x0, 8); - - x0 = _mm256_packus_epi16(x0, x0); - const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex); - x0 = _mm256_permutevar8x32_epi32(x0, x1); +#include "config/av1_rtcd.h" - _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0)); -} - -static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst, - const __m128i *rnd, int shift) { - __m128i x = _mm_loadu_si128((const __m128i *)src); - x = _mm_add_epi32(x, *rnd); - x = _mm_srai_epi32(x, shift); - - x = _mm_packs_epi32(x, x); - pixel_clamp_sse2(&x, 8); - - x = _mm_packus_epi16(x, x); - *(uint32_t *)dst = _mm_cvtsi128_si32(x); -} - -void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - int bits) { - const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); - const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); - - if (w > 64) { // width = 128 - do { - cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 32) { // width = 64 - do { - cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 16) { // width = 32 - do { - cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 8) { // width = 16 - do { - cal_rounding_16_avx2(src, dst, &rnd_num, bits); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 4) { // width = 8 - 
do { - cal_rounding_8_avx2(src, dst, &rnd_num, bits); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 2) { // width = 4 - do { - cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else { // width = 2 - do { - dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits)); - dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits)); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + // right shift is F-1 because we are already dividing + // filter co-efficients by 2 + const int right_shift_bits = (FILTER_BITS - 1); + const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); + const __m256i right_shift_const = + _mm256_set1_epi16((1 << right_shift_bits) >> 1); + __m256i coeffs[4], s[8]; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); + + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + 
_mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + + s[0] = s[1]; + s[1] = 
s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } -#if CONFIG_HIGHBITDEPTH -static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src, - uint16_t *dst, - const __m256i *rnd, int shift, - int num, int bd) { - do { - __m256i x0 = _mm256_loadu_si256((const __m256i *)src); - __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); - __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); - __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); - - x0 = _mm256_add_epi32(x0, *rnd); - x1 = _mm256_add_epi32(x1, *rnd); - x2 = _mm256_add_epi32(x2, *rnd); - x3 = _mm256_add_epi32(x3, *rnd); - - x0 = _mm256_srai_epi32(x0, shift); - x1 = _mm256_srai_epi32(x1, shift); - x2 = _mm256_srai_epi32(x2, shift); - x3 = _mm256_srai_epi32(x3, shift); - - x0 = _mm256_packs_epi32(x0, x1); - x2 = _mm256_packs_epi32(x2, x3); - - pixel_clamp_avx2(&x0, bd); - pixel_clamp_avx2(&x2, bd); - - x0 = _mm256_permute4x64_epi64(x0, 0xD8); - x2 = _mm256_permute4x64_epi64(x2, 0xD8); - - _mm256_storeu_si256((__m256i *)dst, x0); - _mm256_storeu_si256((__m256i *)(dst + 16), x2); - src += 32; - dst += 32; - num--; - } while (num > 0); -} - -static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src, - uint16_t *dst, - const __m256i *rnd, int shift, - int bd) { - __m256i x0 = _mm256_loadu_si256((const __m256i *)src); - __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); - - x0 = _mm256_add_epi32(x0, *rnd); - x1 = _mm256_add_epi32(x1, *rnd); - - x0 = _mm256_srai_epi32(x0, shift); - x1 = _mm256_srai_epi32(x1, shift); - - x0 = _mm256_packs_epi32(x0, x1); - pixel_clamp_avx2(&x0, bd); - - x0 = _mm256_permute4x64_epi64(x0, 0xD8); - _mm256_storeu_si256((__m256i *)dst, x0); -} - -static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst, - const __m256i *rnd, int shift, - int bd) { - __m256i x = _mm256_loadu_si256((const __m256i *)src); - x = _mm256_add_epi32(x, *rnd); - x = _mm256_srai_epi32(x, shift); - - x = _mm256_packs_epi32(x, x); - 
pixel_clamp_avx2(&x, bd); - - x = _mm256_permute4x64_epi64(x, 0xD8); - _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x)); -} - -static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst, - const __m128i *rnd, int shift, - int bd) { - __m128i x = _mm_loadu_si128((const __m128i *)src); - x = _mm_add_epi32(x, *rnd); - x = _mm_srai_epi32(x, shift); - - x = _mm_packs_epi32(x, x); - pixel_clamp_sse2(&x, bd); - _mm_storel_epi64((__m128i *)dst, x); -} - -void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, - uint8_t *dst8, int dst_stride, int w, - int h, int bits, int bd) { - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); - const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); - - if (w > 64) { // width = 128 - do { - cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 32) { // width = 64 - do { - cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 16) { // width = 32 - do { - cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 8) { // width = 16 - do { - cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 4) { // width = 8 - do { - cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else if (w > 2) { // width = 4 - do { - cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); - } else { // width = 2 - do { - dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd); - dst[1] = 
clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd); - src += src_stride; - dst += dst_stride; - h--; - } while (h > 0); +void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + + __m256i filt[4], coeffs[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); + + const __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + (void)filter_params_y; + (void)subpel_y_q4; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = + _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); + + /* 
rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 + // 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } } } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_CONVOLVE_ROUND diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c new file mode 100644 index 000000000..18fe9ae5a --- /dev/null +++ 
b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d = 
_mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + return d; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * 
FILTER_BITS))); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + + if (w <= 4) { + __m128i s[8], src6, res, res_round, res16; + uint32_t res_int; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = res_int; + else + *(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + res = convolve_lo_y(s + 1, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = res_int; + else + 
*(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + 
res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_q4; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + + if (w <= 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 
7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + uint32_t r = _mm_cvtsi128_si32(res); + if (w == 2) + *(uint16_t *)dst = r; + else + *(uint32_t *)dst = r; + + src_ptr += src_stride; + dst += dst_stride; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + j += 8; + } while (j < w); + } while (++i < h); + } +} diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c index 4f77da446..c11edc1d4 100644 --- a/third_party/aom/av1/common/x86/filterintra_sse4.c +++ b/third_party/aom/av1/common/x86/filterintra_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License @@ -11,888 +11,65 @@ #include -#include "./av1_rtcd.h" -#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" -#if USE_3TAP_INTRA_FILTER -void filterintra_sse4_3tap_dummy_func(void); -void filterintra_sse4_3tap_dummy_func(void) {} -#else - -static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left, - __m128i *sum) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - - __m128i u0 = _mm_unpacklo_epi8(a, zero); - __m128i u1 = _mm_unpacklo_epi8(l, zero); - - sum[0] = _mm_add_epi16(u0, u1); -} - -static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsSmall(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 4; - sum_value >>= 3; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsSmall(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 8; - sum_value >>= 4; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left, - 
__m128i *sum) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - - __m128i u0 = _mm_unpacklo_epi8(a, zero); - __m128i u1 = _mm_unpacklo_epi8(l, zero); - - sum[0] = _mm_add_epi16(u0, u1); - - u0 = _mm_unpackhi_epi8(a, zero); - u1 = _mm_unpackhi_epi8(l, zero); - - sum[0] = _mm_add_epi16(sum[0], u0); - sum[0] = _mm_add_epi16(sum[0], u1); -} - -static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsLarge(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 16; - sum_value >>= 5; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector[2], u; - uint16_t sum_value; - - AddPixelsLarge(above, left, &sum_vector[0]); - AddPixelsLarge(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values - - u = _mm_srli_si128(sum_vector[0], 2); - sum_vector[0] = _mm_add_epi16(sum_vector[0], u); - - sum_value = _mm_extract_epi16(sum_vector[0], 0); - sum_value += 32; - sum_value >>= 6; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// params[4] : mean value, 4 int32_t repetition -// -static INLINE int CalcRefPixelsMeanValue(const uint8_t *above, - const uint8_t *left, int 
bs, - __m128i *params) { - int meanValue = 0; - switch (bs) { - case 4: meanValue = GetMeanValue4x4(above, left, params); break; - case 8: meanValue = GetMeanValue8x8(above, left, params); break; - case 16: meanValue = GetMeanValue16x16(above, left, params); break; - case 32: meanValue = GetMeanValue32x32(above, left, params); break; - default: assert(0); - } - return meanValue; -} - -// Note: -// params[0-3] : 4-tap filter coefficients (int32_t per coefficient) -// -static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) { - const TX_SIZE tx_size = - (bs == 32) ? TX_32X32 - : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4))); - // c0 - params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0]); - // c1 - params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1]); - // c2 - params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2]); - // c3 - params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3]); -} - -static const int maxBlkSize = 32; - -static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride)); - __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride)); - __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride)); - - p0 = 
_mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - *((int *)dst) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0); -} - -static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3; - int r = 0; - - while (r < 8) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - r += 1; - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storel_epi64((__m128i *)dst, p0); - dst += stride; - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)dst, p0); - dst += stride; - r += 1; - } -} - -static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3; - int r = 0; - - while (r < 16) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = 
_mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storel_epi64((__m128i *)dst, p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dst + 8), p0); - dst += stride; - r += 1; - } -} - -static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - int r = 0; - - while (r < 32) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); - - p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16)); - p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20)); - p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24)); - p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p4 = _mm_add_epi32(p4, mean[0]); - p5 = _mm_add_epi32(p5, mean[0]); - p6 = _mm_add_epi32(p6, mean[0]); - p7 = _mm_add_epi32(p7, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - p4 = _mm_packus_epi32(p4, p5); - p5 = _mm_packus_epi32(p6, p7); - p4 = _mm_packus_epi16(p4, p5); - - _mm_storel_epi64((__m128i *)dst, p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dst + 8), p0); - - _mm_storel_epi64((__m128i *)(dst + 16), p4); - p4 = _mm_srli_si128(p4, 8); - _mm_storel_epi64((__m128i *)(dst + 24), p4); - - dst += stride; - r += 1; - } -} - -static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst, - ptrdiff_t stride) { - switch (bs) { - case 4: 
SavePred4x4(pred, mean, dst, stride); break; - case 8: SavePred8x8(pred, mean, dst, stride); break; - case 16: SavePred16x16(pred, mean, dst, stride); break; - case 32: SavePred32x32(pred, mean, dst, stride); break; - default: assert(0); - } -} - -typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred, - const int predStride); - -static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 2) = x; - - sum = _mm_extract_epi32(u0, 2); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 3) = x; - - sum = _mm_extract_epi32(u0, 3); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 4) = x; -} - -static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 
2) = x; - - sum = _mm_extract_epi32(u0, 2); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 3) = x; -} - -static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 2) = x; -} - -static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; -} - -static ProducePixelsFunc prodPixelsFuncTab[4] = { - ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels -}; - -static void ProducePixels(int *pred, const __m128i *prm, int remain) { - __m128i p[3]; - const int predStride = (maxBlkSize << 1) + 1; - int index; - - p[0] = _mm_loadu_si128((const __m128i *)pred); - p[1] = _mm_loadu_si128((const __m128i *)(pred + 1)); - p[2] = _mm_loadu_si128((const __m128i *)(pred + 2)); - - if (remain <= 2) { - return; - } - if (remain > 5) { - index = 3; - } else { - index = remain - 3; - } - prodPixelsFuncTab[index](p, prm, pred, predStride); -} - -// Note: -// 
At column index c, the remaining pixels are R = 2 * bs + 1 - r - c -// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 -static void GeneratePrediction(const uint8_t *above, const uint8_t *left, - const int bs, const __m128i *prm, int meanValue, - uint8_t *dst, ptrdiff_t stride) { - int pred[33][65]; - int r, c, colBound; - int remainings; - - for (r = 0; r < bs; ++r) { - pred[r + 1][0] = (int)left[r] - meanValue; - } - - above -= 1; - for (c = 0; c < 2 * bs + 1; ++c) { - pred[0][c] = (int)above[c] - meanValue; - } - - r = 0; - c = 0; - while (r < bs) { - colBound = (bs << 1) - r; - for (c = 0; c < colBound; c += 4) { - remainings = colBound - c + 1; - ProducePixels(&pred[r][c], prm, remainings); - } - r += 1; - } - - SavePrediction(&pred[1][1], &prm[4], bs, dst, stride); -} - -static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs, - __m128i *prm, uint8_t *dst, ptrdiff_t stride) { - int meanValue = 0; - meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]); - GeneratePrediction(above, left, bs, prm, meanValue, dst, stride); -} - -void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, DC_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, V_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, H_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - 
GetIntraFilterParams(bs, D45_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D135_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D117_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D153_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D207_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D63_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, TM_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -// ============== High Bit Depth ============== -#if CONFIG_HIGHBITDEPTH -static INLINE int HighbdGetMeanValue4x4(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - __m128i 
sum_vector, u; - uint16_t sum_value; - (void)bd; - - sum_vector = _mm_add_epi16(a, l); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 4; - sum_value >>= 3; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int HighbdGetMeanValue8x8(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - (void)bd; - - sum_vector = _mm_add_epi16(a, l); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 8; - sum_value >>= 4; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// Process 16 pixels above and left, 10-bit depth -// Add to the last 8 pixels sum -static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left, - __m128i *sum) { - __m128i a = _mm_loadu_si128((const __m128i *)above); - __m128i l = _mm_loadu_si128((const __m128i *)left); - sum[0] = _mm_add_epi16(a, l); - a = _mm_loadu_si128((const __m128i *)(above + 8)); - l = _mm_loadu_si128((const __m128i *)(left + 8)); - sum[0] = _mm_add_epi16(sum[0], a); - sum[0] = _mm_add_epi16(sum[0], l); -} - -// Note: -// Process 16 pixels above and left, 12-bit depth -// Add to the last 8 pixels sum -static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left, - __m128i *sum) { - __m128i a = _mm_loadu_si128((const __m128i *)above); - __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = 
_mm_setzero_si128(); - __m128i v0, v1; - - v0 = _mm_unpacklo_epi16(a, zero); - v1 = _mm_unpacklo_epi16(l, zero); - sum[0] = _mm_add_epi32(v0, v1); - - v0 = _mm_unpackhi_epi16(a, zero); - v1 = _mm_unpackhi_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); - - a = _mm_loadu_si128((const __m128i *)(above + 8)); - l = _mm_loadu_si128((const __m128i *)(left + 8)); - - v0 = _mm_unpacklo_epi16(a, zero); - v1 = _mm_unpacklo_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); - - v0 = _mm_unpackhi_epi16(a, zero); - v1 = _mm_unpackhi_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); -} - -static INLINE int HighbdGetMeanValue16x16(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint32_t sum_value = 0; - - if (10 == bd) { - AddPixels10bit(above, left, &sum_vector); - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - sum_value = _mm_extract_epi16(sum_vector, 0); - } else if (12 == bd) { - AddPixels12bit(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi32(sum_vector, zero); - u = _mm_srli_si128(sum_vector, 4); - sum_vector = _mm_add_epi32(u, sum_vector); - sum_value = _mm_extract_epi32(sum_vector, 0); - } - - sum_value += 16; - sum_value >>= 5; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int HighbdGetMeanValue32x32(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector[2], u; - uint32_t sum_value = 0; - - if (10 == bd) { - AddPixels10bit(above, left, &sum_vector[0]); - AddPixels10bit(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = 
_mm_add_epi16(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values - - u = _mm_srli_si128(sum_vector[0], 2); - sum_vector[0] = _mm_add_epi16(sum_vector[0], u); - sum_value = _mm_extract_epi16(sum_vector[0], 0); - } else if (12 == bd) { - AddPixels12bit(above, left, &sum_vector[0]); - AddPixels12bit(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero); - u = _mm_srli_si128(sum_vector[0], 4); - sum_vector[0] = _mm_add_epi32(u, sum_vector[0]); - sum_value = _mm_extract_epi32(sum_vector[0], 0); - } - - sum_value += 32; - sum_value >>= 6; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// params[4] : mean value, 4 int32_t repetition -// -static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above, - const uint16_t *left, int bs, - const int bd, __m128i *params) { - int meanValue = 0; - switch (bs) { - case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break; - case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break; - case 16: - meanValue = HighbdGetMeanValue16x16(above, left, bd, params); - break; - case 32: - meanValue = HighbdGetMeanValue32x32(above, left, bd, params); - break; - default: assert(0); - } - return meanValue; -} - -// Note: -// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c -// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 -static void HighbdGeneratePrediction(const uint16_t *above, - const uint16_t *left, const int bs, - const int bd, const __m128i *prm, - int meanValue, uint16_t *dst, - ptrdiff_t stride) { - int pred[33][65]; - int r, c, colBound; - int remainings; - int ipred; - - for (r = 0; r < bs; ++r) { - pred[r + 1][0] = (int)left[r] - meanValue; - } - - above -= 1; - for (c = 0; c < 2 * bs + 1; ++c) { - 
pred[0][c] = (int)above[c] - meanValue; - } - - r = 0; - c = 0; - while (r < bs) { - colBound = (bs << 1) - r; - for (c = 0; c < colBound; c += 4) { - remainings = colBound - c + 1; - ProducePixels(&pred[r][c], prm, remainings); +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + // The initialization is just for silencing Jenkins static analysis warnings + for (r = 0; r < bh + 1; ++r) + memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]); + const __m128i filter_intra_scale_bits = + _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS)); + + for (r = 1; r < bh + 1; r += 2) { + for (c = 1; c < bw + 1; c += 4) { + DECLARE_ALIGNED(16, uint8_t, p[8]); + memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); + p[5] = buffer[r][c - 1]; + p[6] = buffer[r + 1][c - 1]; + p[7] = 0; + const __m128i p_b = xx_loadl_64(p); + const __m128i in = _mm_unpacklo_epi64(p_b, p_b); + const __m128i out_01 = _mm_maddubs_epi16(in, f1f0); + const __m128i out_23 = _mm_maddubs_epi16(in, f3f2); + const __m128i out_45 = _mm_maddubs_epi16(in, f5f4); + const __m128i out_67 = _mm_maddubs_epi16(in, f7f6); + const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23); + const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67); + const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); + // Rounding + const __m128i round_w = + _mm_mulhrs_epi16(out_01234567, 
filter_intra_scale_bits); + const __m128i out_r = _mm_packus_epi16(round_w, round_w); + const __m128i out_r1 = _mm_srli_si128(out_r, 4); + // Storing + xx_storel_32(&buffer[r][c], out_r); + xx_storel_32(&buffer[r + 1][c], out_r1); } - r += 1; } - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - ipred = pred[r + 1][c + 1] + meanValue; - dst[c] = clip_pixel_highbd(ipred, bd); - } + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); dst += stride; } } - -static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left, - int bs, const int bd, __m128i *prm, - uint16_t *dst, ptrdiff_t stride) { - int meanValue = 0; - meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]); - HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride); -} - -void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, DC_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, V_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, H_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D45_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d135_filter_predictor_sse4_1(uint16_t 
*dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D135_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D117_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D153_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D207_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D63_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, TM_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} -#endif // CONFIG_HIGHBITDEPTH - -#endif // USE_3TAP_INTRA_FILTER diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c new file mode 100644 index 000000000..a34c618d0 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2017, 
Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); 
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + res_b_round = + _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), + round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + 
res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m256i s[8]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 
2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); + s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); + s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); + s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); + + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); + _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); + _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); + _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); + _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); +} + +void av1_highbd_convolve_2d_copy_sr_avx2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + (void)bd; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst 
+= dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c new file mode 100644 index 000000000..bdf813fa0 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m128i s[16]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); + s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); + s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); + 
s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); + s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); + s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); + s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); + s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); + _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); + _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); + _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); + _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); + _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); + _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); + _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); + _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); +} + +void av1_highbd_convolve_2d_copy_sr_sse2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + (void)bd; + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + __m128i s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + 
src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 8), 
s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c new file mode 100644 index 000000000..5d2fc465e --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "av1/common/convolve.h" + +void av1_highbd_jnt_convolve_2d_copy_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const __m128i offset_const_16b = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits <= 4); + + if (!(w % 8)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 8) { + const __m128i src_16bit = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i res = _mm_sll_epi16(src_16bit, left_shift); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = + _mm_add_epi32(res_32b_lo, offset_const); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 4) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + const __m128i 
src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); + + const __m128i res = _mm_sll_epi16(src_10, left_shift); + + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i res_32b = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_1 = _mm_srli_si128(res_clip, 8); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_jnt_convolve_2d_sse4_1( + const uint16_t *src, int 
src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + 
_mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i 
src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + + const __m128i res_unsigned_lo = + _mm_add_epi32(res_lo_round, offset_const); + + if (w < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } else { + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_unsigned_hi = + _mm_add_epi32(res_hi_round, offset_const); + + if (do_average) { + const __m128i data_lo = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_hi = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); + + const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); + const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_lo = + highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, + &rounding_const, 
rounding_shift); + const __m128i round_result_hi = + highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c index 195f0f570..a9cf6a4d6 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -12,375 +12,209 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); +void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; + int im_stride 
= 8; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + __m128i coeffs_x[4], coeffs_y[4], s[16]; + + const __m128i round_const_x = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 
= - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = 
_mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const 
__m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); - } + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); } } - } -} -#else -void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int do_average = conv_params->do_average; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = 
_mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = + _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = + _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 
+ 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 15-bit intermediate array. 
- assert(conv_params->round_0 >= 5); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - 
_mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const 
__m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 
7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; } } } } -#endif diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index 0e833e6d9..debb05a6d 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -11,8 +11,9 @@ #include #include -#include "./av1_rtcd.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/av1_inv_txfm1d_cfg.h" // Note: @@ -85,17 +86,6 @@ static void load_buffer_32x32(const int32_t *coeff, __m256i *in) { } } -static void round_shift_32x32(__m256i *in, int shift) { - __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); - int i = 0; - - while (i < 128) { - in[i] = _mm256_add_epi32(in[i], rnding); - in[i] = _mm256_srai_epi32(in[i], shift); - i++; - } -} - static __m256i highbd_clamp_epi32(__m256i x, int bd) { const __m256i zero = _mm256_setzero_si256(); const 
__m256i one = _mm256_set1_epi16(1); @@ -120,7 +110,7 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, (void)fliplr; (void)flipud; - round_shift_32x32(in, shift); + __m256i round = _mm256_set1_epi32((1 << shift) >> 1); while (i < 128) { u0 = _mm256_loadu_si256((const __m256i *)output); @@ -136,6 +126,16 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20); v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31); + v0 = _mm256_add_epi32(v0, round); + v1 = _mm256_add_epi32(v1, round); + v2 = _mm256_add_epi32(v2, round); + v3 = _mm256_add_epi32(v3, round); + + v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift)); + v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift)); + v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift)); + v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift)); + v0 = _mm256_add_epi32(v0, x0); v1 = _mm256_add_epi32(v1, x1); v2 = _mm256_add_epi32(v2, x2); @@ -167,7 +167,53 @@ static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, return x; } -static void idct32_avx2(__m256i *in, __m256i *out, int bit) { +static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1, + __m256i *out0, __m256i *out1) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + *out0 = a0; + *out1 = a1; +} + +static void addsub_shift_avx2(const __m256i in0, const __m256i in1, + __m256i *out0, __m256i *out1, + const __m256i *clamp_lo, const __m256i *clamp_hi, + int shift) { + __m256i 
offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i in0_w_offset = _mm256_add_epi32(in0, offset); + __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); + __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + *out0 = a0; + *out1 = a1; +} + +static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); @@ -220,6 +266,9 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32], bf0[32]; int col; @@ -334,22 +383,15 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); bf1[15] = half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); - bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]); - bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]); - bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]); - bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]); - bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]); - bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]); - bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]); - bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]); - bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]); - bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]); - bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]); - bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]); - bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]); - bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]); - bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]); - bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]); + + addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf0[0] = bf1[0]; @@ -363,14 +405,12 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[6] = half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); bf0[7] = 
half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); - bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]); - bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]); - bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]); - bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]); - bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]); - bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]); - bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]); - bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]); + + addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; bf0[17] = half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); @@ -405,10 +445,8 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); bf1[3] = half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); - bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]); - bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]); - bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]); - bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]); + addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); @@ -421,42 +459,28 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf1[14] = half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); bf1[15] = bf0[15]; - bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]); - bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]); - bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]); - bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]); - bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]); - bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]); - bf1[22] = 
_mm256_add_epi32(bf0[21], bf0[22]); - bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]); - bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]); - bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]); - bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]); - bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]); - bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]); - bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]); - bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]); - bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]); + addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); // stage 6 - bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]); - bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]); - bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]); - bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]); + addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); bf0[4] = bf1[4]; bf0[5] = half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[6] = half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[7] = bf1[7]; - bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]); - bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]); - bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]); - bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]); - bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]); - bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]); - bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]); - bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]); + addsub_avx2(bf1[8], 
bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = @@ -483,14 +507,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[31] = bf1[31]; // stage 7 - bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]); - bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]); - bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]); - bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]); - bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]); - bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]); - bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]); - bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]); + addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = @@ -503,40 +523,24 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]); - bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]); - bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]); - bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]); - bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]); - bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]); - bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]); - bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]); - bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]); - bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]); - bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]); - bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]); - bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]); - bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]); - bf1[30] 
= _mm256_add_epi32(bf0[25], bf0[30]); - bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]); + addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); // stage 8 - bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]); - bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]); - bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]); - bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]); - bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]); - bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]); - bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]); - bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]); - bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]); - bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]); - bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]); - bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]); - bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]); - bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]); - bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]); - bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]); + addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, 
&clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = bf1[18]; @@ -563,58 +567,91 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[31] = bf1[31]; // stage 9 - out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]); - out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]); - out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]); - out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]); - out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]); - out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]); - out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]); - out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]); - out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]); - out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]); - out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]); - out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]); - out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]); - out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]); - out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]); - out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]); - out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]); - out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]); - out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]); - out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]); - out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]); - out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]); - out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]); - out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]); - out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]); - out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]); - out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]); - out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]); - out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]); - out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]); - out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]); - out[31 * 4 + col] = 
_mm256_sub_epi32(bf0[0], bf0[31]); + if (do_cols) { + addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col, + out + 31 * 4 + col); + addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col, + out + 30 * 4 + col); + addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col, + out + 29 * 4 + col); + addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col, + out + 28 * 4 + col); + addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col, + out + 27 * 4 + col); + addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col, + out + 26 * 4 + col); + addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col, + out + 25 * 4 + col); + addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col, + out + 24 * 4 + col); + addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col, + out + 23 * 4 + col); + addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col, + out + 22 * 4 + col); + addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col, + out + 21 * 4 + col); + addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col, + out + 20 * 4 + col); + addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col, + out + 19 * 4 + col); + addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col, + out + 18 * 4 + col); + addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col, + out + 17 * 4 + col); + addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col, + out + 16 * 4 + col); + } else { + addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + 
col, out + 26 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col, + out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col, + out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col, + out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col, + out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col, + out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col, + out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + } } } void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m256i in[128], out[128]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = inv_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); switch (tx_type) { case DCT_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_32; - col_cfg = &inv_txfm_1d_col_cfg_dct_32; load_buffer_32x32(coeff, in); transpose_32x32(in, out); - idct32_avx2(out, in, row_cfg->cos_bit[2]); - round_shift_32x32(in, -row_cfg->shift[0]); + idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_32x32(in, out); - idct32_avx2(out, in, col_cfg->cos_bit[2]); - write_buffer_32x32(in, 
output, stride, 0, 0, -row_cfg->shift[1], bd); + idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd); break; default: assert(0); } diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 8613bed86..801a4133b 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -11,8 +11,9 @@ #include #include /* SSE4.1 */ -#include "./av1_rtcd.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" @@ -23,13 +24,82 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); } -static void idct4x4_sse4_1(__m128i *in, int bit) { +static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, + __m128i *out1, const __m128i *clamp_lo, + const __m128i *clamp_hi) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + *out0 = a0; + *out1 = a1; +} + +static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i in0_w_offset = _mm_add_epi32(in0, offset); + __m128i a0 = _mm_add_epi32(in0_w_offset, in1); + __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); 
+ a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + *out0 = a0; + *out1 = a1; +} + +static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i a0 = _mm_add_epi32(offset, in0); + __m128i a1 = _mm_sub_epi32(offset, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; @@ -65,84 +135,72 @@ static void idct4x4_sse4_1(__m128i *in, int bit) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - in[0] = _mm_add_epi32(v0, v3); - in[1] = _mm_add_epi32(v1, v2); - in[2] = _mm_sub_epi32(v1, v2); - in[3] = _mm_sub_epi32(v0, v3); + addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); } static void iadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i zero = _mm_setzero_si128(); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3, x, y; + __m128i v0, v1, v2, v3; v0 = _mm_unpacklo_epi32(in[0], in[1]); v1 = _mm_unpackhi_epi32(in[0], in[1]); v2 = _mm_unpacklo_epi32(in[2], in[3]); v3 = _mm_unpackhi_epi32(in[2], in[3]); - u0 = _mm_unpacklo_epi64(v0, v2); - u1 = _mm_unpackhi_epi64(v0, v2); - u2 = _mm_unpacklo_epi64(v1, v3); - u3 = _mm_unpackhi_epi64(v1, v3); - - // stage 0 - // stage 1 - u1 = _mm_sub_epi32(zero, u1); - u3 = _mm_sub_epi32(zero, u3); - - // stage 2 - v0 = 
u0; - v1 = u3; - x = _mm_mullo_epi32(u1, cospi32); - y = _mm_mullo_epi32(u2, cospi32); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); - - v3 = _mm_sub_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); - - // stage 3 - u0 = _mm_add_epi32(v0, v2); - u1 = _mm_add_epi32(v1, v3); - u2 = _mm_sub_epi32(v0, v2); - u3 = _mm_sub_epi32(v1, v3); - - // stage 4 - x = _mm_mullo_epi32(u0, cospi8); - y = _mm_mullo_epi32(u1, cospi56); - in[3] = _mm_add_epi32(x, y); - in[3] = _mm_add_epi32(in[3], rnding); - in[3] = _mm_srai_epi32(in[3], bit); - - x = _mm_mullo_epi32(u0, cospi56); - y = _mm_mullo_epi32(u1, cospim8); - in[0] = _mm_add_epi32(x, y); - in[0] = _mm_add_epi32(in[0], rnding); - in[0] = _mm_srai_epi32(in[0], bit); - - x = _mm_mullo_epi32(u2, cospi40); - y = _mm_mullo_epi32(u3, cospi24); - in[1] = _mm_add_epi32(x, y); - in[1] = _mm_add_epi32(in[1], rnding); - in[1] = _mm_srai_epi32(in[1], bit); - - x = _mm_mullo_epi32(u2, cospi24); - y = _mm_mullo_epi32(u3, cospim40); - in[2] = _mm_add_epi32(x, y); - in[2] = _mm_add_epi32(in[2], rnding); - in[2] = _mm_srai_epi32(in[2], bit); + x0 = _mm_unpacklo_epi64(v0, v2); + x1 = _mm_unpackhi_epi64(v0, v2); + x2 = _mm_unpacklo_epi64(v1, v3); + x3 = _mm_unpackhi_epi64(v1, v3); + + s0 = _mm_mullo_epi32(x0, sinpi1); + s1 = _mm_mullo_epi32(x0, sinpi2); + s2 = _mm_mullo_epi32(x1, sinpi3); + s3 = _mm_mullo_epi32(x2, sinpi4); + s4 = _mm_mullo_epi32(x2, sinpi1); + s5 = _mm_mullo_epi32(x3, sinpi2); + s6 = _mm_mullo_epi32(x3, sinpi4); + t = _mm_sub_epi32(x0, x2); + s7 = _mm_add_epi32(t, x3); + + t = _mm_add_epi32(s0, s3); + s0 = _mm_add_epi32(t, s5); + t = _mm_sub_epi32(s1, s4); + s1 = _mm_sub_epi32(t, s6); + s3 = s2; + s2 = _mm_mullo_epi32(s7, sinpi3); + + u0 = _mm_add_epi32(s0, s3); + u1 = _mm_add_epi32(s1, s3); + u2 = s2; + t = _mm_add_epi32(s0, s1); + u3 = _mm_sub_epi32(t, s3); + + u0 = _mm_add_epi32(u0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(u1, 
rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(u2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(u3, rnding); + u3 = _mm_srai_epi32(u3, bit); + + in[0] = u0; + in[1] = u1; + in[2] = u2; + in[3] = u3; } static INLINE void round_shift_4x4(__m128i *in, int shift) { @@ -232,84 +290,65 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = inv_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_4; - col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, row_cfg->cos_bit[2]); - idct4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - idct4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, 
inv_cos_bit_row[txw_idx][txh_idx]); + idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - idct4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, 
inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_4; - col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); - iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; -#endif // CONFIG_EXT_TX default: assert(0); } } @@ -334,7 +373,8 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); } -static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); @@ -347,6 +387,9 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x, y; @@ -413,16 +456,12 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - v4 = _mm_add_epi32(u4, u5); - v5 = _mm_sub_epi32(u4, u5); - v6 = _mm_sub_epi32(u7, u6); - v7 = _mm_add_epi32(u6, u7); + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); // stage 4 - u0 = _mm_add_epi32(v0, v3); - u1 = _mm_add_epi32(v1, v2); - u2 = _mm_sub_epi32(v1, v2); - u3 = _mm_sub_epi32(v0, v3); + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); u4 = v4; u7 = v7; @@ -437,195 +476,334 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { u5 = _mm_srai_epi32(u5, bit); // stage 5 - out[0 * 2 + col] = _mm_add_epi32(u0, u7); - out[1 * 2 + col] = _mm_add_epi32(u1, u6); - out[2 * 2 + col] = _mm_add_epi32(u2, u5); - out[3 * 2 + col] = _mm_add_epi32(u3, u4); - out[4 * 2 + col] = _mm_sub_epi32(u3, u4); - out[5 * 2 + col] = _mm_sub_epi32(u2, u5); - out[6 * 2 + col] = _mm_sub_epi32(u1, u6); - out[7 * 2 + col] = _mm_sub_epi32(u0, u7); + if (do_cols) { + addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col); + addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col); + addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); + addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col); + } else { + addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, + &clamp_lo, &clamp_hi, out_shift); + 
addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, + &clamp_lo, &clamp_hi, out_shift); + } } } -static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i zero = _mm_setzero_si128(); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x, y; - int col; + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; - // Note: - // Even column: 0, 2, ..., 14 - // Odd column: 1, 3, ..., 15 - // one even column plus one odd column constructs one row (8 coeffs) - // total we have 8 rows (8x8). - for (col = 0; col < 2; ++col) { - // stage 0 - // stage 1 - u0 = in[2 * 0 + col]; - u1 = _mm_sub_epi32(zero, in[2 * 7 + col]); - u2 = _mm_sub_epi32(zero, in[2 * 3 + col]); - u3 = in[2 * 4 + col]; - u4 = _mm_sub_epi32(zero, in[2 * 1 + col]); - u5 = in[2 * 6 + col]; - u6 = in[2 * 2 + col]; - u7 = _mm_sub_epi32(zero, in[2 * 5 + col]); - - // stage 2 - v0 = u0; - v1 = u1; - - x = _mm_mullo_epi32(u2, cospi32); - y = _mm_mullo_epi32(u3, cospi32); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); - - v3 = _mm_sub_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); - - v4 = u4; - v5 = u5; - - x = _mm_mullo_epi32(u6, cospi32); - y = _mm_mullo_epi32(u7, cospi32); - v6 = _mm_add_epi32(x, y); - v6 = _mm_add_epi32(v6, rnding); - v6 = _mm_srai_epi32(v6, bit); - - v7 = _mm_sub_epi32(x, y); - v7 = _mm_add_epi32(v7, rnding); - v7 = _mm_srai_epi32(v7, bit); - - // stage 3 - u0 = _mm_add_epi32(v0, v2); - u1 = _mm_add_epi32(v1, v3); - u2 = _mm_sub_epi32(v0, v2); - u3 = _mm_sub_epi32(v1, v3); - u4 = _mm_add_epi32(v4, v6); - u5 = _mm_add_epi32(v5, v7); - u6 = _mm_sub_epi32(v4, v6); - u7 = _mm_sub_epi32(v5, v7); - - // stage 4 - v0 = u0; - v1 = u1; - v2 = u2; - v3 = u3; - - x = _mm_mullo_epi32(u4, cospi16); - y = _mm_mullo_epi32(u5, cospi48); - v4 = _mm_add_epi32(x, y); - v4 = _mm_add_epi32(v4, rnding); - v4 = _mm_srai_epi32(v4, bit); - - x = _mm_mullo_epi32(u4, cospi48); - y = _mm_mullo_epi32(u5, cospim16); - v5 = _mm_add_epi32(x, y); - v5 = _mm_add_epi32(v5, rnding); - v5 = _mm_srai_epi32(v5, bit); - - x = _mm_mullo_epi32(u6, cospim48); - y = _mm_mullo_epi32(u7, 
cospi16); - v6 = _mm_add_epi32(x, y); - v6 = _mm_add_epi32(v6, rnding); - v6 = _mm_srai_epi32(v6, bit); - - x = _mm_mullo_epi32(u6, cospi16); - y = _mm_mullo_epi32(u7, cospi48); - v7 = _mm_add_epi32(x, y); - v7 = _mm_add_epi32(v7, rnding); - v7 = _mm_srai_epi32(v7, bit); - - // stage 5 - u0 = _mm_add_epi32(v0, v4); - u1 = _mm_add_epi32(v1, v5); - u2 = _mm_add_epi32(v2, v6); - u3 = _mm_add_epi32(v3, v7); - u4 = _mm_sub_epi32(v0, v4); - u5 = _mm_sub_epi32(v1, v5); - u6 = _mm_sub_epi32(v2, v6); - u7 = _mm_sub_epi32(v3, v7); - - // stage 6 - x = _mm_mullo_epi32(u0, cospi4); - y = _mm_mullo_epi32(u1, cospi60); - v0 = _mm_add_epi32(x, y); - v0 = _mm_add_epi32(v0, rnding); - v0 = _mm_srai_epi32(v0, bit); + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); 
+ x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - x = _mm_mullo_epi32(u0, cospi60); - y = _mm_mullo_epi32(u1, cospim4); - v1 = _mm_add_epi32(x, y); - v1 = _mm_add_epi32(v1, rnding); - v1 = _mm_srai_epi32(v1, bit); + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - x = _mm_mullo_epi32(u2, cospi20); - y = _mm_mullo_epi32(u3, cospi44); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + 
addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + } else { + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi, + out_shift); + } - x = _mm_mullo_epi32(u2, cospi44); - y = _mm_mullo_epi32(u3, cospim20); - v3 = _mm_add_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = 
_mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - x = _mm_mullo_epi32(u4, cospi36); - y = _mm_mullo_epi32(u5, cospi28); - v4 = _mm_add_epi32(x, y); - v4 = _mm_add_epi32(v4, rnding); - v4 = _mm_srai_epi32(v4, bit); - - x = _mm_mullo_epi32(u4, cospi28); - y = _mm_mullo_epi32(u5, cospim36); - v5 = _mm_add_epi32(x, y); - v5 = _mm_add_epi32(v5, rnding); - v5 = _mm_srai_epi32(v5, bit); - - x = _mm_mullo_epi32(u6, cospi52); - y = _mm_mullo_epi32(u7, cospi12); - v6 = _mm_add_epi32(x, y); - v6 = _mm_add_epi32(v6, rnding); - v6 = _mm_srai_epi32(v6, bit); - - x = _mm_mullo_epi32(u6, cospi12); - y = _mm_mullo_epi32(u7, cospim52); - v7 = _mm_add_epi32(x, y); - v7 = _mm_add_epi32(v7, rnding); - v7 = _mm_srai_epi32(v7, bit); + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], 
&v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - // stage 7 - out[2 * 0 + col] = v1; - out[2 * 1 + col] = v6; - out[2 * 2 + col] = v3; - out[2 * 3 + col] = v4; - out[2 * 4 + col] = v5; - out[2 * 5 + col] = v2; - out[2 * 6 + col] = v7; - out[2 * 7 + col] = v0; + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = 
_mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); + } else { + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi, + out_shift); } } @@ -708,102 +886,92 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = inv_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_8; - col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, 
bd, + -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, 
-shift[1], bd); break; case DCT_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_8; - col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); + iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 
0, bd, + -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; -#endif // CONFIG_EXT_TX default: assert(0); } } @@ -868,7 +1036,8 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd); } -static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); @@ -894,6 +1063,9 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], v[16], x, y; int col; @@ -945,14 +1117,10 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); - u[8] = _mm_add_epi32(v[8], v[9]); - u[9] = _mm_sub_epi32(v[8], v[9]); - u[10] = _mm_sub_epi32(v[11], v[10]); - u[11] = _mm_add_epi32(v[10], v[11]); - u[12] = _mm_add_epi32(v[12], v[13]); - u[13] = _mm_sub_epi32(v[12], v[13]); - u[14] = _mm_sub_epi32(v[15], v[14]); - u[15] = _mm_add_epi32(v[14], v[15]); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = _mm_mullo_epi32(u[0], cospi32); @@ -967,10 +1135,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); - v[4] = _mm_add_epi32(u[4], u[5]); - v[5] = _mm_sub_epi32(u[4], u[5]); - v[6] = _mm_sub_epi32(u[7], u[6]); - v[7] = _mm_add_epi32(u[6], u[7]); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); @@ -981,10 +1147,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[15] = u[15]; // stage 5 - u[0] = _mm_add_epi32(v[0], v[3]); - u[1] = _mm_add_epi32(v[1], v[2]); - u[2] = 
_mm_sub_epi32(v[1], v[2]); - u[3] = _mm_sub_epi32(v[0], v[3]); + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; x = _mm_mullo_epi32(v[5], cospi32); @@ -998,24 +1162,16 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { u[6] = _mm_srai_epi32(u[6], bit); u[7] = v[7]; - u[8] = _mm_add_epi32(v[8], v[11]); - u[9] = _mm_add_epi32(v[9], v[10]); - u[10] = _mm_sub_epi32(v[9], v[10]); - u[11] = _mm_sub_epi32(v[8], v[11]); - u[12] = _mm_sub_epi32(v[15], v[12]); - u[13] = _mm_sub_epi32(v[14], v[13]); - u[14] = _mm_add_epi32(v[13], v[14]); - u[15] = _mm_add_epi32(v[12], v[15]); + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 - v[0] = _mm_add_epi32(u[0], u[7]); - v[1] = _mm_add_epi32(u[1], u[6]); - v[2] = _mm_add_epi32(u[2], u[5]); - v[3] = _mm_add_epi32(u[3], u[4]); - v[4] = _mm_sub_epi32(u[3], u[4]); - v[5] = _mm_sub_epi32(u[2], u[5]); - v[6] = _mm_sub_epi32(u[1], u[6]); - v[7] = _mm_sub_epi32(u[0], u[7]); + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = u[9]; @@ -1043,386 +1199,1141 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[15] = u[15]; // stage 7 - out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]); - out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]); - out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]); - out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]); - out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]); - out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]); - out[6 * 4 + col] = _mm_add_epi32(v[6], 
v[9]); - out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]); - out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]); - out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]); - out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]); - out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]); - out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]); - out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]); - out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]); - out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]); + if (do_cols) { + addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col, + out + 15 * 4 + col); + addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col, + out + 14 * 4 + col); + addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col, + out + 13 * 4 + col); + addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col, + out + 12 * 4 + col); + addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col, + out + 11 * 4 + col); + addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col, + out + 10 * 4 + col); + addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col); + addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col); + } else { + addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col, + &clamp_lo, &clamp_hi, out_shift); + } } } 
-static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospim50 = 
_mm_set1_epi32(-cospi[50]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i zero = _mm_setzero_si128(); - + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], v[16], x, y; + const int col_num = 4; int col; - for (col = 0; col < 4; ++col) { + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { // stage 0 // stage 1 - u[0] = in[0 * 4 + col]; - u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); - u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); - u[3] = in[8 * 4 + col]; - u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); - u[5] = in[12 * 4 + col]; - u[6] = in[4 * 4 + col]; - u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); - u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); - u[9] = in[14 * 4 + col]; - u[10] = in[6 * 4 + col]; - u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); - u[12] = in[2 * 4 + col]; - u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); - u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]); - u[15] = in[10 * 4 + col]; - // stage 2 - v[0] = u[0]; - v[1] = u[1]; + v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); + x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); + v[0] = _mm_add_epi32(v[0], x); + 
v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); - x = _mm_mullo_epi32(u[2], cospi32); - y = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(x, y); + v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); + x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); + x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); + v[2] = _mm_add_epi32(v[2], x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); - v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); + x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); + v[3] = _mm_sub_epi32(v[3], x); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; - v[5] = u[5]; - - x = _mm_mullo_epi32(u[6], cospi32); - y = _mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(x, y); + v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); + x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); + x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); + x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); + v[6] = _mm_add_epi32(v[6], x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); - v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); + x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); + v[7] = _mm_sub_epi32(v[7], x); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = u[8]; - v[9] = u[9]; - - x = _mm_mullo_epi32(u[10], cospi32); - y = 
_mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(x, y); + v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34); + x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); + x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); + x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); + v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); - v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); + x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); + v[11] = _mm_sub_epi32(v[11], x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = u[12]; - v[13] = u[13]; + v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); + x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); - x = _mm_mullo_epi32(u[14], cospi32); - y = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(x, y); + v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); + x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); + x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); + v[14] = _mm_add_epi32(v[14], x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); - v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); + x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); + v[15] = _mm_sub_epi32(v[15], x); v[15] = 
_mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 - u[0] = _mm_add_epi32(v[0], v[2]); - u[1] = _mm_add_epi32(v[1], v[3]); - u[2] = _mm_sub_epi32(v[0], v[2]); - u[3] = _mm_sub_epi32(v[1], v[3]); - u[4] = _mm_add_epi32(v[4], v[6]); - u[5] = _mm_add_epi32(v[5], v[7]); - u[6] = _mm_sub_epi32(v[4], v[6]); - u[7] = _mm_sub_epi32(v[5], v[7]); - u[8] = _mm_add_epi32(v[8], v[10]); - u[9] = _mm_add_epi32(v[9], v[11]); - u[10] = _mm_sub_epi32(v[8], v[10]); - u[11] = _mm_sub_epi32(v[9], v[11]); - u[12] = _mm_add_epi32(v[12], v[14]); - u[13] = _mm_add_epi32(v[13], v[15]); - u[14] = _mm_sub_epi32(v[12], v[14]); - u[15] = _mm_sub_epi32(v[13], v[15]); + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); - v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); - v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); - v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + 
+ v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); // stage 5 - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], 
v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); - v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); - v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); - v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); - v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + 
v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); // stage 7 - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], 
v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 - v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); - v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); - v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); - v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); - v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); - v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); - v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); - v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); - v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); - v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); - v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); - v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], 
rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); // stage 9 - out[0 * 4 + col] = v[1]; - out[1 * 4 + col] = v[14]; - out[2 * 4 + col] = v[3]; - out[3 * 4 + col] = v[12]; - out[4 * 4 + col] = v[5]; - out[5 * 4 + col] = v[10]; - out[6 * 4 + col] = v[7]; - out[7 * 4 + col] = v[8]; - out[8 * 4 + col] = v[9]; - out[9 * 4 + col] = v[6]; - out[10 * 4 + col] = v[11]; - out[11 * 4 + col] = v[4]; - out[12 * 4 + col] = v[13]; - out[13 * 4 + col] = v[2]; - out[14 * 4 + col] = v[15]; - out[15 * 4 + col] = v[0]; + if (do_cols) { + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); + out[2 * col_num + col] = v[12]; + out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); + out[4 * col_num + col] = v[6]; + out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); + out[6 * col_num + col] = v[10]; + out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); + out[8 * col_num + col] = v[3]; + out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); + out[10 * col_num + col] = v[15]; + out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); + out[12 * col_num + col] = v[5]; + out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); + out[14 * col_num + col] = v[9]; + out[15 * col_num + col] = 
_mm_sub_epi32(_mm_setzero_si128(), v[1]); + } else { + neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col, + out + 1 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col, + out + 3 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col, + out + 5 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col, + out + 7 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col, + out + 9 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col, + out + 11 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col, + out + 13 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col, + out + 15 * col_num + col, &clamp_lo, &clamp_hi, + out_shift); + } } } -static void round_shift_16x16(__m128i *in, int shift) { - round_shift_8x8(&in[0], shift); - round_shift_8x8(&in[16], shift); - round_shift_8x8(&in[32], shift); - round_shift_8x8(&in[48], shift); -} - void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = inv_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); switch (tx_type) { case DCT_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_16; - col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, 
col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); + 
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &inv_txfm_1d_row_cfg_dct_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + 
write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: - row_cfg = &inv_txfm_1d_row_cfg_adst_16; - col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); - round_shift_16x16(in, -row_cfg->shift[0]); + iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); - write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); break; -#endif default: assert(0); } } + +static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) { + int i, j; + + __m128i zero = _mm_setzero_si128(); + + for (i = 0; i < 32; ++i) { + for (j = 0; j < 8; ++j) { + in[16 * i + j] = + _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j)); + in[16 * i + j + 8] = zero; + } + } + + for (i = 0; i < 512; ++i) in[512 + i] = zero; +} + +static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) { + int i, j; + for (i = 0; i < (do_cols ? 
16 : 8); ++i) { + for (j = 0; j < 8; ++j) { + TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j], + in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j], + out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i], + out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]); + } + } +} + +static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16, + int col) { + int i; + for (i = 0; i < 16 * 16 / 4; i += 4) { + in16x16[i] = in[col]; + in16x16[i + 1] = in[col + 1]; + in16x16[i + 2] = in[col + 2]; + in16x16[i + 3] = in[col + 3]; + col += 8; + } +} + +static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i in16x16[16 * 16 / 4]; + uint16_t *leftUp = &output[0]; + uint16_t *rightUp = &output[16]; + uint16_t *leftDown = &output[16 * stride]; + uint16_t *rightDown = &output[16 * stride + 16]; + + if (fliplr) { + swap_addr(&leftUp, &rightUp); + swap_addr(&leftDown, &rightDown); + } + + if (flipud) { + swap_addr(&leftUp, &leftDown); + swap_addr(&rightUp, &rightDown); + } + + // Left-up quarter + assign_16x16_input_from_32x32(in, in16x16, 0); + write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd); + + // Right-up quarter + assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4); + write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd); + + // Left-down quarter + assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4); + write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd); + + // Right-down quarter + assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4); + write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd); +} + +static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32, + int col) { + int i; + for (i = 0; i < 32 * 32 / 4; i += 8) { + in32x32[i] = in[col]; + in32x32[i + 1] = in[col + 1]; + in32x32[i + 2] = in[col + 2]; + in32x32[i + 3] = in[col + 3]; + in32x32[i 
+ 4] = in[col + 4]; + in32x32[i + 5] = in[col + 5]; + in32x32[i + 6] = in[col + 6]; + in32x32[i + 7] = in[col + 7]; + col += 16; + } +} + +static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i in32x32[32 * 32 / 4]; + uint16_t *leftUp = &output[0]; + uint16_t *rightUp = &output[32]; + uint16_t *leftDown = &output[32 * stride]; + uint16_t *rightDown = &output[32 * stride + 32]; + + if (fliplr) { + swap_addr(&leftUp, &rightUp); + swap_addr(&leftDown, &rightDown); + } + + if (flipud) { + swap_addr(&leftUp, &leftDown); + swap_addr(&rightUp, &rightDown); + } + + // Left-up quarter + assign_32x32_input_from_64x64(in, in32x32, 0); + write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd); + + // Right-up quarter + assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4); + write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd); + + // Left-down quarter + assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4); + write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd); + + // Right-down quarter + assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4); + write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd); +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + int col; + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + 
const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); 
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1 * 16 + col]; + u[34] = in[17 * 16 + col]; + u[36] = in[9 * 16 + col]; + u[38] = in[25 * 16 + col]; + u[40] = in[5 * 16 + col]; + u[42] = in[21 * 16 + col]; + u[44] = in[13 * 16 + col]; + u[46] = in[29 * 16 + col]; + u[48] = in[3 * 16 + col]; + u[50] = in[19 * 16 + col]; + u[52] = in[11 * 16 + col]; + u[54] = in[27 * 16 + col]; + u[56] = in[7 * 16 + col]; + u[58] = in[23 * 16 + col]; + u[60] = in[15 * 16 + col]; + u[62] = in[31 * 16 + col]; + + v[16] = in[2 * 16 + col]; + v[18] = in[18 * 16 + col]; + v[20] = in[10 * 16 + col]; + v[22] = in[26 * 16 + col]; + v[24] = in[6 * 16 + col]; + v[26] = in[22 * 16 + col]; + v[28] = in[14 * 16 + col]; + v[30] = in[30 * 16 + col]; + + u[8] = in[4 * 16 + col]; + u[10] = in[20 * 16 + col]; + u[12] = in[12 * 16 + col]; + u[14] = in[28 * 16 + col]; + + v[4] = in[8 * 16 + col]; + v[6] = in[24 * 16 + col]; + + u[0] = in[0 * 16 + col]; + u[2] = in[16 * 16 + col]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = 
half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_sse4_1(&cospi22, 
&v[26], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], 
&cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], 
&clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, 
&u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + 
for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = 
half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = 
half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + if (do_cols) { + for (i = 0; i < 32; i++) { + addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], + &out[16 * (63 - i) + col]); + } + } else { + for (i = 0; i < 32; i++) { + addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], + &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi, + out_shift); + } + } + } +} + +void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64 * 64 / 4], out[64 * 64 / 4]; + const int8_t *shift = inv_txfm_shift_ls[TX_64X64]; + const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0]; + const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_64x64_lower_32x32(coeff, in); + transpose_64x64(in, out, 0); + idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_64x64(in, out, 1); + idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd); + break; + + default: + av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd); + break; + } +} diff --git 
a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c new file mode 100644 index 000000000..89d0ecb1e --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -0,0 +1,853 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_jnt_convolve_2d_copy_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = 
_mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const __m256i offset_const_16b = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits <= 4); + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = + _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + if (do_average) { + const __m256i data_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = highbd_comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = highbd_comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i 
res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i res = _mm256_sll_epi16(src_10, left_shift); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b, offset_const); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i 
*)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = highbd_comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = highbd_comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + 
const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } + } +} + +void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, + uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = 
_mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const 
__m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = highbd_comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = highbd_comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); 
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, + uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_q4; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, 
&rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = highbd_comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = highbd_comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, 
round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, + uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_x; + (void)subpel_x_q4; + + assert(bits >= 0); + int i, j; + __m256i s[8], coeffs_y[4]; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i round_const_y = + _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - 
conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = 
_mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); + + __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + 
_mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); + res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = highbd_comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = highbd_comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = 
_mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c new file mode 100644 index 000000000..ccca6b07a --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" + +void av1_highbd_jnt_convolve_y_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_x; + (void)subpel_x_q4; + + assert(bits >= 0); + int i, j; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + __m128i s[16], coeffs_y[4]; + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits); + res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y), + round_shift_y); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); + res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), + round_shift_y); + + 
__m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); + __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1( + &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_0, round_result_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_1, round_result_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + + } else { + __m128i res_16b_0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); + + __m128i res_16b_1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16b_1); + } + } else { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); + res_b_round0 = _mm_sra_epi32( + 
_mm_add_epi32(res_b_round0, round_const_y), round_shift_y); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); + + __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); + __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); + + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); + const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); + + const __m128i comp_avg_res_lo_0 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, + &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo_1 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, + &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi_0 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, + &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi_1 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, + &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_lo_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_lo_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, + &rounding_const, rounding_shift); + + const 
__m128i res_16b_0 = + _mm_packus_epi32(round_result_lo_0, round_result_hi_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_lo_1, round_result_hi_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + } else { + __m128i res_16bit0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); + __m128i res_16bit1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_16bit1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_jnt_convolve_x_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_q4; + + int i, j; + __m128i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const 
__m128i zero = _mm_setzero_si128(); + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sll_epi32(res_even, round_shift_bits); + res_odd = _mm_sll_epi32(res_odd, round_shift_bits); + + __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); + if (w - j < 8) { + if (do_average) { + 
const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); + } + } else { + __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * 
dst_stride + j]), res_16b); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h index fb246674a..b29bd1d79 100644 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -90,4 +90,14 @@ static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, return x; } +static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *rounding, int bit) { + __m128i x; + + x = _mm_mullo_epi32(*w0, *n0); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + #endif // _HIGHBD_TXFM_UTILITY_SSE4_H diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c new file mode 100644 index 000000000..a08beaafd --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +static const uint8_t warp_highbd_arrange_bytes[16] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +}; + +static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, + int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + + // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Calculate filtered results + const __m128i res_0 = _mm_madd_epi16(src, coeff_0); + const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2,
src, 4), coeff_2); + const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); + const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Filter odd-index pixels + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); + const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); + const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); + const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); + + __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Combine results into one register. 
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. + tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); +} + +void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int reduce_bits_horiz = + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(!(bd == 12 && reduce_bits_horiz < 5)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + const __m128i src_01 = _mm_shuffle_epi8( + src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + const __m128i src2_01 = _mm_shuffle_epi8( + src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + + __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); + __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); + src_hi = 
_mm_shuffle_epi8(src_hi, shuffle_reg_right); + } + + const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); + const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); + + horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = 
_mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = 
_mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); + + if (conv_params->use_jnt_comp_avg) { + res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(res_lo, wt1)); + res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); + } else { + res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); + } + + __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); + res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), + round_bits_shift); + + __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); + res16_lo = _mm_min_epi16(res16_lo, clip_pixel); + _mm_storel_epi64(dst16, res16_lo); + } else { + res_lo = _mm_packus_epi32(res_lo, res_lo); + _mm_storel_epi64(p, res_lo); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); + + if (conv_params->use_jnt_comp_avg) { + res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), + 
_mm_mullo_epi32(res_hi, wt1)); + res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); + } else { + res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); + } + + __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); + res32_hi = _mm_sra_epi32( + _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); + __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); + res16_hi = _mm_min_epi16(res16_hi, clip_pixel); + _mm_storel_epi64(dst16_4, res16_hi); + } else { + res_hi = _mm_packus_epi32(res_hi, res_hi); + _mm_storel_epi64(p4, res_hi); + } + } + } else { + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + _mm_storel_epi64(p, res_16bit); + } else { + _mm_storeu_si128(p, res_16bit); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c deleted file mode 100644 index 71b0ec7a3..000000000 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./av1_rtcd.h" -#include "av1/common/warped_motion.h" - -void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, - int width, int height, int stride, - uint16_t *pred, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int bd, - ConvolveParams *conv_params, int16_t alpha, - int16_t beta, int16_t gamma, int16_t delta) { - int comp_avg = conv_params->do_average; -#if HORSHEAR_REDUCE_PREC_BITS >= 5 - __m128i tmp[15]; -#else -#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" -#endif - int i, j, k; -#if CONFIG_CONVOLVE_ROUND - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = - use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = - use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; - if (use_conv_params) { - conv_params->do_post_rounding = 1; - } - assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif - - /* Note: For this code to work, the left/right frame borders need to be - extended by at least 13 pixels each. By the time we get here, other - code will have set up this border, but we allow an explicit check - for debugging purposes. 
- */ - /*for (i = 0; i < height; ++i) { - for (j = 0; j < 13; ++j) { - assert(ref[i * stride - 13 + j] == ref[i * stride]); - assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); - } - }*/ - - for (i = 0; i < p_height; i += 8) { - for (j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; - const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; - const int32_t x4 = dst_x >> subsampling_x; - const int32_t y4 = dst_y >> subsampling_y; - - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - // Add in all the constant terms, including rounding and offset - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Horizontal filter - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. 
- if (ix4 <= -7) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else if (ix4 >= width + 6) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); - - // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - const 
__m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i round_const = _mm_set1_epi32( - (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); - - // Calculate filtered results - const __m128i res_0 = _mm_madd_epi16(src, coeff_0); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - // Filter odd-index pixels - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = 
_mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - // Combine results into one register. - // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 - // as this order helps with the vertical filter. - tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); - } - } - - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const 
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i 
res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - -#if CONFIG_CONVOLVE_ROUND - if (use_conv_params) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - const __m128i round_const = _mm_set1_epi32( - -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + - ((1 << (conv_params->round_1)) >> 1)); - res_lo = _mm_add_epi32(res_lo, round_const); - res_lo = - _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); - _mm_storeu_si128(p, res_lo); - if (p_width > 4) { - res_hi = _mm_add_epi32(res_hi, round_const); - res_hi = - _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) - res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); - _mm_storeu_si128(p + 1, res_hi); - } - } else { -#else - { -#endif - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - // Clamp res_16bit to the range [0, 2^bd - 1] - const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); - const __m128i zero = _mm_setzero_si128(); - res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); - - // Store, 
blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - if (comp_avg) - res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_16bit); - } else { - if (comp_avg) - res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); - _mm_storeu_si128(p, res_16bit); - } - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c new file mode 100644 index 000000000..0c8a8505b --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be +// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. 
+void av1_highbd_wiener_convolve_add_src_avx2( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + + /* Horizontal filter */ + { + const __m256i clamp_high_ep = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i 
coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *src_ij = src_ptr + i * src_stride + j; + + // Load 16-bit src data + const __m256i src_0 = yy_loadu_256(src_ij + 0); + const __m256i src_1 = yy_loadu_256(src_ij + 1); + const __m256i src_2 = yy_loadu_256(src_ij + 2); + const __m256i src_3 = yy_loadu_256(src_ij + 3); + const __m256i src_4 = yy_loadu_256(src_ij + 4); + const __m256i src_5 = yy_loadu_256(src_ij + 5); + const __m256i src_6 = yy_loadu_256(src_ij + 6); + const __m256i src_7 = yy_loadu_256(src_ij + 7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = 
_mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ 
f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), 
_mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_16bit_clamped = _mm256_min_epi16( + _mm256_max_epi16(res_16bit, clamp_low), clamp_high); + + // Store in the dst array + yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c new file mode 100644 index 000000000..818b1099c --- /dev/null +++ 
b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_highbd_wiener_convolve_add_src_ssse3( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), 
coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = + _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = 
_mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, res_16bit); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c deleted file mode 100644 index c440d0f88..000000000 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include // avx2 - -#include "./aom_config.h" -#include "./av1_rtcd.h" - -#include "aom_dsp/x86/inv_txfm_common_avx2.h" - -void av1_idct16_avx2(__m256i *in) { - const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64); - const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64); - const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64); - const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64); - const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64); - const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64); - const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64); - const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64); - const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i v0, v1, v2, v3, v4, v5, v6, v7; - __m256i t0, t1, t2, t3, t4, t5, t6, t7; - - // stage 1, (0-7) - u0 = in[0]; - u1 = in[8]; - u2 = in[4]; - u3 = in[12]; - u4 = in[2]; - u5 = in[10]; - u6 = in[6]; - u7 = in[14]; - - // stage 2, (0-7) - // stage 3, (0-7) - t0 = u0; - t1 = u1; - t2 = u2; - t3 = u3; - unpack_butter_fly(&u4, 
&u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7); - unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6); - - // stage 4, (0-7) - unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1); - unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3); - u4 = _mm256_add_epi16(t4, t5); - u5 = _mm256_sub_epi16(t4, t5); - u6 = _mm256_sub_epi16(t7, t6); - u7 = _mm256_add_epi16(t7, t6); - - // stage 5, (0-7) - t0 = _mm256_add_epi16(u0, u3); - t1 = _mm256_add_epi16(u1, u2); - t2 = _mm256_sub_epi16(u1, u2); - t3 = _mm256_sub_epi16(u0, u3); - t4 = u4; - t7 = u7; - unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); - - // stage 6, (0-7) - u0 = _mm256_add_epi16(t0, t7); - u1 = _mm256_add_epi16(t1, t6); - u2 = _mm256_add_epi16(t2, t5); - u3 = _mm256_add_epi16(t3, t4); - u4 = _mm256_sub_epi16(t3, t4); - u5 = _mm256_sub_epi16(t2, t5); - u6 = _mm256_sub_epi16(t1, t6); - u7 = _mm256_sub_epi16(t0, t7); - - // stage 1, (8-15) - v0 = in[1]; - v1 = in[9]; - v2 = in[5]; - v3 = in[13]; - v4 = in[3]; - v5 = in[11]; - v6 = in[7]; - v7 = in[15]; - - // stage 2, (8-15) - unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7); - unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6); - unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5); - unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4); - - // stage 3, (8-15) - v0 = _mm256_add_epi16(t0, t1); - v1 = _mm256_sub_epi16(t0, t1); - v2 = _mm256_sub_epi16(t3, t2); - v3 = _mm256_add_epi16(t2, t3); - v4 = _mm256_add_epi16(t4, t5); - v5 = _mm256_sub_epi16(t4, t5); - v6 = _mm256_sub_epi16(t7, t6); - v7 = _mm256_add_epi16(t6, t7); - - // stage 4, (8-15) - t0 = v0; - t7 = v7; - t3 = v3; - t4 = v4; - unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); - unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); - - // stage 5, (8-15) - v0 = _mm256_add_epi16(t0, t3); - v1 = _mm256_add_epi16(t1, t2); - 
v2 = _mm256_sub_epi16(t1, t2); - v3 = _mm256_sub_epi16(t0, t3); - v4 = _mm256_sub_epi16(t7, t4); - v5 = _mm256_sub_epi16(t6, t5); - v6 = _mm256_add_epi16(t6, t5); - v7 = _mm256_add_epi16(t7, t4); - - // stage 6, (8-15) - t0 = v0; - t1 = v1; - t6 = v6; - t7 = v7; - unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5); - unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4); - - // stage 7 - in[0] = _mm256_add_epi16(u0, t7); - in[1] = _mm256_add_epi16(u1, t6); - in[2] = _mm256_add_epi16(u2, t5); - in[3] = _mm256_add_epi16(u3, t4); - in[4] = _mm256_add_epi16(u4, t3); - in[5] = _mm256_add_epi16(u5, t2); - in[6] = _mm256_add_epi16(u6, t1); - in[7] = _mm256_add_epi16(u7, t0); - in[8] = _mm256_sub_epi16(u7, t0); - in[9] = _mm256_sub_epi16(u6, t1); - in[10] = _mm256_sub_epi16(u5, t2); - in[11] = _mm256_sub_epi16(u4, t3); - in[12] = _mm256_sub_epi16(u3, t4); - in[13] = _mm256_sub_epi16(u2, t5); - in[14] = _mm256_sub_epi16(u1, t6); - in[15] = _mm256_sub_epi16(u0, t7); -} - -static void idct16(__m256i *in) { - mm256_transpose_16x16(in, in); - av1_idct16_avx2(in); -} - -static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, - const __m256i *c0, const __m256i *c1, - __m256i *b) { - __m256i x0, x1; - x0 = _mm256_unpacklo_epi16(*a0, *a1); - x1 = _mm256_unpackhi_epi16(*a0, *a1); - b[0] = _mm256_madd_epi16(x0, *c0); - b[1] = _mm256_madd_epi16(x1, *c0); - b[2] = _mm256_madd_epi16(x0, *c1); - b[3] = _mm256_madd_epi16(x1, *c1); -} - -static INLINE void group_rounding(__m256i *a, int num) { - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - int i; - for (i = 0; i < num; ++i) { - a[i] = _mm256_add_epi32(a[i], dct_rounding); - a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS); - } -} - -static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) { - __m256i x[4]; - x[0] = _mm256_add_epi32(a[0], b[0]); - x[1] = _mm256_add_epi32(a[1], b[1]); - x[2] = _mm256_add_epi32(a[2], b[2]); - x[3] = 
_mm256_add_epi32(a[3], b[3]); - - group_rounding(x, 4); - - out[0] = _mm256_packs_epi32(x[0], x[1]); - out[1] = _mm256_packs_epi32(x[2], x[3]); -} - -static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) { - __m256i x[4]; - x[0] = _mm256_sub_epi32(a[0], b[0]); - x[1] = _mm256_sub_epi32(a[1], b[1]); - x[2] = _mm256_sub_epi32(a[2], b[2]); - x[3] = _mm256_sub_epi32(a[3], b[3]); - - group_rounding(x, 4); - - out[0] = _mm256_packs_epi32(x[0], x[1]); - out[1] = _mm256_packs_epi32(x[2], x[3]); -} - -static INLINE void butterfly_rnd(__m256i *a, __m256i *out) { - group_rounding(a, 4); - out[0] = _mm256_packs_epi32(a[0], a[1]); - out[1] = _mm256_packs_epi32(a[2], a[3]); -} - -static void iadst16_avx2(__m256i *in) { - const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); - const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i cospi_p04_p28 = 
pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); - const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); - const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); - const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); - const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); - const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i zero = _mm256_setzero_si256(); - __m256i x[16], s[16]; - __m256i u[4], v[4]; - - // stage 1 - butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u); - butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v); - add_rnd(u, v, &x[0]); - sub_rnd(u, v, &x[8]); - - butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u); - butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v); - add_rnd(u, v, &x[2]); - sub_rnd(u, v, &x[10]); - - butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u); - butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v); - add_rnd(u, v, &x[4]); - sub_rnd(u, v, &x[12]); - - butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u); - butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v); - add_rnd(u, v, &x[6]); - sub_rnd(u, v, &x[14]); - - // stage 2 - s[0] = _mm256_add_epi16(x[0], x[4]); - s[1] = _mm256_add_epi16(x[1], x[5]); - s[2] = _mm256_add_epi16(x[2], x[6]); - s[3] = _mm256_add_epi16(x[3], x[7]); - 
s[4] = _mm256_sub_epi16(x[0], x[4]); - s[5] = _mm256_sub_epi16(x[1], x[5]); - s[6] = _mm256_sub_epi16(x[2], x[6]); - s[7] = _mm256_sub_epi16(x[3], x[7]); - butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u); - butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v); - add_rnd(u, v, &s[8]); - sub_rnd(u, v, &s[12]); - - butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u); - butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v); - add_rnd(u, v, &s[10]); - sub_rnd(u, v, &s[14]); - - // stage 3 - x[0] = _mm256_add_epi16(s[0], s[2]); - x[1] = _mm256_add_epi16(s[1], s[3]); - x[2] = _mm256_sub_epi16(s[0], s[2]); - x[3] = _mm256_sub_epi16(s[1], s[3]); - - x[8] = _mm256_add_epi16(s[8], s[10]); - x[9] = _mm256_add_epi16(s[9], s[11]); - x[10] = _mm256_sub_epi16(s[8], s[10]); - x[11] = _mm256_sub_epi16(s[9], s[11]); - - butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u); - butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v); - add_rnd(u, v, &x[4]); - sub_rnd(u, v, &x[6]); - - butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u); - butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v); - add_rnd(u, v, &x[12]); - sub_rnd(u, v, &x[14]); - - // stage 4 - butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u); - butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v); - butterfly_rnd(u, &x[2]); - butterfly_rnd(v, &x[6]); - - butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u); - butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v); - butterfly_rnd(u, &x[10]); - butterfly_rnd(v, &x[14]); - - in[0] = x[0]; - in[1] = _mm256_sub_epi16(zero, x[8]); - in[2] = x[12]; - in[3] = _mm256_sub_epi16(zero, x[4]); - in[4] = x[6]; - in[5] = x[14]; - in[6] = x[10]; - in[7] = x[2]; - in[8] = x[3]; - in[9] = x[11]; - in[10] = x[15]; - in[11] = x[7]; - in[12] = x[5]; - in[13] = _mm256_sub_epi16(zero, x[13]); - in[14] = x[9]; - in[15] = _mm256_sub_epi16(zero, x[1]); -} - 
-static void iadst16(__m256i *in) { - mm256_transpose_16x16(in, in); - iadst16_avx2(in); -} - -#if CONFIG_EXT_TX -static void flip_row(__m256i *in, int rows) { - int i; - for (i = 0; i < rows; ++i) { - mm256_reverse_epi16(&in[i]); - } -} - -static void flip_col(uint8_t **dest, int *stride, int rows) { - *dest = *dest + (rows - 1) * (*stride); - *stride = -*stride; -} - -static void iidtx16(__m256i *in) { - mm256_transpose_16x16(in, in); - txfm_scaling16_avx2((int16_t)Sqrt2, in); -} -#endif - -void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m256i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - - load_buffer_16x16(input, in); - switch (tx_type) { - case DCT_DCT: - idct16(in); - idct16(in); - break; - case ADST_DCT: - idct16(in); - iadst16(in); - break; - case DCT_ADST: - iadst16(in); - idct16(in); - break; - case ADST_ADST: - iadst16(in); - iadst16(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - idct16(in); - iadst16(in); - flip_col(&dest, &stride, 16); - break; - case DCT_FLIPADST: - iadst16(in); - idct16(in); - flip_row(in, 16); - break; - case FLIPADST_FLIPADST: - iadst16(in); - iadst16(in); - flip_row(in, 16); - flip_col(&dest, &stride, 16); - break; - case ADST_FLIPADST: - iadst16(in); - iadst16(in); - flip_row(in, 16); - break; - case FLIPADST_ADST: - iadst16(in); - iadst16(in); - flip_col(&dest, &stride, 16); - break; - case IDTX: - iidtx16(in); - iidtx16(in); - break; - case V_DCT: - iidtx16(in); - idct16(in); - break; - case H_DCT: - idct16(in); - iidtx16(in); - break; - case V_ADST: - iidtx16(in); - iadst16(in); - break; - case H_ADST: - iadst16(in); - iidtx16(in); - break; - case V_FLIPADST: - iidtx16(in); - iadst16(in); - flip_col(&dest, &stride, 16); - break; - case H_FLIPADST: - iadst16(in); - iidtx16(in); - flip_row(in, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - store_buffer_16xN(in, stride, dest, 16); -} diff --git 
a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c deleted file mode 100644 index 541165c8d..000000000 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ /dev/null @@ -1,1411 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./av1_rtcd.h" -#include "aom_dsp/x86/inv_txfm_sse2.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" -#include "av1/common/enums.h" - -#if CONFIG_EXT_TX -static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[0] = _mm_shufflehi_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[1] = _mm_shufflehi_epi16(in[1], 0x1b); -} - -static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - - in[4] = mm_reverse_epi16(in[4]); - in[5] = mm_reverse_epi16(in[5]); - in[6] = mm_reverse_epi16(in[6]); - in[7] = mm_reverse_epi16(in[7]); -} - -static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) { - fliplr_8x8(&in[0]); - fliplr_8x8(&in[8]); -} - -#define FLIPLR_16x16(in0, in1) \ - do { \ - __m128i *tmp; \ - fliplr_16x8(in0); \ - fliplr_16x8(in1); \ - tmp = (in0); \ - (in0) = (in1); \ - (in1) = tmp; \ - } while (0) - -#define FLIPUD_PTR(dest, stride, size) \ - do { \ - (dest) = (dest) + ((size)-1) * (stride); \ - (stride) = -(stride); \ - } 
while (0) -#endif - -void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const TX_TYPE tx_type = txfm_param->tx_type; - - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); - - switch (tx_type) { - case DCT_DCT: - aom_idct4_sse2(in); - aom_idct4_sse2(in); - break; - case ADST_DCT: - aom_idct4_sse2(in); - aom_iadst4_sse2(in); - break; - case DCT_ADST: - aom_iadst4_sse2(in); - aom_idct4_sse2(in); - break; - case ADST_ADST: - aom_iadst4_sse2(in); - aom_iadst4_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - aom_idct4_sse2(in); - aom_iadst4_sse2(in); - FLIPUD_PTR(dest, stride, 4); - break; - case DCT_FLIPADST: - aom_iadst4_sse2(in); - aom_idct4_sse2(in); - fliplr_4x4(in); - break; - case FLIPADST_FLIPADST: - aom_iadst4_sse2(in); - aom_iadst4_sse2(in); - FLIPUD_PTR(dest, stride, 4); - fliplr_4x4(in); - break; - case ADST_FLIPADST: - aom_iadst4_sse2(in); - aom_iadst4_sse2(in); - fliplr_4x4(in); - break; - case FLIPADST_ADST: - aom_iadst4_sse2(in); - aom_iadst4_sse2(in); - FLIPUD_PTR(dest, stride, 4); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - - // Final round and shift - in[0] = _mm_add_epi16(in[0], eight); - in[1] = _mm_add_epi16(in[1], eight); - - in[0] = _mm_srai_epi16(in[0], 4); - in[1] = _mm_srai_epi16(in[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); - __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); - d0 = _mm_unpacklo_epi32(d0, d1); - d2 = _mm_unpacklo_epi32(d2, d3); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = 
_mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } -} - -void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const TX_TYPE tx_type = txfm_param->tx_type; - - // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); - - switch (tx_type) { - case DCT_DCT: - aom_idct8_sse2(in); - aom_idct8_sse2(in); - break; - case ADST_DCT: - aom_idct8_sse2(in); - aom_iadst8_sse2(in); - break; - case DCT_ADST: - aom_iadst8_sse2(in); - aom_idct8_sse2(in); - break; - case ADST_ADST: - aom_iadst8_sse2(in); - aom_iadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - aom_idct8_sse2(in); - aom_iadst8_sse2(in); - FLIPUD_PTR(dest, stride, 8); - break; - case DCT_FLIPADST: - aom_iadst8_sse2(in); - aom_idct8_sse2(in); - fliplr_8x8(in); - break; - case FLIPADST_FLIPADST: - aom_iadst8_sse2(in); - aom_iadst8_sse2(in); - FLIPUD_PTR(dest, stride, 8); - fliplr_8x8(in); - break; - case ADST_FLIPADST: - aom_iadst8_sse2(in); - aom_iadst8_sse2(in); - fliplr_8x8(in); - break; - case FLIPADST_ADST: - aom_iadst8_sse2(in); - aom_iadst8_sse2(in); - FLIPUD_PTR(dest, stride, 8); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - - // Final rounding and shift - in[0] = 
_mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 5); - in[1] = _mm_srai_epi16(in[1], 5); - in[2] = _mm_srai_epi16(in[2], 5); - in[3] = _mm_srai_epi16(in[3], 5); - in[4] = _mm_srai_epi16(in[4], 5); - in[5] = _mm_srai_epi16(in[5], 5); - in[6] = _mm_srai_epi16(in[6], 5); - in[7] = _mm_srai_epi16(in[7], 5); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); -} - -#if CONFIG_EXT_TX -static void iidtx16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idtx16_8col(in0); - idtx16_8col(in1); -} -#endif // CONFIG_EXT_TX - -void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m128i in[32]; - __m128i *in0 = &in[0]; - __m128i *in1 = &in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - - load_buffer_8x16(input, in0); - input += 8; - load_buffer_8x16(input, in1); - - switch (tx_type) { - case DCT_DCT: - aom_idct16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; - case DCT_ADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - 
FLIPUD_PTR(dest, stride, 16); - break; - case DCT_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - FLIPLR_16x16(in0, in1); - break; - case ADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; - case IDTX: - iidtx16_sse2(in0, in1); - iidtx16_sse2(in0, in1); - break; - case V_DCT: - iidtx16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case H_DCT: - aom_idct16_sse2(in0, in1); - iidtx16_sse2(in0, in1); - break; - case V_ADST: - iidtx16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; - case H_ADST: - aom_iadst16_sse2(in0, in1); - iidtx16_sse2(in0, in1); - break; - case V_FLIPADST: - iidtx16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; - case H_FLIPADST: - aom_iadst16_sse2(in0, in1); - iidtx16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - - write_buffer_8x16(dest, in0, stride); - dest += 8; - write_buffer_8x16(dest, in1, stride); -} - -#if CONFIG_EXT_TX -static void iidtx8_sse2(__m128i *in) { - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); -} - -static INLINE void iidtx4_sse2(__m128i *in) { - const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); - - const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); - const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); - const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); - const __m128i v_p1h_w = 
_mm_mulhi_epi16(in[1], v_scale_w); - - const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); - - in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); - in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); -} - -// load 8x8 array -static INLINE void flip_buffer_lr_8x8(__m128i *in) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - in[4] = mm_reverse_epi16(in[4]); - in[5] = mm_reverse_epi16(in[5]); - in[6] = mm_reverse_epi16(in[6]); - in[7] = mm_reverse_epi16(in[7]); -} -#endif // CONFIG_EXT_TX - -void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - - in[0] = load_input_data(input + 0 * 8); - in[1] = load_input_data(input + 1 * 8); - in[2] = load_input_data(input + 2 * 8); - in[3] = load_input_data(input + 3 * 8); - in[4] = load_input_data(input + 4 * 8); - in[5] = load_input_data(input + 5 * 8); - in[6] = load_input_data(input + 6 * 8); - in[7] = load_input_data(input + 7 * 8); - - in[8] = load_input_data(input + 8 * 8); - in[9] = load_input_data(input + 9 * 8); - in[10] = load_input_data(input + 10 * 8); - in[11] = load_input_data(input + 11 * 8); - in[12] = load_input_data(input + 12 * 8); - in[13] = load_input_data(input + 13 * 8); - in[14] = load_input_data(input + 14 * 8); - in[15] = load_input_data(input + 15 * 8); - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - aom_idct8_sse2(in); - array_transpose_8x8(in, 
in); - aom_idct8_sse2(in + 8); - array_transpose_8x8(in + 8, in + 8); - break; - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: -#endif - aom_iadst8_sse2(in); - array_transpose_8x8(in, in); - aom_iadst8_sse2(in + 8); - array_transpose_8x8(in + 8, in + 8); - break; -#if CONFIG_EXT_TX - case V_FLIPADST: - case V_ADST: - case V_DCT: - case IDTX: - iidtx8_sse2(in); - iidtx8_sse2(in + 8); - break; -#endif - default: assert(0); break; - } - scale_sqrt2_8x8(in); - scale_sqrt2_8x8(in + 8); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case V_DCT: -#endif - idct16_8col(in); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - iadst16_8col(in); - break; -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case H_FLIPADST: - case IDTX: idtx16_8col(in); break; -#endif - default: assert(0); break; - } - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case H_DCT: -#endif - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: -#endif - write_buffer_8x16(dest, in, stride); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - flip_buffer_lr_8x8(in); - flip_buffer_lr_8x8(in + 8); - write_buffer_8x16(dest, in, stride); - break; - case FLIPADST_FLIPADST: - flip_buffer_lr_8x8(in); - flip_buffer_lr_8x8(in + 8); - write_buffer_8x16(dest + stride * 15, in, -stride); - break; -#endif - default: assert(0); break; - } -} - -static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, - int stride) { - 
const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); -} - -void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - - // Transpose 16x8 input into in[] - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - array_transpose_8x8(in, in); - - in[8] = load_input_data(input + 8 + 0 * 16); - in[9] = load_input_data(input + 8 + 1 * 16); - in[10] = load_input_data(input + 8 + 2 * 16); - in[11] = load_input_data(input + 8 + 3 * 16); - in[12] = load_input_data(input + 
8 + 4 * 16); - in[13] = load_input_data(input + 8 + 5 * 16); - in[14] = load_input_data(input + 8 + 6 * 16); - in[15] = load_input_data(input + 8 + 7 * 16); - array_transpose_8x8(in + 8, in + 8); - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - idct16_8col(in); - break; - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: -#endif - iadst16_8col(in); - break; -#if CONFIG_EXT_TX - case V_FLIPADST: - case V_ADST: - case V_DCT: - case IDTX: idtx16_8col(in); break; -#endif - default: assert(0); break; - } - - // Scale - scale_sqrt2_8x8(in); - scale_sqrt2_8x8(in + 8); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case V_DCT: -#endif - aom_idct8_sse2(in); - aom_idct8_sse2(in + 8); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - aom_iadst8_sse2(in); - aom_iadst8_sse2(in + 8); - break; -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case H_FLIPADST: - case IDTX: - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - iidtx8_sse2(in); - iidtx8_sse2(in + 8); - break; -#endif - default: assert(0); break; - } - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: -#endif - write_buffer_8x8_round6(dest, in, stride); - write_buffer_8x8_round6(dest + 8, in + 8, stride); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: - write_buffer_8x8_round6(dest + stride * 7, in, -stride); - write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride); - break; - case 
DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - flip_buffer_lr_8x8(in); - flip_buffer_lr_8x8(in + 8); - write_buffer_8x8_round6(dest, in + 8, stride); - write_buffer_8x8_round6(dest + 8, in, stride); - break; - case FLIPADST_FLIPADST: - flip_buffer_lr_8x8(in); - flip_buffer_lr_8x8(in + 8); - write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride); - write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride); - break; -#endif - default: assert(0); break; - } -} - -static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, - int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 5); - in[1] = _mm_srai_epi16(in[1], 5); - in[2] = _mm_srai_epi16(in[2], 5); - in[3] = _mm_srai_epi16(in[3], 5); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); -} - -void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; - - in[0] = load_input_data(input + 0 * 8); - in[1] = load_input_data(input + 1 * 8); - in[2] = load_input_data(input + 2 * 8); - in[3] = load_input_data(input + 3 * 8); - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - aom_idct8_sse2(in); - break; - case DCT_ADST: - case ADST_ADST: aom_iadst8_sse2(in); break; -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: aom_iadst8_sse2(in); break; - case V_FLIPADST: 
- case V_ADST: - case V_DCT: - case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in); -#endif - break; - default: assert(0); break; - } - - scale_sqrt2_8x8(in); - - // Repack data. We pack into the bottom half of 'in' - // so that the next repacking stage can pack into the - // top half without overwriting anything - in[7] = _mm_unpacklo_epi64(in[6], in[7]); - in[6] = _mm_unpacklo_epi64(in[4], in[5]); - in[5] = _mm_unpacklo_epi64(in[2], in[3]); - in[4] = _mm_unpacklo_epi64(in[0], in[1]); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case V_DCT: -#endif - aom_idct4_sse2(in + 4); - aom_idct4_sse2(in + 6); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - aom_iadst4_sse2(in + 4); - aom_iadst4_sse2(in + 6); - break; -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case H_FLIPADST: - case IDTX: - iidtx4_sse2(in + 4); - array_transpose_4x4(in + 4); - iidtx4_sse2(in + 6); - array_transpose_4x4(in + 6); - break; -#endif - default: assert(0); break; - } - - // Repack data - in[0] = _mm_unpacklo_epi64(in[4], in[6]); - in[1] = _mm_unpackhi_epi64(in[4], in[6]); - in[2] = _mm_unpacklo_epi64(in[5], in[7]); - in[3] = _mm_unpackhi_epi64(in[5], in[7]); - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: break; - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - break; - case FLIPADST_FLIPADST: - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = 
mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - FLIPUD_PTR(dest, stride, 4); -#endif - break; - default: assert(0); break; - } - write_buffer_8x4_round5(dest, in, stride); -} - -static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, - int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 5); - in[1] = _mm_srai_epi16(in[1], 5); - in[2] = _mm_srai_epi16(in[2], 5); - in[3] = _mm_srai_epi16(in[3], 5); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); - __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); - __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4)); - __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5)); - __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6)); - __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7)); - - d0 = _mm_unpacklo_epi32(d0, d1); - d2 = _mm_unpacklo_epi32(d2, d3); - d4 = _mm_unpacklo_epi32(d4, d5); - d6 = _mm_unpacklo_epi32(d6, d7); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d4 = _mm_unpacklo_epi8(d4, zero); - d6 = _mm_unpacklo_epi8(d6, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d4 = _mm_add_epi16(d4, in[2]); - d6 = _mm_add_epi16(d6, in[3]); - - d0 = _mm_packus_epi16(d0, d2); - *(int *)dest = _mm_cvtsi128_si32(d0); - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 
- d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - d0 = _mm_packus_epi16(d4, d6); - *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0); - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0); - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0); - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0); - } -} - -void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; - - // Load rows, packed two per element of 'in'. - // We pack into the bottom half of 'in' so that the - // later repacking stage can pack into the - // top half without overwriting anything - in[4] = load_input_data(input + 0 * 8); - in[5] = load_input_data(input + 1 * 8); - in[6] = load_input_data(input + 2 * 8); - in[7] = load_input_data(input + 3 * 8); - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - aom_idct4_sse2(in + 4); - aom_idct4_sse2(in + 6); - break; - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: -#endif - aom_iadst4_sse2(in + 4); - aom_iadst4_sse2(in + 6); - break; -#if CONFIG_EXT_TX - case V_FLIPADST: - case V_ADST: - case V_DCT: - case IDTX: - iidtx4_sse2(in + 4); - array_transpose_4x4(in + 4); - iidtx4_sse2(in + 6); - array_transpose_4x4(in + 6); - break; -#endif - default: assert(0); break; - } - - scale_sqrt2_8x4(in + 4); - - // Repack data - in[0] = _mm_unpacklo_epi64(in[4], in[6]); - in[1] = _mm_unpackhi_epi64(in[4], in[6]); - in[2] = _mm_unpacklo_epi64(in[5], in[7]); - in[3] = _mm_unpackhi_epi64(in[5], in[7]); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case 
DCT_FLIPADST: - case V_DCT: -#endif - aom_idct8_sse2(in); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - aom_iadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case H_FLIPADST: - case IDTX: - iidtx8_sse2(in); - array_transpose_8x8(in, in); - break; -#endif - default: assert(0); break; - } - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: -#endif - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - in[4] = _mm_shufflelo_epi16(in[4], 0x1b); - in[5] = _mm_shufflelo_epi16(in[5], 0x1b); - in[6] = _mm_shufflelo_epi16(in[6], 0x1b); - in[7] = _mm_shufflelo_epi16(in[7], 0x1b); - break; - case FLIPADST_FLIPADST: - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - in[4] = _mm_shufflelo_epi16(in[4], 0x1b); - in[5] = _mm_shufflelo_epi16(in[5], 0x1b); - in[6] = _mm_shufflelo_epi16(in[6], 0x1b); - in[7] = _mm_shufflelo_epi16(in[7], 0x1b); - FLIPUD_PTR(dest, stride, 8); - break; -#endif - default: assert(0); break; - } - in[0] = _mm_unpacklo_epi64(in[0], in[1]); - in[1] = _mm_unpacklo_epi64(in[2], in[3]); - in[2] = _mm_unpacklo_epi64(in[4], in[5]); - in[3] = _mm_unpacklo_epi64(in[6], in[7]); - write_buffer_4x8_round5(dest, in, stride); -} - -// Note: The 16-column 32-element transforms take input in the form of four -// 8x16 blocks (each stored as a 
__m128i[16]), which are the four quadrants -// of the overall 16x32 input buffer. -static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); - idct32_8col(tl, bl); - idct32_8col(tr, br); -} - -static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - __m128i tmpl[16], tmpr[16]; - int i; - - // Copy the top half of the input to temporary storage - for (i = 0; i < 16; ++i) { - tmpl[i] = tl[i]; - tmpr[i] = tr[i]; - } - - // Generate the top half of the output - for (i = 0; i < 16; ++i) { - tl[i] = _mm_slli_epi16(bl[i], 2); - tr[i] = _mm_slli_epi16(br[i], 2); - } - array_transpose_16x16(tl, tr); - - // Copy the temporary storage back to the bottom half of the input - for (i = 0; i < 16; ++i) { - bl[i] = tmpl[i]; - br[i] = tmpr[i]; - } - - // Generate the bottom half of the output - scale_sqrt2_8x16(bl); - scale_sqrt2_8x16(br); - aom_idct16_sse2(bl, br); // Includes a transposition -} - -#if CONFIG_EXT_TX -static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - int i; - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); - for (i = 0; i < 16; ++i) { - tl[i] = _mm_slli_epi16(tl[i], 2); - tr[i] = _mm_slli_epi16(tr[i], 2); - bl[i] = _mm_slli_epi16(bl[i], 2); - br[i] = _mm_slli_epi16(br[i], 2); - } -} -#endif // CONFIG_EXT_TX - -static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, - __m128i *intr, __m128i *inbl, - __m128i *inbr, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - int i; - - for (i = 0; i < 16; ++i) { - intl[i] = _mm_adds_epi16(intl[i], final_rounding); - intr[i] = _mm_adds_epi16(intr[i], final_rounding); - inbl[i] = _mm_adds_epi16(inbl[i], final_rounding); - inbr[i] = _mm_adds_epi16(inbr[i], final_rounding); - intl[i] = _mm_srai_epi16(intl[i], 6); - intr[i] = _mm_srai_epi16(intr[i], 6); - 
inbl[i] = _mm_srai_epi16(inbl[i], 6); - inbr[i] = _mm_srai_epi16(inbr[i], 6); - RECON_AND_STORE(dest + i * stride + 0, intl[i]); - RECON_AND_STORE(dest + i * stride + 8, intr[i]); - RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]); - RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]); - } -} - -void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m128i intl[16], intr[16], inbl[16], inbr[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - - int i; - for (i = 0; i < 16; ++i) { - intl[i] = load_input_data(input + i * 16 + 0); - intr[i] = load_input_data(input + i * 16 + 8); - inbl[i] = load_input_data(input + (i + 16) * 16 + 0); - inbr[i] = load_input_data(input + (i + 16) * 16 + 8); - } - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - aom_idct16_sse2(intl, intr); - aom_idct16_sse2(inbl, inbr); - break; - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: -#endif - aom_iadst16_sse2(intl, intr); - aom_iadst16_sse2(inbl, inbr); - break; -#if CONFIG_EXT_TX - case V_FLIPADST: - case V_ADST: - case V_DCT: - case IDTX: - iidtx16_sse2(intl, intr); - iidtx16_sse2(inbl, inbr); - break; -#endif - default: assert(0); break; - } - - scale_sqrt2_8x16(intl); - scale_sqrt2_8x16(intr); - scale_sqrt2_8x16(inbl); - scale_sqrt2_8x16(inbr); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case V_DCT: -#endif - idct32_16col(intl, intr, inbl, inbr); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - ihalfright32_16col(intl, intr, inbl, inbr); - break; -#if CONFIG_EXT_TX - case H_DCT: - 
case H_ADST: - case H_FLIPADST: - case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break; -#endif - default: assert(0); break; - } - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: -#endif - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - for (i = 0; i < 16; ++i) { - __m128i tmp = intl[i]; - intl[i] = mm_reverse_epi16(intr[i]); - intr[i] = mm_reverse_epi16(tmp); - tmp = inbl[i]; - inbl[i] = mm_reverse_epi16(inbr[i]); - inbr[i] = mm_reverse_epi16(tmp); - } - break; - case FLIPADST_FLIPADST: - for (i = 0; i < 16; ++i) { - __m128i tmp = intl[i]; - intl[i] = mm_reverse_epi16(intr[i]); - intr[i] = mm_reverse_epi16(tmp); - tmp = inbl[i]; - inbl[i] = mm_reverse_epi16(inbr[i]); - inbr[i] = mm_reverse_epi16(tmp); - } - FLIPUD_PTR(dest, stride, 32); - break; -#endif - default: assert(0); break; - } - write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride); -} - -static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, - __m128i *in1, __m128i *in2, - __m128i *in3, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - int i; - - for (i = 0; i < 16; ++i) { - in0[i] = _mm_adds_epi16(in0[i], final_rounding); - in1[i] = _mm_adds_epi16(in1[i], final_rounding); - in2[i] = _mm_adds_epi16(in2[i], final_rounding); - in3[i] = _mm_adds_epi16(in3[i], final_rounding); - in0[i] = _mm_srai_epi16(in0[i], 6); - in1[i] = _mm_srai_epi16(in1[i], 6); - in2[i] = _mm_srai_epi16(in2[i], 6); - in3[i] = _mm_srai_epi16(in3[i], 6); - RECON_AND_STORE(dest + i * stride + 0, in0[i]); - RECON_AND_STORE(dest + i * stride + 8, in1[i]); - RECON_AND_STORE(dest + i * stride + 16, in2[i]); - RECON_AND_STORE(dest + i * stride + 24, in3[i]); - } -} - -void 
av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - __m128i in0[16], in1[16], in2[16], in3[16]; - const TX_TYPE tx_type = txfm_param->tx_type; - int i; - - for (i = 0; i < 16; ++i) { - in0[i] = load_input_data(input + i * 32 + 0); - in1[i] = load_input_data(input + i * 32 + 8); - in2[i] = load_input_data(input + i * 32 + 16); - in3[i] = load_input_data(input + i * 32 + 24); - } - - // Row transform - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case H_DCT: -#endif - idct32_16col(in0, in1, in2, in3); - break; - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case H_ADST: - case H_FLIPADST: -#endif - ihalfright32_16col(in0, in1, in2, in3); - break; -#if CONFIG_EXT_TX - case V_FLIPADST: - case V_ADST: - case V_DCT: - case IDTX: iidtx32_16col(in0, in1, in2, in3); break; -#endif - default: assert(0); break; - } - - scale_sqrt2_8x16(in0); - scale_sqrt2_8x16(in1); - scale_sqrt2_8x16(in2); - scale_sqrt2_8x16(in3); - - // Column transform - switch (tx_type) { - case DCT_DCT: - case DCT_ADST: -#if CONFIG_EXT_TX - case DCT_FLIPADST: - case V_DCT: -#endif - aom_idct16_sse2(in0, in1); - aom_idct16_sse2(in2, in3); - break; - case ADST_DCT: - case ADST_ADST: -#if CONFIG_EXT_TX - case FLIPADST_ADST: - case ADST_FLIPADST: - case FLIPADST_FLIPADST: - case FLIPADST_DCT: - case V_ADST: - case V_FLIPADST: -#endif - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in2, in3); - break; -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case H_FLIPADST: - case IDTX: - iidtx16_sse2(in0, in1); - iidtx16_sse2(in2, in3); - break; -#endif - default: assert(0); break; - } - - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: -#if CONFIG_EXT_TX - case H_DCT: - case H_ADST: - case V_ADST: - case V_DCT: - case IDTX: -#endif - break; -#if CONFIG_EXT_TX - case 
FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - for (i = 0; i < 16; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - break; - case FLIPADST_FLIPADST: - for (i = 0; i < 16; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - FLIPUD_PTR(dest, stride, 16); - break; -#endif - default: assert(0); break; - } - write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride); -} diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c index ea4acff33..0c857b583 100644 --- a/third_party/aom/av1/common/x86/intra_edge_sse4.c +++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c @@ -12,8 +12,8 @@ #include #include -#include "./aom_config.h" -#include "./av1_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { if (!strength) return; @@ -39,9 +39,9 @@ void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { // Adjust input pointer for filter support area uint8_t *in = (strength == 3) ? p - 1 : p; - // Avoid modifying first/last samples + // Avoid modifying first sample uint8_t *out = p + 1; - int len = sz - 2; + int len = sz - 1; const int use_3tap_filter = (strength < 3); @@ -133,9 +133,9 @@ void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) { // Adjust input pointer for filter support area uint16_t *in = (strength == 3) ? 
p - 1 : p; - // Avoid modifying first/last samples + // Avoid modifying first sample uint16_t *out = p + 1; - int len = sz - 2; + int len = sz - 1; const int use_3tap_filter = (strength < 3); diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c new file mode 100644 index 000000000..ac1d2c9ca --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, + int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i 
wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + __m256i filt[4], coeffs[4]; + + assert(bits >= 0); + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); + + const __m256i round_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + (void)filter_params_y; + (void)subpel_y_q4; + + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))), + 0x20); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + 
_mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } +} + +void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, + int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + // +1 to compensate for dividing the filter coeffs by 2 + const int left_shift = FILTER_BITS - conv_params->round_0 + 1; + const __m256i round_const = + _mm256_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + const int w0 = conv_params->fwd_offset; + const int w1 
= conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int offset_1 = (1 << (bd + FILTER_BITS - 2)); + const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); + const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + const __m256i zero = _mm256_setzero_si256(); + __m256i coeffs[4], s[8]; + + assert((FILTER_BITS - conv_params->round_0) >= 0); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); + + (void)conv_params; + (void)filter_params_x; + (void)subpel_x_q4; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + 
_mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + __m256i res_lo = convolve_lowbd(s, coeffs); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + 
*(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i data_ref_0_hi = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))), + 0x20); + + const __m256i comp_avg_res_lo = + comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); + + const __m256i comp_avg_res_hi = + comp_avg(&data_ref_0_hi, &res_hi_unsigned, 
&wt, use_jnt_comp_avg); + + const __m256i round_result_lo = convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + + const __m256i round_result_hi = convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); + + const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } +} + +void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, + int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + 
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + 
(__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( 
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + 
_mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i zero = _mm256_setzero_si256(); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + int i, j; + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]))); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + 
const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + _mm256_castsi256_si128(res_0)); + } else { + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), + 0x20); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = 
_mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c new file mode 100644 index 000000000..4df7bd42e --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, + int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_q4; + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + + if (w == 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + 
_mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); + } + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + j += 8; + } while (j < w); + } while (++i < h); + } +} + +void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, + int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); + const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); + 
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_q4; + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + + if (w == 4) { + __m128i s[8], src6, res, res_shift; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = 
_mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + res = convolve_lo_y(s + 1, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + res_16b = _mm_packs_epi32(res_shift, res_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); 
+ } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + + __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * 
dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c new file mode 100644 index 000000000..e4d51ac8d --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c @@ -0,0 
+1,232 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = 
_mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, 
round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride 
+ j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c deleted file mode 100644 index b3ed9efdf..000000000 --- a/third_party/aom/av1/common/x86/pvq_sse4.c +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include -#include -#include - -#include "./av1_rtcd.h" -#include "av1/common/x86/pvq_sse4.h" -#include "../odintrin.h" -#include "av1/common/pvq.h" - -#define EPSILON 1e-15f - -static __m128 horizontal_sum_ps(__m128 x) { - x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2))); - x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1))); - return x; -} - -static __m128i horizontal_sum_epi32(__m128i x) { - x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))); - x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))); - return x; -} - -static INLINE float rsqrtf(float x) { - float y; - _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x))); - return y; -} - -/** Find the codepoint on the given PSphere closest to the desired - * vector. This is a float-precision PVQ search just to make sure - * our tests aren't limited by numerical accuracy. It's close to the - * pvq_search_rdo_double_c implementation, but is not bit accurate and - * it performs slightly worse on PSNR. One reason is that this code runs - * more RDO iterations than the C code. It also uses single precision - * floating point math, whereas the C version uses double precision. 
- * - * @param [in] xcoeff input vector to quantize (x in the math doc) - * @param [in] n number of dimensions - * @param [in] k number of pulses - * @param [out] ypulse optimal codevector found (y in the math doc) - * @param [in] g2 multiplier for the distortion (typically squared - * gain units) - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @param [in] prev_k number of pulses already in ypulse that we should - * reuse for the search (or 0 for a new search) - * @return cosine distance between x and y (between 0 and 1) - */ -double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k, - int *ypulse, double g2, - double pvq_norm_lambda, int prev_k) { - int i, j; - int reuse_pulses = prev_k > 0 && prev_k <= k; - /* TODO - This blows our 8kB stack space budget and should be fixed when - converting PVQ to fixed point. */ - float xx = 0, xy = 0, yy = 0; - float x[MAXN + 3]; - float y[MAXN + 3]; - float sign_y[MAXN + 3]; - for (i = 0; i < n; i++) { - float tmp = (float)xcoeff[i]; - xx += tmp * tmp; - x[i] = xcoeff[i]; - } - - x[n] = x[n + 1] = x[n + 2] = 0; - ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0; - - __m128 sums = _mm_setzero_ps(); - for (i = 0; i < n; i += 4) { - __m128 x4 = _mm_loadu_ps(&x[i]); - __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); - /* Save the sign, we'll put it back later. */ - _mm_storeu_ps(&sign_y[i], s4); - /* Get rid of the sign. */ - x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4); - sums = _mm_add_ps(sums, x4); - if (!reuse_pulses) { - /* Clear y and ypulse in case we don't do the projection. 
*/ - _mm_storeu_ps(&y[i], _mm_setzero_ps()); - _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128()); - } - _mm_storeu_ps(&x[i], x4); - } - sums = horizontal_sum_ps(sums); - int pulses_left = k; - { - __m128i pulses_sum; - __m128 yy4, xy4; - xy4 = yy4 = _mm_setzero_ps(); - pulses_sum = _mm_setzero_si128(); - if (reuse_pulses) { - /* We reuse pulses from a previous search so we don't have to search them - again. */ - for (j = 0; j < n; j += 4) { - __m128 x4, y4; - __m128i iy4; - iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j])); - pulses_sum = _mm_add_epi32(pulses_sum, iy4); - _mm_storeu_si128((__m128i *)&ypulse[j], iy4); - y4 = _mm_cvtepi32_ps(iy4); - x4 = _mm_loadu_ps(&x[j]); - xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); - yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); - /* Double the y[] vector so we don't have to do it in the search loop. - */ - _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); - } - pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum)); - xy4 = horizontal_sum_ps(xy4); - xy = _mm_cvtss_f32(xy4); - yy4 = horizontal_sum_ps(yy4); - yy = _mm_cvtss_f32(yy4); - } else if (k > (n >> 1)) { - /* Do a pre-search by projecting on the pyramid. */ - __m128 rcp4; - float sum = _mm_cvtss_f32(sums); - /* If x is too small, just replace it with a pulse at 0. This prevents - infinities and NaNs from causing too many pulses to be allocated. Here, - 64 is an - approximation of infinity. */ - if (sum <= EPSILON) { - x[0] = 1.f; - for (i = 1; i < n; i++) { - x[i] = 0; - } - sums = _mm_set_ps1(1.f); - } - /* Using k + e with e < 1 guarantees we cannot get more than k pulses. 
*/ - rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums)); - xy4 = yy4 = _mm_setzero_ps(); - pulses_sum = _mm_setzero_si128(); - for (j = 0; j < n; j += 4) { - __m128 rx4, x4, y4; - __m128i iy4; - x4 = _mm_loadu_ps(&x[j]); - rx4 = _mm_mul_ps(x4, rcp4); - iy4 = _mm_cvttps_epi32(rx4); - pulses_sum = _mm_add_epi32(pulses_sum, iy4); - _mm_storeu_si128((__m128i *)&ypulse[j], iy4); - y4 = _mm_cvtepi32_ps(iy4); - xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); - yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); - /* Double the y[] vector so we don't have to do it in the search loop. - */ - _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); - } - pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum)); - xy = _mm_cvtss_f32(horizontal_sum_ps(xy4)); - yy = _mm_cvtss_f32(horizontal_sum_ps(yy4)); - } - x[n] = x[n + 1] = x[n + 2] = -100; - y[n] = y[n + 1] = y[n + 2] = 100; - } - - /* This should never happen. */ - OD_ASSERT(pulses_left <= n + 3); - - float lambda_delta_rate[MAXN + 3]; - if (pulses_left) { - /* Hoist lambda to avoid the multiply in the loop. */ - float lambda = - 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2); - float delta_rate = 3.f / n; - __m128 count = _mm_set_ps(3, 2, 1, 0); - for (i = 0; i < n; i += 4) { - _mm_storeu_ps(&lambda_delta_rate[i], - _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate))); - count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4)); - } - } - lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] = - 1e30f; - - for (i = 0; i < pulses_left; i++) { - int best_id = 0; - __m128 xy4, yy4; - __m128 max, max2; - __m128i count; - __m128i pos; - - /* The squared magnitude term gets added anyway, so we might as well - add it outside the loop. 
*/ - yy = yy + 1; - xy4 = _mm_load1_ps(&xy); - yy4 = _mm_load1_ps(&yy); - max = _mm_setzero_ps(); - pos = _mm_setzero_si128(); - count = _mm_set_epi32(3, 2, 1, 0); - for (j = 0; j < n; j += 4) { - __m128 x4, y4, r4; - x4 = _mm_loadu_ps(&x[j]); - y4 = _mm_loadu_ps(&y[j]); - x4 = _mm_add_ps(x4, xy4); - y4 = _mm_add_ps(y4, yy4); - y4 = _mm_rsqrt_ps(y4); - r4 = _mm_mul_ps(x4, y4); - /* Subtract lambda. */ - r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j])); - /* Update the index of the max. */ - pos = _mm_max_epi16( - pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); - /* Update the max. */ - max = _mm_max_ps(max, r4); - /* Update the indices (+4) */ - count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4)); - } - /* Horizontal max. */ - max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); - max2 = - _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); - /* Now that max2 contains the max at all positions, look at which value(s) - of the - partial max is equal to the global max. */ - pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); - pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); - pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); - best_id = _mm_cvtsi128_si32(pos); - OD_ASSERT(best_id < n); - /* Updating the sums of the new pulse(s) */ - xy = xy + x[best_id]; - /* We're multiplying y[j] by two so we don't have to do it here. */ - yy = yy + y[best_id]; - /* Only now that we've made the final choice, update y/ypulse. */ - /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */ - y[best_id] += 2; - ypulse[best_id]++; - } - - /* Put the original sign back. 
*/ - for (i = 0; i < n; i += 4) { - __m128i y4; - __m128i s4; - y4 = _mm_loadu_si128((__m128i *)&ypulse[i]); - s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i])); - y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); - _mm_storeu_si128((__m128i *)&ypulse[i], y4); - } - return xy * rsqrtf(xx * yy + FLT_MIN); -} diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h deleted file mode 100644 index 3c4ce8543..000000000 --- a/third_party/aom/av1/common/x86/pvq_sse4.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_COMMON_PVQ_X86_SSE4_H_ -#define AOM_COMMON_PVQ_X86_SSE4_H_ -#endif // AOM_COMMON_PVQ_X86_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c new file mode 100644 index 000000000..ffbb31849 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_avx2.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 16) { + av1_build_compound_diffwtd_mask_highbd_ssse3( + mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); + } else { + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + assert(bd >= 8); + assert((w % 16) == 0); + const __m256i y0 = _mm256_setzero_si256(); + const __m256i yAOM_BLEND_A64_MAX_ALPHA = + _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m256i ymask_base = _mm256_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + 
_mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git 
a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c new file mode 100644 index 000000000..5171ca493 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_sse4.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 +#include /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "av1/common/blockd.h" + +static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, + const __m128i s1) { + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); + return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) +} + +void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m128i mask_base = _mm_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0); + const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0)); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0 = _mm_cvtepu8_epi16(s0AB); + + const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1); + const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1)); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1 = _mm_cvtepu8_epi16(s1AB); + + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + + *(uint32_t *)mask = _mm_cvtsi128_si32(m8); + *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += 8; + i += 2; + } while (i < h); + } else if (8 == w) { + do { + __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); + __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); + s0 = _mm_cvtepu8_epi16(s0); + s1 = _mm_cvtepu8_epi16(s1); + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + _mm_storel_epi64((__m128i *)mask, m8); + src0 += stride0; + src1 += stride1; + mask += 8; + i += 1; + } while (i < h); + } else { + const __m128i zero = _mm_setzero_si128(); + do { + int j = 0; + do { + const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); + const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); + const __m128i s0L = _mm_cvtepu8_epi16(s0); + const __m128i s1L = _mm_cvtepu8_epi16(s1); + const __m128i s0H = _mm_unpackhi_epi8(s0, zero); + const __m128i s1H = _mm_unpackhi_epi8(s1, zero); + + const __m128i m16L = calc_mask(mask_base, s0L, s1L); + const __m128i m16H = calc_mask(mask_base, s0H, s1H); + + const __m128i m8 = _mm_packus_epi16(m16L, m16H); + _mm_store_si128((__m128i *)(mask + j), m8); + j += 16; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; 
+ i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_sse4_1( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; + const int mask_base = 38; + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); + const __m128i mask_base_16 = _mm_set1_epi16(mask_base); + const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i add_const = + _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); + const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); + + int i, j; + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data_src0 = + _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); + const __m128i data_src1 = + _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); + + const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); + const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); + const __m128i diff = _mm_max_epu16(diffa, diffb); + const __m128i diff_round = + _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); + const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); + __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); + // clamp to 0 can be skipped since we are using add and saturate + // instruction + + const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); + const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); + + // Store values into the destination buffer + __m128i *const dst = (__m128i *)&mask[i * w + j]; + + if ((w - j) > 4) { + _mm_storel_epi64(dst, res_8); + } else { // w==4 + *(uint32_t *)dst = _mm_cvtsi128_si32(res_8); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c new file mode 100644 index 000000000..cf684447c --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_ssse3( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 8) { + av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, + src1, src1_stride, h, w, bd); + } else { + assert(bd >= 8); + assert((w % 8) == 0); + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + const __m128i x0 = _mm_setzero_si128(); + const __m128i xAOM_BLEND_A64_MAX_ALPHA = + _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m128i xmask_base = _mm_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = 
_mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c new file mode 100644 index 000000000..375def62e --- /dev/null +++ b/third_party/aom/av1/common/x86/selfguided_avx2.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_8_32(const void *p) { + return _mm256_cvtepu8_epi32(xx_loadl_64(p)); +} + +// Load 8 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_16_32(const void *p) { + return _mm256_cvtepu16_epi32(xx_loadu_128(p)); +} + +// Compute the scan of an AVX2 register holding 8 32-bit integers. If the +// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., +// x0+x1+...+x7 +// +// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers +// (assumed small enough to be able to add them without overflow). +// +// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. 
+// +// x = [h g f e][d c b a] +// x01 = [g f e 0][c b a 0] +// x02 = [g+h f+g e+f e][c+d b+c a+b a] +// x03 = [e+f e 0 0][a+b a 0 0] +// x04 = [e->h e->g e->f e][a->d a->c a->b a] +// s = a->d +// s01 = [a->d a->d a->d a->d] +// s02 = [a->d a->d a->d a->d][0 0 0 0] +// ret = [a->h a->g a->f a->e][a->d a->c a->b a] +static __m256i scan_32(__m256i x) { + const __m256i x01 = _mm256_slli_si256(x, 4); + const __m256i x02 = _mm256_add_epi32(x, x01); + const __m256i x03 = _mm256_slli_si256(x02, 8); + const __m256i x04 = _mm256_add_epi32(x02, x03); + const int32_t s = _mm256_extract_epi32(x04, 3); + const __m128i s01 = _mm_set1_epi32(s); + const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); + return _mm256_add_epi32(x04, s02); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple +// of 8. + +static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { + unsigned int i = 0; + for (i = 0; i < (count & 0xffffffe0); i += 32) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); + } + for (; i < (count & 0xfffffff8); i += 8) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + } + for (; i < count; i++) { + dest[i] = 0; + } + return dest; +} + +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + for (int i = 0; i < height; ++i) { + // Zero the left column. 
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. + __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute 8 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); + const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); + const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); + const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); + const __m256i u = _mm256_sub_epi32(tr, tl); + const __m256i v = _mm256_sub_epi32(br, bl); + return _mm256_sub_epi32(v, u); +} + +static __m256i round_for_shift(unsigned shift) { + return _mm256_set1_epi32((1 << shift) >> 1); +} + +static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { + __m256i an, bb; + if (bit_depth > 8) { + const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m256i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m256i a = + _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); + const __m256i b = + _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm256_madd_epi16(b, b); + an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); + } else { + bb = _mm256_madd_epi16(sum1, sum1); + an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); + } + return _mm256_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. 
+ int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. 
+// +// Pixels are indexed as follows: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m256i cross_sum(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fours = _mm256_add_epi32( + xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); + const __m256i threes = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + + return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), + threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m256i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? 
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. 
+ int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) - sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fives = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + const __m256i sixes = _mm256_add_epi32(xt, xb); + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// Calculate 8 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + + const __m256i fives = _mm256_add_epi32(xl, xr); + const __m256i sixes = x; + + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. 
Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m256i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m256i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 8) { + const __m256i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m256i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? 
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } + } +} + +void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, + // Ctl and Dtl is 32-byte aligned. + const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); + + DECLARE_ALIGNED(32, int32_t, + buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]); + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 32 bytes for efficiency. + int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); + + // The "tl" pointers point at the top-left of the initialised data for the + // array. + int32_t *Atl = buf + 0 * buf_elts + 7; + int32_t *Btl = buf + 1 * buf_elts + 7; + int32_t *Ctl = buf + 2 * buf_elts + 7; + int32_t *Dtl = buf + 3 * buf_elts + 7; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). 
+ int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. + assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } +} + +void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1, + width, eps, bit_depth, highbd); + const sgr_params_type 
*const params = &sgr_params[eps]; + int xq[2]; + decode_xq(xqd, xq, params); + + __m256i xq0 = _mm256_set1_epi32(xq[0]); + __m256i xq1 = _mm256_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 16 pixels + for (int j = 0; j < width; j += 16) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m256i ep_0, ep_1; + __m128i src_0, src_1; + if (highbd) { + src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); + ep_0 = _mm256_cvtepu16_epi32(src_0); + ep_1 = _mm256_cvtepu16_epi32(src_1); + } else { + src_0 = xx_loadu_128(dat8ij); + ep_0 = _mm256_cvtepu8_epi32(src_0); + ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); + } + + const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); + const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); + + __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); + + const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); + + const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); + } + + const __m256i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_0 = _mm256_srai_epi32( + _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_1 = _mm256_srai_epi32( + _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 
16 bits and clamp to [0, 2^bit_depth) + // Note that packing into 16 bits messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packus_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); + const __m256i res = _mm256_min_epi16(tmp2, max); + yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + // Note that each pack messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packs_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i res = + _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); + const __m128i res2 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); + xx_storeu_128(dst8 + m, res2); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index 9de9177c1..a42c94028 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -1,1821 +1,643 @@ #include -#include "./aom_config.h" -#include "./av1_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/restoration.h" #include "aom_dsp/x86/synonyms.h" -/* Calculate four consecutive entries of the intermediate A and B arrays - (corresponding to the first loop in the C version of - av1_selfguided_restoration) -*/ -static void calc_block(__m128i sum, __m128i sum_sq, __m128i n, - __m128i *one_over_n_, __m128i *s_, int bit_depth, - int idx, int32_t *A, int32_t *B) { - __m128i a, b, p; - __m128i one_over_n = *one_over_n_; - __m128i s = *s_; -#if CONFIG_HIGHBITDEPTH - if (bit_depth > 8) { - __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1); - __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1); - __m128i shift_a 
= _mm_cvtsi32_si128(2 * (bit_depth - 8)); - __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); - a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a); - b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b); - a = _mm_mullo_epi32(a, n); - b = _mm_mullo_epi32(b, b); - p = _mm_sub_epi32(_mm_max_epi32(a, b), b); - } else { -#endif - (void)bit_depth; - a = _mm_mullo_epi32(sum_sq, n); - b = _mm_mullo_epi32(sum, sum); - p = _mm_sub_epi32(a, b); -#if CONFIG_HIGHBITDEPTH - } -#endif - - __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1); - __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z), - SGRPROJ_MTABLE_BITS); - z = _mm_min_epi32(z, _mm_set1_epi32(255)); - - // 'Gather' type instructions are not available pre-AVX2, so synthesize a - // gather using scalar loads. - __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); - - _mm_storeu_si128((__m128i *)&A[idx], a_res); - - __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1); - __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); - __m128i b_int = - _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n)); - __m128i b_res = - _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS); - - _mm_storeu_si128((__m128i *)&B[idx], b_res); +// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_8_32(const void *p) { + return _mm_cvtepu8_epi32(xx_loadl_32(p)); } -static void selfguided_restoration_1_v(uint8_t *src, int width, int height, - int src_stride, int32_t *A, int32_t *B, - int buf_stride) { - int i, j; - - // Vertical sum - // When the width is not a multiple of 4, we know that 'stride' is rounded up - // to a multiple of 4. 
So it is safe for this loop to calculate extra columns - // at the right-hand edge of the frame. - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, x, y, x2, y2; - __m128i sum, sum_sq, tmp; - - a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); - - sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); - tmp = _mm_unpacklo_epi16(a, b); - sum_sq = _mm_madd_epi16(tmp, tmp); - - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); - - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - for (i = 1; i < height - 2; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); - y = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j])); - - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); - - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); - - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); - } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); +// Load 4 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an SSE register. 
+static __m128i xx_load_extend_16_32(const void *p) { + return _mm_cvtepu16_epi32(xx_loadl_64(p)); +} - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); - } +// Compute the scan of an SSE register holding 4 32-bit integers. If the +// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, +// x0+x1+x2+x3 +static __m128i scan_32(__m128i x) { + const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); + return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); } -static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width, - int height, int buf_stride, int eps, - int bit_depth) { - int i, j; - - // Horizontal sum - int width_extend = (width + 3) & ~3; - for (i = 0; i < height; ++i) { - int h = AOMMIN(2, height - i) + AOMMIN(1, i); - - __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); - __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); - __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); - __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); - - // Note: The _mm_slli_si128 call sets up a register containing - // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]}, - // so that the first element of 'sum' (which should only add two values - // together) ends up calculated correctly. 
- __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4), - _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))); - __m128i sum_sq_ = _mm_add_epi32( - _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))); - __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h); - __m128i one_over_n = - _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1], - one_by_x[3 * h - 1], one_by_x[2 * h - 1]); - __m128i s = _mm_set_epi32( - sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], - sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, - B); - - n = _mm_set1_epi32(3 * h); - one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]); - s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]); - - // Re-align a1 and b1 so that they start at index i * buf_stride + 3 - a2 = _mm_alignr_epi8(a2, a1, 12); - b2 = _mm_alignr_epi8(b2, b1, 12); - - // Note: When the width is not a multiple of 4, this loop may end up - // writing to the last 4 columns of the frame, potentially with incorrect - // values (especially for r=2 and r=3). - // This is fine, since we fix up those values in the block after this - // loop, and in exchange we never have more than four values to - // write / fix up after this loop finishes. 
- for (j = 4; j < width_extend - 4; j += 4) { - a1 = a2; - b1 = b2; - a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]); - b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]); - /* Loop invariant: At this point, - a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3] - a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7] - and similar for b1,b2 and B - */ - sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), - _mm_alignr_epi8(b2, b1, 8))); - sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), - _mm_alignr_epi8(a2, a1, 8))); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, - i * buf_stride + j, A, B); - } - __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]); - __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]); - - j = width - 4; - switch (width % 4) { - case 0: - a1 = a2; - b1 = b2; - a2 = a3; - b2 = b3; - break; - case 1: - a1 = _mm_alignr_epi8(a2, a1, 4); - b1 = _mm_alignr_epi8(b2, b1, 4); - a2 = _mm_alignr_epi8(a3, a2, 4); - b2 = _mm_alignr_epi8(b3, b2, 4); - break; - case 2: - a1 = _mm_alignr_epi8(a2, a1, 8); - b1 = _mm_alignr_epi8(b2, b1, 8); - a2 = _mm_alignr_epi8(a3, a2, 8); - b2 = _mm_alignr_epi8(b3, b2, 8); - break; - case 3: - a1 = _mm_alignr_epi8(a2, a1, 12); - b1 = _mm_alignr_epi8(b2, b1, 12); - a2 = _mm_alignr_epi8(a3, a2, 12); - b2 = _mm_alignr_epi8(b3, b2, 12); - break; +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple +// of 4. 
+static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. + __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. 
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); } - - // Zero out the data loaded from "off the edge" of the array - __m128i zero = _mm_setzero_si128(); - a2 = _mm_blend_epi16(a2, zero, 0xfc); - b2 = _mm_blend_epi16(b2, zero, 0xfc); - - sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), - _mm_alignr_epi8(b2, b1, 8))); - sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), - _mm_alignr_epi8(a2, a1, 8))); - n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h); - one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1], - one_by_x[3 * h - 1], one_by_x[3 * h - 1]); - s = _mm_set_epi32( - sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], - sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, - A, B); } } -static void selfguided_restoration_2_v(uint8_t *src, int width, int height, - int src_stride, int32_t *A, int32_t *B, - int buf_stride) { - int i, j; - - // Vertical sum - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, c, c2, x, y, x2, y2; - __m128i sum, sum_sq, tmp; - - a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. 
+static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); - sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); - // Important: Since c may be up to 2^8, the result on squaring may - // be up to 2^16. So we need to zero-extend, not sign-extend. - c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c)); - tmp = _mm_unpacklo_epi16(a, b); - sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2); + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. 
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); - _mm_store_si128((__m128i *)&B[buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); - for (i = 2; i < height - 3; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); - x = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j]))); - y = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j]))); + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); - - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); - - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + // Calculate the new H - D. 
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); } } -static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width, - int height, int buf_stride, int eps, - int bit_depth) { - int i, j; - - // Horizontal sum - int width_extend = (width + 3) & ~3; - for (i = 0; i < height; ++i) { - int h = AOMMIN(3, height - i) + AOMMIN(2, i); - - __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); - __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); - __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); - __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); - - __m128i sum_ = _mm_add_epi32( - _mm_add_epi32( - _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)), - _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))), - _mm_alignr_epi8(b2, b1, 8)); - __m128i sum_sq_ = _mm_add_epi32( - _mm_add_epi32( - _mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)), - _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))), - _mm_alignr_epi8(a2, a1, 8)); - - __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h); - __m128i one_over_n = - _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1], - one_by_x[4 * h - 1], 
one_by_x[3 * h - 1]); - __m128i s = _mm_set_epi32( - sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], - sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, - B); - - // Re-align a1 and b1 so that they start at index i * buf_stride + 2 - a2 = _mm_alignr_epi8(a2, a1, 8); - b2 = _mm_alignr_epi8(b2, b1, 8); - - n = _mm_set1_epi32(5 * h); - one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]); - s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]); - - for (j = 4; j < width_extend - 4; j += 4) { - a1 = a2; - a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]); - b1 = b2; - b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]); - /* Loop invariant: At this point, - a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2] - a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6] - and similar for b1,b2 and B - */ - sum_ = _mm_add_epi32( - _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), - _mm_alignr_epi8(b2, b1, 8))), - _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2)); - sum_sq_ = _mm_add_epi32( - _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), - _mm_alignr_epi8(a2, a1, 8))), - _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2)); - - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, - i * buf_stride + j, A, B); - } - // If the width is not a multiple of 4, we need to reset j to width - 4 - // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained - __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]); - __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]); - - j = width - 4; - switch (width % 4) { - case 0: - a1 = a2; - b1 = b2; - a2 = a3; - b2 = b3; - break; - case 1: - a1 = _mm_alignr_epi8(a2, a1, 4); - b1 = _mm_alignr_epi8(b2, b1, 4); - a2 = _mm_alignr_epi8(a3, a2, 4); - b2 = _mm_alignr_epi8(b3, b2, 4); - break; - case 2: - a1 = _mm_alignr_epi8(a2, 
a1, 8); - b1 = _mm_alignr_epi8(b2, b1, 8); - a2 = _mm_alignr_epi8(a3, a2, 8); - b2 = _mm_alignr_epi8(b3, b2, 8); - break; - case 3: - a1 = _mm_alignr_epi8(a2, a1, 12); - b1 = _mm_alignr_epi8(b2, b1, 12); - a2 = _mm_alignr_epi8(a3, a2, 12); - b2 = _mm_alignr_epi8(b3, b2, 12); - break; - } - - // Zero out the data loaded from "off the edge" of the array - __m128i zero = _mm_setzero_si128(); - a2 = _mm_blend_epi16(a2, zero, 0xf0); - b2 = _mm_blend_epi16(b2, zero, 0xf0); - - sum_ = _mm_add_epi32( - _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), - _mm_alignr_epi8(b2, b1, 8))), - _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2)); - sum_sq_ = _mm_add_epi32( - _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), - _mm_alignr_epi8(a2, a1, 8))), - _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2)); - - n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h); - one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1], - one_by_x[5 * h - 1], one_by_x[5 * h - 1]); - s = _mm_set_epi32( - sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1], - sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, - A, B); - } +// Compute 4 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); + const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); + const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); + const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); + const __m128i u = _mm_sub_epi32(tr, tl); + const __m128i v = _mm_sub_epi32(br, bl); + return _mm_sub_epi32(v, u); } -static void selfguided_restoration_3_v(uint8_t *src, int width, int height, - int src_stride, int32_t *A, int32_t *B, - int buf_stride) { - int i, j; - - // Vertical sum over 7-pixel regions, 4 columns at a time - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, c, d, x, y, x2, y2; - __m128i sum, sum_sq, tmp, tmp2; +static __m128i round_for_shift(unsigned shift) { + return _mm_set1_epi32((1 << shift) >> 1); +} - a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); - d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); +static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { + __m128i an, bb; + if (bit_depth > 8) { + const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m128i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); + const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm_madd_epi16(b, b); + an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); + } else { + bb = _mm_madd_epi16(sum1, sum1); + an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); 
+ } + return _mm_sub_epi32(an, bb); +} - sum = _mm_cvtepi16_epi32( - _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); - tmp = _mm_unpacklo_epi16(a, b); - tmp2 = _mm_unpacklo_epi16(c, d); - sum_sq = - _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2)); +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); - _mm_store_si128((__m128i *)&B[buf_stride 
+ j], sum); - _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } - _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); + const __m128i p = compute_p(sum1, sum2, bit_depth, n); - x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); - for (i = 3; i < height - 4; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], + x_by_xplus1[_mm_extract_epi32(z, 2)], + x_by_xplus1[_mm_extract_epi32(z, 1)], + x_by_xplus1[_mm_extract_epi32(z, 0)]); - x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j])); - y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j])); + xx_storeu_128(A + i * buf_stride + j, a_res); - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + xx_storeu_128(B + i * buf_stride + j, b_res); } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); - - x = _mm_cvtepu8_epi32( - xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - 
x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq); } } -static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, - int height, int buf_stride, int eps, - int bit_depth) { - int i, j; - // Horizontal sum over 7-pixel regions of dst - int width_extend = (width + 3) & ~3; - for (i = 0; i < height; ++i) { - int h = AOMMIN(4, height - i) + AOMMIN(3, i); - - __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); - __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); - __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); - __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); - - __m128i sum_ = _mm_add_epi32( - _mm_add_epi32( - _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)), - _mm_add_epi32(_mm_slli_si128(b1, 4), b1)), - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), - _mm_alignr_epi8(b2, b1, 8)), - _mm_alignr_epi8(b2, b1, 12))); - __m128i sum_sq_ = _mm_add_epi32( - _mm_add_epi32( - _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)), - _mm_add_epi32(_mm_slli_si128(a1, 4), a1)), - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), - _mm_alignr_epi8(a2, a1, 8)), - _mm_alignr_epi8(a2, a1, 12))); - - __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h); - __m128i one_over_n = - _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1], - one_by_x[5 * h - 1], one_by_x[4 * h - 1]); - __m128i s = _mm_set_epi32( - sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1], - sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, - B); - - // Re-align a1 and b1 so that they start at index i * buf_stride + 1 - a2 = _mm_alignr_epi8(a2, a1, 4); - b2 = _mm_alignr_epi8(b2, b1, 4); - - n = _mm_set1_epi32(7 * h); - one_over_n = 
_mm_set1_epi32(one_by_x[7 * h - 1]); - s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]); - - for (j = 4; j < width_extend - 4; j += 4) { - a1 = a2; - a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]); - b1 = b2; - b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]); - __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]); - __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]); - /* Loop invariant: At this point, - a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1] - a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5] - a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9] - and similar for b1,b2,b3 and B - */ - sum_ = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)), - _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8), - _mm_alignr_epi8(b2, b1, 12))), - _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)), - _mm_alignr_epi8(b3, b2, 8))); - sum_sq_ = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)), - _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8), - _mm_alignr_epi8(a2, a1, 12))), - _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)), - _mm_alignr_epi8(a3, a2, 8))); - - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, - i * buf_stride + j, A, B); - } - __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]); - __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]); - - j = width - 4; - switch (width % 4) { - case 0: - a1 = a2; - b1 = b2; - a2 = a3; - b2 = b3; - break; - case 1: - a1 = _mm_alignr_epi8(a2, a1, 4); - b1 = _mm_alignr_epi8(b2, b1, 4); - a2 = _mm_alignr_epi8(a3, a2, 4); - b2 = _mm_alignr_epi8(b3, b2, 4); - break; - case 2: - a1 = _mm_alignr_epi8(a2, a1, 8); - b1 = _mm_alignr_epi8(b2, b1, 8); - a2 = _mm_alignr_epi8(a3, a2, 8); - b2 = _mm_alignr_epi8(b3, b2, 8); - break; - case 3: - a1 = _mm_alignr_epi8(a2, a1, 12); - b1 = _mm_alignr_epi8(b2, b1, 12); - a2 = _mm_alignr_epi8(a3, 
a2, 12); - b2 = _mm_alignr_epi8(b3, b2, 12); - break; - } - - // Zero out the data loaded from "off the edge" of the array - __m128i zero = _mm_setzero_si128(); - a2 = _mm_blend_epi16(a2, zero, 0xc0); - b2 = _mm_blend_epi16(b2, zero, 0xc0); - - sum_ = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)), - _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8), - _mm_alignr_epi8(b2, b1, 12))), - _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)), - _mm_alignr_epi8(zero, b2, 8))); - sum_sq_ = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)), - _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8), - _mm_alignr_epi8(a2, a1, 12))), - _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)), - _mm_alignr_epi8(zero, a2, 8))); - - n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h); - one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1], - one_by_x[6 * h - 1], one_by_x[7 * h - 1]); - s = _mm_set_epi32( - sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], - sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]); - calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, - A, B); - } +// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m128i cross_sum(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fours = _mm_add_epi32( + xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); + const __m128i threes = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + + return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); } -void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, - int dgd_stride, int32_t *dst, - int dst_stride, int r, int eps) { - const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; - const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *A = A_; - int32_t *B = B_; - int i, j; - // Adjusting the stride of A and B here appears to avoid bad cache effects, - // leading to a significant speed improvement. - // We also align the stride to a multiple of 16 bytes for efficiency. 
- int buf_stride = ((width_ext + 3) & ~3) + 16; - - // Don't filter tiles with dimensions < 5 on any axis - if ((width < 5) || (height < 5)) return; - - uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; - if (r == 1) { - selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B, - buf_stride); - selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8); - } else if (r == 2) { - selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B, - buf_stride); - selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8); - } else if (r == 3) { - selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B, - buf_stride); - selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8); - } else { - assert(0); - } - A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - - { - i = 0; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + - A[k + buf_stride + 1]; - const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + - B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + - A[k + buf_stride - 1] + A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] + - B[k + buf_stride - 1] + B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - j = width - 1; - { - const int k = i * buf_stride 
+ j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + - A[k + buf_stride - 1]; - const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + - B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - } - for (i = 1; i < height - 1; ++i) { - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k + 1] + A[k - buf_stride + 1] + - A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k + 1] + B[k - buf_stride + 1] + - B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - - // Vectorize the innermost loop - for (j = 1; j < width - 1; j += 4) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 5; - - __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]); - __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]); - __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]); - __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]); - __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]); - __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]); - - __m128i a0 = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2), - _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8), - _mm_alignr_epi8(tmp5, tmp4, 4))), - _mm_alignr_epi8(tmp1, tmp0, 4)); - __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4), - _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8), - _mm_alignr_epi8(tmp5, tmp4, 8))); - __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1); 
- - __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]); - __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]); - __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]); - __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]); - __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]); - __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]); - - __m128i b0 = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8), - _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8), - _mm_alignr_epi8(tmp11, tmp10, 4))), - _mm_alignr_epi8(tmp7, tmp6, 4)); - __m128i b1 = - _mm_add_epi32(_mm_add_epi32(tmp6, tmp10), - _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8), - _mm_alignr_epi8(tmp11, tmp10, 8))); - __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1); - - __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l])); - - __m128i rounding = _mm_set1_epi32( - (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1); - __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b); +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m128i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? 
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - _mm_storeu_si128((__m128i *)&dst[m], w); - } - // Deal with any extra pixels at the right-hand edge of the frame - // (typically have 2 such pixels, but may have anywhere between 0 and 3) - for (; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 5; - const int32_t a = - (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * - 4 + - (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + - A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * - 3; - const int32_t b = - (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * - 4 + - (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + - B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * - 3; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k - 1] + A[k - buf_stride - 1] + - A[k + buf_stride - 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k - 1] + B[k - buf_stride - 1] + - B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - } - - { - i = height - 1; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + - A[k - buf_stride + 1]; - const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + - B[k - buf_stride + 1]; - const int32_t v = 
a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + - A[k - buf_stride - 1] + A[k - buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] + - B[k - buf_stride - 1] + B[k - buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + - A[k - buf_stride - 1]; - const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + - B[k - buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - } -} - -void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, - int32_t *dst, int dst_stride, int corner, - int edge) { - int i, j; - const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge); - - { - i = 0; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + - corner * - (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + - dgd[k - 1] + dgd[k + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k 
+ stride] + dgd[k] * 2) + - corner * - (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]); - } - } - { - i = height - 1; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + - corner * - (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + - dgd[k - 1] + dgd[k + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + - corner * - (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); - } - } - __m128i center_ = _mm_set1_epi16(center); - __m128i edge_ = _mm_set1_epi16(edge); - __m128i corner_ = _mm_set1_epi16(corner); - for (i = 1; i < height - 1; ++i) { - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + - dgd[k - stride] + dgd[k + stride]); - } - // Process in units of 8 pixels at a time. 
- for (j = 1; j < width - 8; j += 8) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - - __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]); - __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]); - __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]); - - __m128i tl = _mm_cvtepu8_epi16(a); - __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)); - __m128i cl = _mm_cvtepu8_epi16(b); - __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8)); - __m128i bl = _mm_cvtepu8_epi16(c); - __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8)); - - __m128i x = _mm_alignr_epi8(cr, cl, 2); - __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl), - _mm_add_epi16(_mm_alignr_epi8(br, bl, 2), - _mm_alignr_epi8(cr, cl, 4))); - __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl), - _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4), - _mm_alignr_epi8(br, bl, 4))); - - __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_), - _mm_add_epi16(_mm_mullo_epi16(y, edge_), - _mm_mullo_epi16(z, corner_))); - - _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res)); - _mm_storeu_si128((__m128i *)&dst[l + 4], - _mm_cvtepi16_epi32(_mm_srli_si128(res, 8))); - } - // If there are enough pixels left in this row, do another batch of 4 - // pixels. 
- for (; j < width - 4; j += 4) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - - __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]); - __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]); - __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]); - - __m128i tl = _mm_cvtepu8_epi16(a); - __m128i cl = _mm_cvtepu8_epi16(b); - __m128i bl = _mm_cvtepu8_epi16(c); - - __m128i x = _mm_srli_si128(cl, 2); - __m128i y = _mm_add_epi16( - _mm_add_epi16(_mm_srli_si128(tl, 2), cl), - _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4))); - __m128i z = _mm_add_epi16( - _mm_add_epi16(tl, bl), - _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4))); - - __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_), - _mm_add_epi16(_mm_mullo_epi16(y, edge_), - _mm_mullo_epi16(z, corner_))); - - _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res)); - } - // Handle any leftover pixels - for (; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride + 1] + dgd[k + stride + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride] + dgd[k + stride]); + xx_storeu_128(dst + i * dst_stride + j, w); } } } -void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, - int stride, int eps, int *xqd, - uint8_t *dst, int dst_stride, - int32_t *tmpbuf) { - int xq[2]; - int32_t *flt1 = tmpbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int i, j; - assert(width * height <= RESTORATION_TILEPELS_MAX); -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width, - 
sgr_params[eps].corner, sgr_params[eps].edge); -#else - av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2); - decode_xq(xqd, xq); - - __m128i xq0 = _mm_set1_epi32(xq[0]); - __m128i xq1 = _mm_set1_epi32(xq[1]); - for (i = 0; i < height; ++i) { - // Calculate output in batches of 8 pixels - for (j = 0; j < width; j += 8) { - const int k = i * width + j; - const int l = i * stride + j; - const int m = i * dst_stride + j; - __m128i src = - _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])), - SGRPROJ_RST_BITS); - - const __m128i u_0 = _mm_cvtepu16_epi32(src); - const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8)); - - const __m128i f1_0 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0); - const __m128i f2_0 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0); - const __m128i f1_1 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1); - const __m128i f2_1 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1); - - const __m128i v_0 = _mm_add_epi32( - _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)), - _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS)); - const __m128i v_1 = _mm_add_epi32( - _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)), - _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS)); - - const __m128i rounding = - _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1); - const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), - SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), - SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - - const __m128i tmp = _mm_packs_epi32(w_0, w_1); - const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); - _mm_storel_epi64((__m128i *)&dst[m], res); 
- } - // Process leftover pixels - for (; j < width; ++j) { - const int k = i * width + j; - const int l = i * stride + j; - const int m = i * dst_stride + j; - const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[k] - u; - const int32_t f2 = (int32_t)flt2[k] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); - const int16_t w = - (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - dst[m] = (uint16_t)clip_pixel(w); - } +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); } -} - -#if CONFIG_HIGHBITDEPTH -// Only the vertical sums need to be adjusted for highbitdepth -static void highbd_selfguided_restoration_1_v(uint16_t *src, int width, - int height, int src_stride, - int32_t *A, int32_t *B, - int buf_stride) { - int i, j; + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 4) { + const 
int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, x, y, x2, y2; - __m128i sum, sum_sq, tmp; + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); - a = _mm_loadl_epi64((__m128i *)&src[j]); - b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]); + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); - sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); - tmp = _mm_unpacklo_epi16(a, b); - sum_sq = _mm_madd_epi16(tmp, tmp); + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); + const __m128i p = compute_p(sum1, sum2, bit_depth, n); - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); - for (i = 1; i < height - 2; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], + x_by_xplus1[_mm_extract_epi32(z, 2)], + x_by_xplus1[_mm_extract_epi32(z, 1)], + x_by_xplus1[_mm_extract_epi32(z, 0)]); - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); - y = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j])); + xx_storeu_128(A + i * buf_stride + j, a_res); - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + xx_storeu_128(B + i * buf_stride + j, b_res); } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); } } -static void highbd_selfguided_restoration_2_v(uint16_t *src, int width, - int height, int src_stride, - int32_t *A, int32_t *B, - int buf_stride) { - int i, j; - - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, c, c2, x, y, x2, y2; - __m128i sum, sum_sq, tmp; - - a = _mm_loadl_epi64((__m128i *)&src[j]); - b = _mm_loadl_epi64((__m128i 
*)&src[src_stride + j]); - c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]); - - sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); - // Important: We need to widen *before* squaring here, since - // c^2 may be up to 2^24. - c = _mm_cvtepu16_epi32(c); - c2 = _mm_mullo_epi32(c, c); - tmp = _mm_unpacklo_epi16(a, b); - sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2); - - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - for (i = 2; i < height - 3; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); - y = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j])); - - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); - - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); - - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); - } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * 
src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); - } +// Calculate 4 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) - sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fives = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + const __m128i sixes = _mm_add_epi32(xt, xb); + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// Calculate 4 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + + const __m128i fives = _mm_add_epi32(xl, xr); + const __m128i sixes = x; + + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); } -static void highbd_selfguided_restoration_3_v(uint16_t *src, int width, - int height, int src_stride, - int32_t *A, int32_t *B, - int buf_stride) { - int i, j; - - int width_extend = (width + 3) & ~3; - for (j = 0; j < width_extend; j += 4) { - __m128i a, b, c, d, x, y, x2, y2; - __m128i sum, sum_sq, tmp, tmp2; - - a = _mm_loadl_epi64((__m128i *)&src[j]); - b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]); - c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]); - d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]); - - sum = _mm_cvtepi16_epi32( - _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); - tmp = _mm_unpacklo_epi16(a, b); - tmp2 = _mm_unpacklo_epi16(c, d); - sum_sq = - _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2)); - - _mm_store_si128((__m128i *)&B[j], sum); - _mm_store_si128((__m128i *)&A[j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j])); 
- sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[6 * src_stride + j])); - sum = _mm_add_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_add_epi32(sum_sq, x2); - - for (i = 3; i < height - 4; ++i) { - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); - y = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j])); - - sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); - - x2 = _mm_mullo_epi32(x, x); - y2 = _mm_mullo_epi32(y, y); - - sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m128i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m128i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 4) { + const __m128i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m128i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? 
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } } - _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); - - x = _mm_cvtepu16_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); - sum = _mm_sub_epi32(sum, x); - x2 = _mm_mullo_epi32(x, x); - sum_sq = _mm_sub_epi32(sum_sq, x2); - - _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum); - _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq); } } -void av1_selfguided_restoration_highbd_sse4_1(uint16_t 
*dgd, int width, - int height, int dgd_stride, - int32_t *dst, int dst_stride, - int bit_depth, int r, int eps) { +void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, + int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]); + memset(buf, 0, sizeof(buf)); + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *A = A_; - int32_t *B = B_; - int i, j; + // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. int buf_stride = ((width_ext + 3) & ~3) + 16; - // Don't filter tiles with dimensions < 5 on any axis - if ((width < 5) || (height < 5)) return; - - uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; - if (r == 1) { - highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, - A, B, buf_stride); - selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, - bit_depth); - } else if (r == 2) { - highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, - A, B, buf_stride); - selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, - bit_depth); - } else if (r == 3) { - highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, - A, B, buf_stride); - selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, - bit_depth); - } else { - assert(0); - } - A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - - { - i = 0; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + 
j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + - A[k + buf_stride + 1]; - const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + - B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + - A[k + buf_stride - 1] + A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] + - B[k + buf_stride - 1] + B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + - A[k + buf_stride - 1]; - const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + - B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - } - for (i = 1; i < height - 1; ++i) { - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k + 1] + A[k - buf_stride + 1] + - A[k + buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k + 1] + B[k - buf_stride + 1] + - B[k + buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - - // Vectorize the innermost loop - for (j = 1; j < width - 1; j += 4) { - const int k = i * buf_stride + j; - const 
int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 5; - - __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]); - __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]); - __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]); - __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]); - __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]); - __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]); - - __m128i a0 = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2), - _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8), - _mm_alignr_epi8(tmp5, tmp4, 4))), - _mm_alignr_epi8(tmp1, tmp0, 4)); - __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4), - _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8), - _mm_alignr_epi8(tmp5, tmp4, 8))); - __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1); - - __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]); - __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]); - __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]); - __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]); - __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]); - __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]); - - __m128i b0 = _mm_add_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8), - _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8), - _mm_alignr_epi8(tmp11, tmp10, 4))), - _mm_alignr_epi8(tmp7, tmp6, 4)); - __m128i b1 = - _mm_add_epi32(_mm_add_epi32(tmp6, tmp10), - _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8), - _mm_alignr_epi8(tmp11, tmp10, 8))); - __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1); - - __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l])); - - __m128i rounding = _mm_set1_epi32( - (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1); - __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b); - __m128i w = 
_mm_srai_epi32(_mm_add_epi32(v, rounding), - SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - _mm_storeu_si128((__m128i *)&dst[m], w); - } - - // Deal with any extra pixels at the right-hand edge of the frame - // (typically have 2 such pixels, but may have anywhere between 0 and 3) - for (; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 5; - const int32_t a = - (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * - 4 + - (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + - A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * - 3; - const int32_t b = - (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * - 4 + - (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + - B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * - 3; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + - A[k - 1] + A[k - buf_stride - 1] + - A[k + buf_stride - 1]; - const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + - B[k - 1] + B[k - buf_stride - 1] + - B[k + buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } + // The "tl" pointers point at the top-left of the initialised data for the + // array. Adding 3 here ensures that column 1 is 16-byte aligned. + int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). 
Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); } - { - i = height - 1; - j = 0; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + - A[k - buf_stride + 1]; - const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + - B[k - buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + - A[k - buf_stride - 1] + A[k - buf_stride + 1]; - const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] + - B[k - buf_stride - 1] + B[k - buf_stride + 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } - j = width - 1; - { - const int k = i * buf_stride + j; - const int l = i * dgd_stride + j; - const int m = i * dst_stride + j; - const int nb = 3; - const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + - A[k - buf_stride - 1]; - const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + - B[k - buf_stride - 1]; - const int32_t v = a * dgd[l] + b; - dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - } + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + 
final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); } } -void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, - int stride, int32_t *dst, int dst_stride, - int corner, int edge) { - int i, j; - const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge); - - { - i = 0; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + - corner * - (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + - dgd[k - 1] + dgd[k + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) + - corner * - (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]); - } - } - __m128i center_ = _mm_set1_epi32(center); - __m128i edge_ = _mm_set1_epi32(edge); - __m128i corner_ = _mm_set1_epi32(corner); - for (i = 1; i < height - 1; ++i) { - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + - dgd[k - stride] + dgd[k + stride]); - } - // Process 4 pixels at a time - for (j = 1; j < width - 4; j += 4) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - - __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]); - __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]); - __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]); - - __m128i tl = _mm_cvtepu16_epi32(a); - __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); - 
__m128i cl = _mm_cvtepu16_epi32(b); - __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8)); - __m128i bl = _mm_cvtepu16_epi32(c); - __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8)); - - __m128i x = _mm_alignr_epi8(cr, cl, 4); - __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl), - _mm_add_epi32(_mm_alignr_epi8(br, bl, 4), - _mm_alignr_epi8(cr, cl, 8))); - __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl), - _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8), - _mm_alignr_epi8(br, bl, 8))); - - __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_), - _mm_add_epi32(_mm_mullo_epi32(y, edge_), - _mm_mullo_epi32(z, corner_))); - - _mm_storeu_si128((__m128i *)&dst[l], res); - } - // Handle any leftover pixels - for (; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride + 1] + dgd[k + stride + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + - edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + - corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + - dgd[k - stride] + dgd[k + stride]); - } - } - { - i = height - 1; - j = 0; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + - corner * - (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); - } - for (j = 1; j < width - 1; ++j) { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = center * dgd[k] + - edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + - corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + - dgd[k - 1] + dgd[k + 1]); - } - j = width - 1; - { - const int k = i * stride + j; - const int l = i * dst_stride + j; - dst[l] = - center * dgd[k] 
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + - corner * - (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); - } - } -} - -void apply_selfguided_restoration_highbd_sse4_1( - uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, - int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) { +void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1, + width, eps, bit_depth, highbd); + const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; - int32_t *flt1 = tmpbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int i, j; - assert(width * height <= RESTORATION_TILEPELS_MAX); -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width, - sgr_params[eps].corner, - sgr_params[eps].edge); -#else - av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1, - width, bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2, - width, bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2); - decode_xq(xqd, xq); + decode_xq(xqd, xq, params); __m128i xq0 = _mm_set1_epi32(xq[0]); __m128i xq1 = _mm_set1_epi32(xq[1]); - for (i = 0; i < height; ++i) { + + for (int i = 0; i < height; ++i) { // Calculate output in batches of 8 pixels - for (j = 0; j < width; j += 8) { + for (int j = 0; j < width; j += 8) { const int k = i * width + j; - const int l = i * stride + j; const int m = i * dst_stride + j; - __m128i src = - _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS); - - const __m128i u_0 = 
_mm_cvtepu16_epi32(src); - const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8)); - - const __m128i f1_0 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0); - const __m128i f2_0 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0); - const __m128i f1_1 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1); - const __m128i f2_1 = - _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1); - - const __m128i v_0 = _mm_add_epi32( - _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)), - _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS)); - const __m128i v_1 = _mm_add_epi32( - _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)), - _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS)); + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m128i src; + if (highbd) { + src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + } else { + src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); + } + + const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); + const __m128i u_0 = _mm_cvtepu16_epi32(u); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); + + __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); + + const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); + + const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); + } const __m128i rounding = - _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1); + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), 
SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - // Pack into 16 bits and clamp to [0, 2^bit_depth) - const __m128i tmp = _mm_packus_epi32(w_0, w_1); - const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); - const __m128i res = _mm_min_epi16(tmp, max); - - _mm_store_si128((__m128i *)&dst[m], res); - } - // Process leftover pixels - for (; j < width; ++j) { - const int k = i * width + j; - const int l = i * stride + j; - const int m = i * dst_stride + j; - const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[k] - u; - const int32_t f2 = (int32_t)flt2[k] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); - const int16_t w = - (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth); + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + const __m128i tmp = _mm_packus_epi32(w_0, w_1); + const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); + const __m128i res = _mm_min_epi16(tmp, max); + xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + const __m128i tmp = _mm_packs_epi32(w_0, w_1); + const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); + xx_storel_64(dst8 + m, res); + } } } } - -#endif diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c deleted file mode 100644 index d30466ae6..000000000 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./av1_rtcd.h" -#include "av1/common/warped_motion.h" - -void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, - int height, int stride, uint8_t *pred, int p_col, - int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, - ConvolveParams *conv_params, int16_t alpha, - int16_t beta, int16_t gamma, int16_t delta) { - int comp_avg = conv_params->do_average; - __m128i tmp[15]; - int i, j, k; - const int bd = 8; -#if CONFIG_CONVOLVE_ROUND - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = - use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = - use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; - if (use_conv_params) { - conv_params->do_post_rounding = 1; - } - assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif - - /* Note: For this code to work, the left/right frame borders need to be - extended by at least 13 pixels each. By the time we get here, other - code will have set up this border, but we allow an explicit check - for debugging purposes. 
- */ - /*for (i = 0; i < height; ++i) { - for (j = 0; j < 13; ++j) { - assert(ref[i * stride - 13 + j] == ref[i * stride]); - assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); - } - }*/ - - for (i = 0; i < p_height; i += 8) { - for (j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; - const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; - const int32_t x4 = dst_x >> subsampling_x; - const int32_t y4 = dst_y >> subsampling_y; - - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - // Add in all the constant terms, including rounding and offset - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Horizontal filter - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. 
- if (ix4 <= -7) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else if (ix4 >= width + 6) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i zero = _mm_setzero_si128(); - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); - - // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, 
tmp_6); - - // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i round_const = _mm_set1_epi32( - (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); - - // Calculate filtered results - const __m128i src_0 = _mm_unpacklo_epi8(src, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - // Filter odd-index pixels - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const 
__m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - // Combine results into one register. - // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 - // as this order helps with the vertical filter. 
- tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); - } - } - - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i 
src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 
7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - -#if CONFIG_CONVOLVE_ROUND - if (use_conv_params) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - const __m128i round_const = _mm_set1_epi32( - -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + - ((1 << (conv_params->round_1)) >> 1)); - res_lo = _mm_add_epi32(res_lo, round_const); - res_lo = - _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); - _mm_storeu_si128(p, res_lo); - if (p_width > 4) { - res_hi = _mm_add_epi32(res_hi, round_const); - res_hi = - _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) - res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); - _mm_storeu_si128(p + 1, res_hi); - } - } else { -#else - { -#endif - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. 
- if (p_width == 4) { - if (comp_avg) { - const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); - res_8bit = _mm_avg_epu8(res_8bit, orig); - } - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); - } else { - if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_8bit); - } - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c new file mode 100644 index 000000000..efc542cbf --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. 
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 
2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 
111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 
98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, 
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 
0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, +// 1, 3, 3, 5, ..., 13, 13, 15, +static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; +static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 13, 13, 15, 15, 0 }; + +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + const __m128i src_even = + _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); + const __m128i src_odd = + _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 + const __m128i 
tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = 
_mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. + const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } + + // Vertical filter + for (k = -4; k < 
AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], 
src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const); + + res_lo_16 = _mm_sra_epi16( + _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi); + __m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = 
_mm_madd_epi16(p_16_hi, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const); + + res_hi_16 = _mm_sra_epi16( + _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c deleted file mode 100644 index 3986ad389..000000000 --- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./av1_rtcd.h" -#include "av1/common/warped_motion.h" - -/* This is a modified version of 'warped_filter' from warped_motion.c: - * Each coefficient is stored in 8 bits instead of 16 bits - * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 - - This is done in order to avoid overflow: Since the tap with the largest - coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation - order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular - convolve functions. - - Instead, we use the summation order - ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). - The rearrangement of coefficients in this table is so that we can get the - coefficients into the correct order more quickly. 
-*/ -/* clang-format off */ -DECLARE_ALIGNED(8, static const int8_t, - filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { -#if WARPEDPIXEL_PREC_BITS == 6 - // [-1, 0) - { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, - { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, - { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, - { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, - { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, - { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, - { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, - { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, - { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, - { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, - { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, - { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, - { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, - { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, - { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, - { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, - { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, - { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, - { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, - { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, - { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, - { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, - { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, - { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, - { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, - { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, - { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 
2, 0}, - { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, - { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, - { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, - { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, - { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, - // [0, 1) - { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, - { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, - { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, - {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, - {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, - {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, - {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, - {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, - {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, - {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, - {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, - {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, - {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, - {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, - {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, - {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, - {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, - {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, - {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, - {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, - {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, - {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, - {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, - {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 
111, 7, 6, 41, -19, -2}, - {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, - {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, - {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, - {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, - {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, - {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, - { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, - { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, - // [1, 2) - { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, - { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, - { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, - { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, - { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, - { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, - { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, - { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, - { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, - { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, - { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, - { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, - { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, - { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, - { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, - { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, - { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, - { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, - { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, - { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, - { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 
98, 4}, - { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, - { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, - { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, - { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, - { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, - { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, - { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, - { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, - { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, - { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, - { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, - // dummy (replicate row index 191) - { 0, 0, 2, -1, 0, 0, 127, 0}, - -#else - // [-1, 0) - { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, - { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, - { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, - { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, - { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, - { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, - { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, - { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, - { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, - { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, - { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, - { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, - { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, - { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, - { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, - { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, - // [0, 1) - { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, 
- { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, - {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, - {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, - {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, - {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, - {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, - {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, - {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, - {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, - {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, - {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, - {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, - {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, - {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, - { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, - // [1, 2) - { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, - { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, - { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, - { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, - { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, - { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, - { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, - { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, - { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, - { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, - { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, - { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, - { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, - { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 
0, -7, 120, 2}, - { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, - { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, - // dummy (replicate row index 95) - { 0, 0, 4, -3, 0, -1, 127, 1}, -#endif // WARPEDPIXEL_PREC_BITS == 6 -}; -/* clang-format on */ - -// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 -// in an SSE register into two sequences: -// 0, 2, 2, 4, ..., 12, 12, 14, -// 1, 3, 3, 5, ..., 13, 13, 15, -static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, - 8, 10, 10, 12, 12, 14, 14, 0 }; -static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, - 9, 11, 11, 13, 13, 15, 15, 0 }; - -void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, - int height, int stride, uint8_t *pred, int p_col, - int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, - ConvolveParams *conv_params, int16_t alpha, - int16_t beta, int16_t gamma, int16_t delta) { - int comp_avg = conv_params->do_average; - __m128i tmp[15]; - int i, j, k; - const int bd = 8; -#if CONFIG_CONVOLVE_ROUND - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = - use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = - use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; - if (use_conv_params) { - conv_params->do_post_rounding = 1; - } - assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif - - /* Note: For this code to work, the left/right frame borders need to be - extended by at least 13 pixels each. By the time we get here, other - code will have set up this border, but we allow an explicit check - for debugging purposes. 
- */ - /*for (i = 0; i < height; ++i) { - for (j = 0; j < 13; ++j) { - assert(ref[i * stride - 13 + j] == ref[i * stride]); - assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); - } - }*/ - - for (i = 0; i < p_height; i += 8) { - for (j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; - const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; - const int32_t x4 = dst_x >> subsampling_x; - const int32_t y4 = dst_y >> subsampling_y; - - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - // Add in all the constant terms, including rounding and offset - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Horizontal filter - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. 
- if (ix4 <= -7) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else if (ix4 >= width + 6) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } - } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src_even = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); - const __m128i src_odd = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_1 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_2 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_3 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_4 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_5 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_6 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 6 * 
alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_7 = _mm_loadl_epi64(( - __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); - - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 - const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 - const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 - const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 - const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); - - // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 - const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); - // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); - // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 - const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); - // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); - - // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14); - // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14); - // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15); - // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - // The pixel order we need for 'src' is: - // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 - const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); - const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02); - // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 - const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), - _mm_srli_si128(src_odd, 4)); - const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46); - // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 - const 
__m128i src_13 = - _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); - const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13); - // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 - const __m128i src_57 = _mm_unpacklo_epi64( - _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6)); - const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57); - - const __m128i round_const = _mm_set1_epi16( - (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); - - // Note: The values res_02 + res_46 and res_13 + res_57 both - // fit into int16s at this point, but their sum may be too wide to fit - // into an int16. However, once we also add round_const, the sum of - // all of these fits into a uint16. - // - // The wrapping behaviour of _mm_add_* is used here to make sure we - // get the correct result despite converting between different - // (implicit) types. - const __m128i res_even = _mm_add_epi16(res_02, res_46); - const __m128i res_odd = _mm_add_epi16(res_13, res_57); - const __m128i res = - _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); - tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); - } - } - - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i 
*)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const 
__m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - -#if CONFIG_CONVOLVE_ROUND - if (use_conv_params) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - const __m128i round_const = _mm_set1_epi32( - -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + - ((1 << (conv_params->round_1)) >> 1)); - res_lo = _mm_add_epi32(res_lo, round_const); - res_lo = - _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); - _mm_storeu_si128(p, res_lo); - if (p_width > 4) { - res_hi = _mm_add_epi32(res_hi, round_const); - res_hi = - _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); - if (comp_avg) - res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); - _mm_storeu_si128(p + 1, res_hi); - } - } else { -#else - { -#endif - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - const __m128i res_16bit = 
_mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - if (comp_avg) { - const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); - res_8bit = _mm_avg_epu8(res_8bit, orig); - } - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); - } else { - if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_8bit); - } - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c new file mode 100644 index 000000000..e1449fd21 --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. 
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be +// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. +void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + /* Horizontal filter */ + { + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = 
_mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint8_t *data_ij = src_ptr + i * src_stride + j; + + // Load 8-bit src data + const __m128i data_0 = xx_loadu_128(data_ij + 0); + const __m128i data_1 = xx_loadu_128(data_ij + 1); + const __m128i data_2 = xx_loadu_128(data_ij + 2); + const __m128i data_3 = xx_loadu_128(data_ij + 3); + const __m128i data_4 = xx_loadu_128(data_ij + 4); + const __m128i data_5 = xx_loadu_128(data_ij + 5); + const __m128i data_6 = xx_loadu_128(data_ij + 6); + const __m128i data_7 = xx_loadu_128(data_ij + 7); + + // (Zero-)Extend 8-bit data to 16-bit data + const __m256i src_0 = _mm256_cvtepu8_epi16(data_0); + const __m256i src_1 = _mm256_cvtepu8_epi16(data_1); + const __m256i src_2 = _mm256_cvtepu8_epi16(data_2); + const __m256i src_3 = _mm256_cvtepu8_epi16(data_3); + const __m256i src_4 = _mm256_cvtepu8_epi16(data_4); + const __m256i src_5 = _mm256_cvtepu8_epi16(data_5); + const __m256i src_6 = _mm256_cvtepu8_epi16(data_6); + const __m256i src_7 = _mm256_cvtepu8_epi16(data_7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = 
_mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. 
The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 
16) { + const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const 
__m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + + // Reduce to 8-bit precision. This messes up the order: + // [ - - - - - - - - 15 14 13 12 11 10 9 8 ] + // [ - - - - - - - - 7 6 5 4 3 2 1 0 ] + const __m256i res_8bit = + _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */); + + // Swap the two central 32-bit values to get the order: + // [ - - - - - - - - - - - - - - - - ] + // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ] + const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8); + + // Store the lower 128-bit lane in the dst array + xx_storeu_128(dst + i * dst_stride + j, + _mm256_castsi256_si128(res_8bit2)); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c new file mode 100644 index 000000000..3083d224b --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 
= _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16( + _mm_max_epi16(res, zero), + _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + 
_mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); + } + } + } +} diff --git a/third_party/aom/av1/common/zigzag.h b/third_party/aom/av1/common/zigzag.h deleted file mode 100644 index c58b18b57..000000000 --- a/third_party/aom/av1/common/zigzag.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_zigzag_H) -# define _zigzag_H (1) - -extern const unsigned char OD_ZIGZAG4_DCT_DCT[15][2]; -extern const unsigned char OD_ZIGZAG4_ADST_DCT[15][2]; -extern const unsigned char OD_ZIGZAG4_DCT_ADST[15][2]; -#define OD_ZIGZAG4_ADST_ADST OD_ZIGZAG4_DCT_DCT - -extern const unsigned char OD_ZIGZAG8_DCT_DCT[48][2]; -extern const unsigned char OD_ZIGZAG8_ADST_DCT[48][2]; -extern const unsigned char OD_ZIGZAG8_DCT_ADST[48][2]; -#define OD_ZIGZAG8_ADST_ADST OD_ZIGZAG8_DCT_DCT - -extern const unsigned char OD_ZIGZAG16_DCT_DCT[192][2]; -extern const unsigned char OD_ZIGZAG16_ADST_DCT[192][2]; -extern const unsigned char OD_ZIGZAG16_DCT_ADST[192][2]; -#define OD_ZIGZAG16_ADST_ADST OD_ZIGZAG16_DCT_DCT - -extern const unsigned char OD_ZIGZAG32_DCT_DCT[768][2]; -#endif diff --git a/third_party/aom/av1/common/zigzag16.c b/third_party/aom/av1/common/zigzag16.c deleted file mode 100644 index 6df6e3855..000000000 --- a/third_party/aom/av1/common/zigzag16.c +++ /dev/null @@ -1,157 +0,0 @@ -/* This file is generated by gen_zigzag16.m */ - -/* clang-format off */ - -#include "odintrin.h" -OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_DCT[192][2] = { - {8, 0}, {8, 1}, {8, 2}, {9, 0}, - {8, 3}, {9, 1}, {9, 2}, {10, 0}, - {9, 3}, {10, 1}, {10, 2}, {11, 0}, - {10, 3}, {11, 1}, {11, 2}, {11, 3}, - {12, 0}, {12, 1}, {13, 0}, {12, 2}, - {12, 3}, {13, 1}, {13, 2}, {14, 0}, - {13, 3}, {14, 1}, {15, 0}, {14, 2}, - {14, 3}, {15, 1}, {15, 2}, {15, 3}, - {0, 8}, {1, 8}, {0, 9}, {2, 8}, - {1, 9}, {3, 8}, {0, 10}, {2, 9}, - {1, 10}, {3, 9}, {0, 11}, {2, 10}, - {1, 11}, {3, 10}, {0, 12}, {2, 11}, - {1, 12}, {3, 11}, {0, 13}, {2, 12}, - {1, 13}, {0, 14}, {3, 12}, {2, 13}, - {1, 14}, {3, 13}, {0, 15}, {2, 14}, - {1, 15}, {3, 14}, {2, 15}, {3, 15}, - {4, 8}, {5, 8}, {4, 9}, {8, 4}, - {8, 5}, {6, 8}, {5, 9}, {4, 10}, - {9, 4}, {8, 6}, {7, 8}, {9, 5}, - {5, 10}, {8, 7}, {6, 9}, {4, 11}, - {10, 4}, {9, 6}, {7, 9}, {8, 8}, - {10, 5}, {6, 10}, {5, 11}, {9, 7}, - 
{8, 9}, {10, 6}, {7, 10}, {4, 12}, - {11, 4}, {9, 8}, {6, 11}, {10, 7}, - {11, 5}, {5, 12}, {8, 10}, {7, 11}, - {9, 9}, {4, 13}, {10, 8}, {11, 6}, - {11, 7}, {6, 12}, {8, 11}, {9, 10}, - {12, 4}, {5, 13}, {10, 9}, {12, 5}, - {7, 12}, {11, 8}, {4, 14}, {6, 13}, - {10, 10}, {9, 11}, {12, 6}, {13, 4}, - {11, 9}, {8, 12}, {5, 14}, {12, 7}, - {7, 13}, {4, 15}, {13, 5}, {10, 11}, - {11, 10}, {9, 12}, {13, 6}, {12, 8}, - {6, 14}, {8, 13}, {5, 15}, {13, 7}, - {14, 4}, {12, 9}, {7, 14}, {11, 11}, - {10, 12}, {9, 13}, {14, 5}, {6, 15}, - {13, 8}, {8, 14}, {12, 10}, {14, 6}, - {7, 15}, {13, 9}, {15, 4}, {10, 13}, - {11, 12}, {14, 7}, {9, 14}, {12, 11}, - {8, 15}, {15, 5}, {13, 10}, {14, 8}, - {11, 13}, {15, 6}, {9, 15}, {10, 14}, - {14, 9}, {15, 7}, {13, 11}, {12, 12}, - {10, 15}, {11, 14}, {15, 8}, {14, 10}, - {12, 13}, {13, 12}, {15, 9}, {11, 15}, - {14, 11}, {13, 13}, {15, 10}, {12, 14}, - {13, 14}, {15, 11}, {14, 12}, {12, 15}, - {14, 13}, {13, 15}, {15, 12}, {14, 14}, - {15, 13}, {14, 15}, {15, 14}, {15, 15} - }; - -OD_EXTERN const unsigned char OD_ZIGZAG16_ADST_DCT[192][2] = { - {8, 0}, {9, 0}, {10, 0}, {8, 1}, - {11, 0}, {9, 1}, {8, 2}, {12, 0}, - {10, 1}, {9, 2}, {8, 3}, {13, 0}, - {11, 1}, {10, 2}, {9, 3}, {14, 0}, - {12, 1}, {10, 3}, {15, 0}, {11, 2}, - {13, 1}, {11, 3}, {12, 2}, {14, 1}, - {12, 3}, {13, 2}, {15, 1}, {13, 3}, - {14, 2}, {14, 3}, {15, 2}, {15, 3}, - {0, 8}, {1, 8}, {2, 8}, {0, 9}, - {3, 8}, {1, 9}, {2, 9}, {0, 10}, - {3, 9}, {1, 10}, {2, 10}, {0, 11}, - {3, 10}, {1, 11}, {2, 11}, {0, 12}, - {3, 11}, {1, 12}, {2, 12}, {0, 13}, - {3, 12}, {1, 13}, {0, 14}, {2, 13}, - {0, 15}, {1, 14}, {3, 13}, {2, 14}, - {1, 15}, {3, 14}, {2, 15}, {3, 15}, - {8, 4}, {9, 4}, {8, 5}, {4, 8}, - {10, 4}, {9, 5}, {5, 8}, {8, 6}, - {4, 9}, {10, 5}, {9, 6}, {6, 8}, - {8, 7}, {11, 4}, {7, 8}, {5, 9}, - {9, 7}, {11, 5}, {10, 6}, {4, 10}, - {6, 9}, {8, 8}, {5, 10}, {7, 9}, - {12, 4}, {10, 7}, {9, 8}, {11, 6}, - {8, 9}, {4, 11}, {6, 10}, {7, 10}, - {12, 5}, {5, 11}, {10, 8}, {11, 
7}, - {9, 9}, {4, 12}, {13, 4}, {8, 10}, - {6, 11}, {12, 6}, {5, 12}, {10, 9}, - {7, 11}, {9, 10}, {11, 8}, {13, 5}, - {8, 11}, {4, 13}, {6, 12}, {10, 10}, - {12, 7}, {11, 9}, {7, 12}, {14, 4}, - {5, 13}, {9, 11}, {13, 6}, {8, 12}, - {4, 14}, {12, 8}, {6, 13}, {11, 10}, - {10, 11}, {12, 9}, {5, 14}, {13, 7}, - {14, 5}, {9, 12}, {4, 15}, {7, 13}, - {8, 13}, {6, 14}, {13, 8}, {11, 11}, - {10, 12}, {15, 4}, {12, 10}, {14, 6}, - {13, 9}, {5, 15}, {9, 13}, {7, 14}, - {15, 5}, {6, 15}, {8, 14}, {14, 7}, - {11, 12}, {7, 15}, {9, 14}, {13, 10}, - {10, 13}, {14, 8}, {15, 6}, {14, 9}, - {12, 11}, {8, 15}, {15, 7}, {10, 14}, - {11, 13}, {9, 15}, {13, 11}, {12, 12}, - {15, 8}, {14, 10}, {15, 9}, {10, 15}, - {11, 14}, {13, 12}, {12, 13}, {15, 10}, - {14, 11}, {11, 15}, {13, 13}, {15, 11}, - {14, 12}, {12, 14}, {15, 12}, {13, 14}, - {12, 15}, {14, 13}, {13, 15}, {15, 13}, - {14, 14}, {15, 14}, {14, 15}, {15, 15} - }; - -OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_ADST[192][2] = { - {8, 0}, {8, 1}, {8, 2}, {8, 3}, - {9, 0}, {9, 1}, {9, 2}, {9, 3}, - {10, 0}, {10, 1}, {10, 2}, {10, 3}, - {11, 0}, {11, 1}, {11, 2}, {11, 3}, - {12, 0}, {12, 1}, {12, 2}, {12, 3}, - {13, 0}, {13, 1}, {13, 2}, {13, 3}, - {14, 0}, {15, 0}, {14, 1}, {14, 2}, - {14, 3}, {15, 1}, {15, 2}, {15, 3}, - {0, 8}, {0, 9}, {0, 10}, {1, 8}, - {0, 11}, {1, 9}, {2, 8}, {0, 12}, - {1, 10}, {2, 9}, {0, 13}, {1, 11}, - {3, 8}, {2, 10}, {0, 14}, {1, 12}, - {3, 9}, {0, 15}, {2, 11}, {3, 10}, - {1, 13}, {2, 12}, {3, 11}, {1, 14}, - {2, 13}, {1, 15}, {3, 12}, {2, 14}, - {3, 13}, {2, 15}, {3, 14}, {3, 15}, - {4, 8}, {4, 9}, {5, 8}, {4, 10}, - {5, 9}, {4, 11}, {6, 8}, {5, 10}, - {8, 4}, {6, 9}, {4, 12}, {5, 11}, - {8, 5}, {6, 10}, {7, 8}, {8, 6}, - {4, 13}, {7, 9}, {5, 12}, {8, 7}, - {9, 4}, {6, 11}, {8, 8}, {7, 10}, - {5, 13}, {9, 5}, {4, 14}, {9, 6}, - {8, 9}, {6, 12}, {9, 7}, {7, 11}, - {4, 15}, {8, 10}, {9, 8}, {5, 14}, - {10, 4}, {6, 13}, {10, 5}, {9, 9}, - {7, 12}, {8, 11}, {10, 6}, {5, 15}, - {10, 7}, {6, 14}, {9, 
10}, {7, 13}, - {8, 12}, {10, 8}, {9, 11}, {6, 15}, - {11, 4}, {11, 5}, {10, 9}, {8, 13}, - {7, 14}, {11, 6}, {9, 12}, {11, 7}, - {10, 10}, {7, 15}, {8, 14}, {12, 4}, - {11, 8}, {12, 5}, {9, 13}, {10, 11}, - {8, 15}, {11, 9}, {12, 6}, {12, 7}, - {10, 12}, {9, 14}, {11, 10}, {13, 4}, - {12, 8}, {9, 15}, {13, 5}, {11, 11}, - {12, 9}, {10, 13}, {13, 6}, {13, 7}, - {12, 10}, {14, 4}, {11, 12}, {13, 8}, - {10, 14}, {14, 5}, {12, 11}, {13, 9}, - {14, 6}, {10, 15}, {11, 13}, {15, 4}, - {14, 7}, {12, 12}, {13, 10}, {14, 8}, - {15, 5}, {13, 11}, {15, 6}, {11, 14}, - {14, 9}, {12, 13}, {11, 15}, {15, 7}, - {14, 10}, {15, 8}, {13, 12}, {12, 14}, - {15, 9}, {14, 11}, {13, 13}, {12, 15}, - {15, 10}, {14, 12}, {13, 14}, {15, 11}, - {13, 15}, {14, 13}, {14, 14}, {15, 12}, - {14, 15}, {15, 13}, {15, 14}, {15, 15} - }; diff --git a/third_party/aom/av1/common/zigzag32.c b/third_party/aom/av1/common/zigzag32.c deleted file mode 100644 index cb3b9bc63..000000000 --- a/third_party/aom/av1/common/zigzag32.c +++ /dev/null @@ -1,199 +0,0 @@ -/* This file is generated by gen_zigzag32.m */ - -/* clang-format off */ - -#include "odintrin.h" -OD_EXTERN const unsigned char OD_ZIGZAG32_DCT_DCT[768][2] = { - { 16, 0 }, { 17, 0 }, { 18, 0 }, { 19, 0 }, - { 16, 1 }, { 17, 1 }, { 20, 0 }, { 16, 2 }, - { 18, 1 }, { 21, 0 }, { 17, 2 }, { 16, 3 }, - { 19, 1 }, { 22, 0 }, { 18, 2 }, { 17, 3 }, - { 20, 1 }, { 16, 4 }, { 23, 0 }, { 19, 2 }, - { 24, 0 }, { 16, 5 }, { 21, 1 }, { 17, 4 }, - { 18, 3 }, { 20, 2 }, { 17, 5 }, { 16, 6 }, - { 19, 3 }, { 18, 4 }, { 25, 0 }, { 22, 1 }, - { 16, 7 }, { 21, 2 }, { 17, 6 }, { 20, 3 }, - { 26, 0 }, { 18, 5 }, { 19, 4 }, { 17, 7 }, - { 23, 1 }, { 22, 2 }, { 18, 6 }, { 27, 0 }, - { 19, 5 }, { 24, 1 }, { 21, 3 }, { 28, 0 }, - { 20, 4 }, { 18, 7 }, { 19, 6 }, { 23, 2 }, - { 29, 0 }, { 25, 1 }, { 21, 4 }, { 30, 0 }, - { 20, 5 }, { 22, 3 }, { 31, 0 }, { 19, 7 }, - { 24, 2 }, { 26, 1 }, { 20, 6 }, { 21, 5 }, - { 22, 4 }, { 23, 3 }, { 27, 1 }, { 25, 2 }, - { 20, 7 }, { 28, 1 
}, { 24, 3 }, { 21, 6 }, - { 22, 5 }, { 23, 4 }, { 26, 2 }, { 21, 7 }, - { 29, 1 }, { 25, 3 }, { 30, 1 }, { 27, 2 }, - { 22, 6 }, { 23, 5 }, { 31, 1 }, { 24, 4 }, - { 26, 3 }, { 28, 2 }, { 22, 7 }, { 23, 6 }, - { 25, 4 }, { 24, 5 }, { 29, 2 }, { 30, 2 }, - { 27, 3 }, { 23, 7 }, { 31, 2 }, { 24, 6 }, - { 26, 4 }, { 25, 5 }, { 28, 3 }, { 24, 7 }, - { 27, 4 }, { 29, 3 }, { 25, 6 }, { 26, 5 }, - { 30, 3 }, { 31, 3 }, { 28, 4 }, { 27, 5 }, - { 25, 7 }, { 29, 4 }, { 26, 6 }, { 28, 5 }, - { 30, 4 }, { 26, 7 }, { 27, 6 }, { 31, 4 }, - { 29, 5 }, { 27, 7 }, { 30, 5 }, { 28, 6 }, - { 31, 5 }, { 29, 6 }, { 28, 7 }, { 30, 6 }, - { 31, 6 }, { 29, 7 }, { 30, 7 }, { 31, 7 }, - { 0, 16 }, { 0, 17 }, { 1, 16 }, { 0, 18 }, - { 1, 17 }, { 0, 19 }, { 2, 16 }, { 1, 18 }, - { 0, 20 }, { 2, 17 }, { 3, 16 }, { 1, 19 }, - { 2, 18 }, { 0, 21 }, { 3, 17 }, { 4, 16 }, - { 1, 20 }, { 2, 19 }, { 0, 22 }, { 3, 18 }, - { 4, 17 }, { 5, 16 }, { 0, 23 }, { 3, 19 }, - { 2, 20 }, { 1, 21 }, { 4, 18 }, { 6, 16 }, - { 5, 17 }, { 3, 20 }, { 2, 21 }, { 1, 22 }, - { 0, 24 }, { 0, 25 }, { 4, 19 }, { 7, 16 }, - { 6, 17 }, { 5, 18 }, { 0, 26 }, { 3, 21 }, - { 2, 22 }, { 1, 23 }, { 4, 20 }, { 5, 19 }, - { 6, 18 }, { 1, 24 }, { 7, 17 }, { 0, 27 }, - { 2, 23 }, { 3, 22 }, { 4, 21 }, { 1, 25 }, - { 5, 20 }, { 7, 18 }, { 0, 28 }, { 6, 19 }, - { 2, 24 }, { 1, 26 }, { 0, 29 }, { 4, 22 }, - { 3, 23 }, { 2, 25 }, { 5, 21 }, { 0, 31 }, - { 7, 19 }, { 6, 20 }, { 0, 30 }, { 1, 27 }, - { 3, 24 }, { 2, 26 }, { 4, 23 }, { 5, 22 }, - { 7, 20 }, { 1, 28 }, { 6, 21 }, { 3, 25 }, - { 2, 27 }, { 1, 29 }, { 4, 24 }, { 2, 28 }, - { 1, 30 }, { 7, 21 }, { 5, 23 }, { 3, 26 }, - { 6, 22 }, { 1, 31 }, { 4, 25 }, { 7, 22 }, - { 3, 27 }, { 2, 29 }, { 2, 30 }, { 5, 24 }, - { 2, 31 }, { 6, 23 }, { 4, 26 }, { 3, 28 }, - { 5, 25 }, { 3, 29 }, { 6, 24 }, { 7, 23 }, - { 3, 30 }, { 4, 27 }, { 3, 31 }, { 5, 26 }, - { 6, 25 }, { 4, 28 }, { 7, 24 }, { 4, 29 }, - { 5, 27 }, { 4, 30 }, { 4, 31 }, { 6, 26 }, - { 5, 28 }, { 7, 25 }, { 6, 27 }, { 5, 29 
}, - { 7, 26 }, { 5, 30 }, { 5, 31 }, { 6, 28 }, - { 7, 27 }, { 6, 29 }, { 6, 30 }, { 7, 28 }, - { 6, 31 }, { 7, 29 }, { 7, 30 }, { 7, 31 }, - { 8, 16 }, { 9, 16 }, { 8, 17 }, { 10, 16 }, - { 9, 17 }, { 16, 8 }, { 8, 18 }, { 16, 9 }, - { 10, 17 }, { 11, 16 }, { 17, 8 }, { 9, 18 }, - { 8, 19 }, { 16, 10 }, { 11, 17 }, { 12, 16 }, - { 10, 18 }, { 17, 9 }, { 9, 19 }, { 16, 11 }, - { 8, 20 }, { 18, 8 }, { 17, 10 }, { 10, 19 }, - { 12, 17 }, { 11, 18 }, { 9, 20 }, { 16, 12 }, - { 18, 9 }, { 8, 21 }, { 13, 16 }, { 17, 11 }, - { 19, 8 }, { 18, 10 }, { 13, 17 }, { 16, 13 }, - { 11, 19 }, { 12, 18 }, { 10, 20 }, { 17, 12 }, - { 9, 21 }, { 19, 9 }, { 8, 22 }, { 14, 16 }, - { 18, 11 }, { 11, 20 }, { 10, 21 }, { 20, 8 }, - { 13, 18 }, { 16, 14 }, { 12, 19 }, { 17, 13 }, - { 19, 10 }, { 14, 17 }, { 9, 22 }, { 18, 12 }, - { 8, 23 }, { 17, 14 }, { 20, 9 }, { 15, 16 }, - { 16, 15 }, { 13, 19 }, { 10, 22 }, { 19, 11 }, - { 11, 21 }, { 14, 18 }, { 12, 20 }, { 18, 13 }, - { 20, 10 }, { 21, 8 }, { 15, 17 }, { 9, 23 }, - { 19, 12 }, { 11, 22 }, { 8, 24 }, { 21, 9 }, - { 17, 15 }, { 16, 16 }, { 14, 19 }, { 18, 14 }, - { 12, 21 }, { 13, 20 }, { 20, 11 }, { 10, 23 }, - { 19, 13 }, { 15, 18 }, { 16, 17 }, { 21, 10 }, - { 22, 8 }, { 9, 24 }, { 8, 25 }, { 20, 12 }, - { 15, 19 }, { 11, 23 }, { 17, 16 }, { 18, 15 }, - { 14, 20 }, { 12, 22 }, { 10, 24 }, { 22, 9 }, - { 21, 11 }, { 19, 14 }, { 13, 21 }, { 16, 18 }, - { 9, 25 }, { 17, 17 }, { 8, 26 }, { 20, 13 }, - { 23, 8 }, { 12, 23 }, { 13, 22 }, { 22, 10 }, - { 19, 15 }, { 15, 20 }, { 16, 19 }, { 21, 12 }, - { 11, 24 }, { 14, 21 }, { 8, 27 }, { 18, 16 }, - { 10, 25 }, { 9, 26 }, { 22, 11 }, { 20, 14 }, - { 23, 9 }, { 18, 17 }, { 17, 18 }, { 17, 19 }, - { 19, 16 }, { 21, 13 }, { 10, 26 }, { 12, 24 }, - { 23, 10 }, { 24, 8 }, { 8, 28 }, { 16, 20 }, - { 9, 27 }, { 15, 21 }, { 22, 12 }, { 14, 22 }, - { 13, 23 }, { 20, 15 }, { 11, 25 }, { 24, 9 }, - { 18, 18 }, { 19, 17 }, { 23, 11 }, { 10, 27 }, - { 8, 29 }, { 12, 25 }, { 9, 28 }, { 8, 30 }, - { 
21, 14 }, { 13, 24 }, { 11, 26 }, { 25, 8 }, - { 24, 10 }, { 20, 16 }, { 19, 18 }, { 14, 23 }, - { 22, 13 }, { 8, 31 }, { 17, 20 }, { 9, 29 }, - { 23, 12 }, { 15, 22 }, { 25, 9 }, { 11, 27 }, - { 10, 28 }, { 20, 17 }, { 21, 15 }, { 18, 19 }, - { 16, 21 }, { 24, 11 }, { 9, 30 }, { 12, 26 }, - { 10, 29 }, { 22, 14 }, { 14, 24 }, { 9, 31 }, - { 26, 8 }, { 13, 25 }, { 25, 10 }, { 18, 20 }, - { 19, 19 }, { 11, 28 }, { 15, 23 }, { 20, 18 }, - { 10, 30 }, { 12, 27 }, { 17, 21 }, { 23, 13 }, - { 24, 12 }, { 21, 16 }, { 16, 22 }, { 26, 9 }, - { 27, 8 }, { 13, 26 }, { 22, 15 }, { 10, 31 }, - { 14, 25 }, { 12, 28 }, { 25, 11 }, { 21, 17 }, - { 26, 10 }, { 20, 19 }, { 11, 29 }, { 15, 24 }, - { 23, 14 }, { 27, 9 }, { 11, 30 }, { 13, 27 }, - { 19, 20 }, { 24, 13 }, { 28, 8 }, { 11, 31 }, - { 22, 16 }, { 17, 22 }, { 16, 23 }, { 25, 12 }, - { 18, 21 }, { 12, 29 }, { 21, 18 }, { 28, 9 }, - { 27, 10 }, { 26, 11 }, { 29, 8 }, { 14, 26 }, - { 15, 25 }, { 13, 28 }, { 12, 30 }, { 23, 15 }, - { 30, 8 }, { 16, 24 }, { 13, 29 }, { 25, 13 }, - { 24, 14 }, { 20, 20 }, { 31, 8 }, { 12, 31 }, - { 14, 27 }, { 28, 10 }, { 26, 12 }, { 22, 17 }, - { 21, 19 }, { 17, 23 }, { 18, 22 }, { 29, 9 }, - { 27, 11 }, { 19, 21 }, { 27, 12 }, { 30, 9 }, - { 31, 9 }, { 13, 30 }, { 24, 15 }, { 23, 16 }, - { 15, 26 }, { 14, 28 }, { 29, 10 }, { 28, 11 }, - { 26, 13 }, { 17, 24 }, { 13, 31 }, { 25, 14 }, - { 22, 18 }, { 16, 25 }, { 30, 10 }, { 14, 29 }, - { 15, 27 }, { 19, 22 }, { 21, 20 }, { 20, 21 }, - { 27, 13 }, { 29, 11 }, { 18, 23 }, { 23, 17 }, - { 16, 26 }, { 31, 10 }, { 24, 16 }, { 14, 30 }, - { 22, 19 }, { 14, 31 }, { 28, 12 }, { 26, 14 }, - { 30, 11 }, { 15, 28 }, { 25, 15 }, { 17, 25 }, - { 23, 18 }, { 18, 24 }, { 15, 30 }, { 29, 12 }, - { 31, 11 }, { 16, 27 }, { 24, 17 }, { 28, 13 }, - { 19, 23 }, { 15, 29 }, { 25, 16 }, { 17, 26 }, - { 27, 14 }, { 22, 20 }, { 15, 31 }, { 20, 22 }, - { 21, 21 }, { 16, 28 }, { 17, 27 }, { 30, 12 }, - { 26, 15 }, { 19, 24 }, { 18, 25 }, { 23, 19 }, - { 29, 13 }, { 31, 
12 }, { 24, 18 }, { 26, 16 }, - { 25, 17 }, { 16, 29 }, { 28, 14 }, { 20, 23 }, - { 18, 26 }, { 21, 22 }, { 19, 25 }, { 22, 21 }, - { 27, 15 }, { 17, 28 }, { 16, 30 }, { 26, 17 }, - { 23, 20 }, { 16, 31 }, { 25, 18 }, { 27, 16 }, - { 20, 24 }, { 24, 19 }, { 31, 13 }, { 30, 13 }, - { 29, 14 }, { 18, 27 }, { 28, 15 }, { 17, 29 }, - { 19, 26 }, { 17, 30 }, { 21, 23 }, { 22, 22 }, - { 30, 14 }, { 20, 25 }, { 23, 21 }, { 17, 31 }, - { 18, 28 }, { 25, 19 }, { 24, 20 }, { 28, 16 }, - { 31, 14 }, { 26, 18 }, { 19, 27 }, { 29, 15 }, - { 27, 17 }, { 30, 15 }, { 21, 24 }, { 22, 23 }, - { 26, 19 }, { 23, 22 }, { 28, 17 }, { 29, 16 }, - { 18, 30 }, { 24, 21 }, { 25, 20 }, { 18, 31 }, - { 18, 29 }, { 20, 26 }, { 19, 28 }, { 27, 18 }, - { 31, 15 }, { 20, 27 }, { 30, 16 }, { 19, 29 }, - { 29, 17 }, { 31, 16 }, { 27, 19 }, { 21, 25 }, - { 28, 18 }, { 26, 20 }, { 22, 24 }, { 25, 21 }, - { 19, 30 }, { 24, 22 }, { 30, 17 }, { 21, 26 }, - { 23, 23 }, { 19, 31 }, { 20, 28 }, { 31, 17 }, - { 28, 19 }, { 27, 20 }, { 21, 27 }, { 29, 18 }, - { 30, 18 }, { 25, 22 }, { 26, 21 }, { 20, 29 }, - { 22, 25 }, { 24, 23 }, { 29, 19 }, { 23, 24 }, - { 20, 31 }, { 20, 30 }, { 28, 20 }, { 21, 28 }, - { 22, 26 }, { 31, 18 }, { 27, 21 }, { 30, 19 }, - { 22, 27 }, { 29, 20 }, { 23, 25 }, { 24, 24 }, - { 26, 22 }, { 21, 29 }, { 25, 23 }, { 31, 19 }, - { 21, 30 }, { 23, 26 }, { 28, 21 }, { 21, 31 }, - { 22, 28 }, { 30, 20 }, { 25, 24 }, { 27, 22 }, - { 29, 21 }, { 26, 23 }, { 24, 25 }, { 31, 20 }, - { 23, 27 }, { 22, 29 }, { 30, 21 }, { 28, 22 }, - { 24, 26 }, { 25, 25 }, { 27, 23 }, { 22, 30 }, - { 23, 28 }, { 22, 31 }, { 26, 24 }, { 31, 21 }, - { 24, 27 }, { 29, 22 }, { 27, 24 }, { 30, 22 }, - { 25, 26 }, { 28, 23 }, { 23, 30 }, { 23, 29 }, - { 24, 28 }, { 25, 27 }, { 31, 22 }, { 23, 31 }, - { 26, 25 }, { 28, 24 }, { 29, 23 }, { 24, 29 }, - { 24, 30 }, { 27, 25 }, { 25, 28 }, { 26, 26 }, - { 30, 23 }, { 26, 27 }, { 31, 23 }, { 28, 25 }, - { 27, 26 }, { 25, 29 }, { 24, 31 }, { 29, 24 }, - { 30, 24 }, { 27, 
27 }, { 29, 25 }, { 26, 28 }, - { 31, 24 }, { 25, 30 }, { 25, 31 }, { 28, 26 }, - { 27, 28 }, { 26, 29 }, { 30, 25 }, { 29, 26 }, - { 28, 27 }, { 26, 30 }, { 31, 25 }, { 27, 29 }, - { 26, 31 }, { 30, 26 }, { 28, 28 }, { 31, 26 }, - { 29, 27 }, { 27, 30 }, { 28, 29 }, { 27, 31 }, - { 30, 27 }, { 31, 27 }, { 28, 30 }, { 29, 28 }, - { 30, 28 }, { 29, 29 }, { 30, 29 }, { 31, 28 }, - { 28, 31 }, { 29, 30 }, { 29, 31 }, { 31, 29 }, - { 30, 30 }, { 30, 31 }, { 31, 30 }, { 31, 31 } -}; diff --git a/third_party/aom/av1/common/zigzag4.c b/third_party/aom/av1/common/zigzag4.c deleted file mode 100644 index 1fb5a320b..000000000 --- a/third_party/aom/av1/common/zigzag4.c +++ /dev/null @@ -1,22 +0,0 @@ -/* This file is generated by gen_zigzag4.m */ - -/* clang-format off */ - -#include "odintrin.h" -OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_DCT[15][2] = { - {0, 1}, {1, 0}, {1, 1}, {0, 2}, - {2, 0}, {0, 3}, {1, 2}, {3, 0}, - {2, 1}, {1, 3}, {2, 2}, {3, 1}, - {2, 3}, {3, 2}, {3, 3} }; - -OD_EXTERN const unsigned char OD_ZIGZAG4_ADST_DCT[15][2] = { - {1, 0}, {0, 1}, {2, 0}, {1, 1}, - {3, 0}, {2, 1}, {0, 2}, {1, 2}, - {3, 1}, {0, 3}, {2, 2}, {1, 3}, - {3, 2}, {2, 3}, {3, 3} }; - -OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_ADST[15][2] = { - {0, 1}, {0, 2}, {1, 0}, {0, 3}, - {1, 1}, {1, 2}, {2, 0}, {1, 3}, - {2, 1}, {2, 2}, {3, 0}, {3, 1}, - {2, 3}, {3, 2}, {3, 3} }; diff --git a/third_party/aom/av1/common/zigzag8.c b/third_party/aom/av1/common/zigzag8.c deleted file mode 100644 index 3f11e0c03..000000000 --- a/third_party/aom/av1/common/zigzag8.c +++ /dev/null @@ -1,50 +0,0 @@ -/* This file is generated by gen_zigzag8.m */ - -/* clang-format off */ - -#include "odintrin.h" - -OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_DCT[48][2] = { - {4, 0}, {4, 1}, {5, 0}, {5, 1}, - {6, 0}, {7, 0}, {6, 1}, {7, 1}, - {0, 4}, {1, 4}, {0, 5}, {1, 5}, - {0, 6}, {1, 6}, {0, 7}, {1, 7}, - {2, 4}, {4, 2}, {3, 4}, {2, 5}, - {4, 3}, {5, 2}, {4, 4}, {3, 5}, - {5, 3}, {2, 6}, {4, 5}, {6, 2}, - {5, 4}, 
{3, 6}, {2, 7}, {6, 3}, - {5, 5}, {7, 2}, {4, 6}, {3, 7}, - {6, 4}, {7, 3}, {4, 7}, {5, 6}, - {6, 5}, {7, 4}, {5, 7}, {6, 6}, - {7, 5}, {6, 7}, {7, 6}, {7, 7} - }; - -OD_EXTERN const unsigned char OD_ZIGZAG8_ADST_DCT[48][2] = { - {4, 0}, {5, 0}, {4, 1}, {6, 0}, - {5, 1}, {7, 0}, {6, 1}, {7, 1}, - {0, 4}, {1, 4}, {0, 5}, {1, 5}, - {0, 6}, {1, 6}, {0, 7}, {1, 7}, - {4, 2}, {2, 4}, {5, 2}, {4, 3}, - {3, 4}, {2, 5}, {5, 3}, {4, 4}, - {6, 2}, {3, 5}, {5, 4}, {2, 6}, - {4, 5}, {6, 3}, {7, 2}, {3, 6}, - {2, 7}, {5, 5}, {6, 4}, {4, 6}, - {7, 3}, {3, 7}, {5, 6}, {6, 5}, - {4, 7}, {7, 4}, {5, 7}, {7, 5}, - {6, 6}, {7, 6}, {6, 7}, {7, 7} - }; - -OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_ADST[48][2] = { - {4, 0}, {4, 1}, {5, 0}, {5, 1}, - {6, 0}, {6, 1}, {7, 0}, {7, 1}, - {0, 4}, {0, 5}, {1, 4}, {0, 6}, - {1, 5}, {0, 7}, {1, 6}, {1, 7}, - {2, 4}, {2, 5}, {3, 4}, {4, 2}, - {2, 6}, {4, 3}, {3, 5}, {4, 4}, - {2, 7}, {3, 6}, {5, 2}, {4, 5}, - {5, 3}, {3, 7}, {5, 4}, {4, 6}, - {6, 2}, {5, 5}, {4, 7}, {6, 3}, - {6, 4}, {5, 6}, {7, 2}, {6, 5}, - {7, 3}, {5, 7}, {7, 4}, {6, 6}, - {7, 5}, {6, 7}, {7, 6}, {7, 7} - }; diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c index ba243c9e1..8d8f3dfdb 100644 --- a/third_party/aom/av1/decoder/accounting.c +++ b/third_party/aom/av1/decoder/accounting.c @@ -15,7 +15,7 @@ #include #include "aom/aom_integer.h" -#include "./accounting.h" +#include "av1/decoder/accounting.h" static int aom_accounting_hash(const char *str) { uint32_t val; @@ -31,7 +31,7 @@ static int aom_accounting_hash(const char *str) { /* Dictionary lookup based on an open-addressing hash table. 
*/ int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) { int hash; - int len; + size_t len; AccountingDictionary *dictionary; dictionary = &accounting->syms.dictionary; hash = aom_accounting_hash(str); diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h index 889865b2e..9099d081b 100644 --- a/third_party/aom/av1/decoder/accounting.h +++ b/third_party/aom/av1/decoder/accounting.h @@ -11,6 +11,7 @@ #ifndef AOM_ACCOUNTING_H_ #define AOM_ACCOUNTING_H_ #include +#include "aom/aomdx.h" #ifdef __cplusplus extern "C" { @@ -58,8 +59,6 @@ typedef struct { AccountingDictionary dictionary; } AccountingSymbols; -typedef struct Accounting Accounting; - struct Accounting { AccountingSymbols syms; /** Size allocated for symbols (not all may be used). */ diff --git a/third_party/aom/av1/decoder/decint.h b/third_party/aom/av1/decoder/decint.h deleted file mode 100644 index e887ad5e0..000000000 --- a/third_party/aom/av1/decoder/decint.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#if !defined(_decint_H) -# define _decint_H (1) -# include "av1/common/pvq_state.h" -# include "aom_dsp/bitreader.h" -# include "aom_dsp/entdec.h" - -typedef struct daala_dec_ctx daala_dec_ctx; - -typedef struct daala_dec_ctx od_dec_ctx; - - -struct daala_dec_ctx { - /* Stores context-adaptive CDFs for PVQ. */ - od_state state; - /* AOM entropy decoder. 
*/ - aom_reader *r; - int use_activity_masking; - /* Mode of quantization matrice : FLAT (0) or HVS (1) */ - int qm; -}; - -#endif diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c index 9ec3b60eb..e92c6b28c 100644 --- a/third_party/aom/av1/decoder/decodeframe.c +++ b/third_party/aom/av1/decoder/decodeframe.c @@ -10,12 +10,12 @@ */ #include -#include // qsort() +#include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" -#include "./av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" #include "aom/aom_codec.h" #include "aom_dsp/aom_dsp_common.h" @@ -23,19 +23,19 @@ #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" #include "aom_scale/aom_scale.h" #include "aom_util/aom_thread.h" -#if CONFIG_BITSTREAM_DEBUG +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif +#include "av1/common/cfl.h" #if CONFIG_INSPECTION #include "av1/decoder/inspection.h" #endif @@ -49,78 +49,69 @@ #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" -#if CONFIG_FRAME_SUPERRES #include "av1/common/resize.h" -#endif // CONFIG_FRAME_SUPERRES #include "av1/common/seg_common.h" #include "av1/common/thread_common.h" #include "av1/common/tile_common.h" - +#include "av1/common/warped_motion.h" +#include "av1/common/obmc.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" #include "av1/decoder/decoder.h" -#if CONFIG_LV_MAP #include "av1/decoder/decodetxb.h" -#endif #include "av1/decoder/detokenize.h" 
-#include "av1/decoder/dsubexp.h" -#include "av1/decoder/symbolrate.h" - -#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION -#include "av1/common/warped_motion.h" -#endif // CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION -#define MAX_AV1_HEADER_SIZE 80 #define ACCT_STR __func__ -#if CONFIG_PVQ -#include "av1/common/partition.h" -#include "av1/common/pvq.h" -#include "av1/common/scan.h" -#include "av1/decoder/decint.h" -#include "av1/decoder/pvq_decoder.h" -#include "av1/encoder/encodemb.h" -#include "av1/encoder/hybrid_fwd_txfm.h" -#endif +// This is needed by ext_tile related unit tests. +#define EXT_TILE_DEBUG 1 +#define MC_TEMP_BUF_PELS \ + (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \ + ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2)) -#if CONFIG_CFL -#include "av1/common/cfl.h" -#endif +// Checks that the remaining bits start with a 1 and ends with 0s. +// It consumes an additional byte, if already byte aligned before the check. +int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + // bit_offset is set to 0 (mod 8) when the reader is already byte aligned + int bits_before_alignment = 8 - rb->bit_offset % 8; + int trailing = aom_rb_read_literal(rb, bits_before_alignment); + if (trailing != (1 << (bits_before_alignment - 1))) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + return 0; +} -#if CONFIG_STRIPED_LOOP_RESTORATION && !CONFIG_LOOP_RESTORATION -#error "striped_loop_restoration requires loop_restoration" -#endif +// Use only_chroma = 1 to only set the chroma planes +static void set_planes_to_neutral_grey(AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *const buf, + int only_chroma) { + const int val = 1 << (cm->bit_depth - 1); + + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { + if (cm->use_highbitdepth) { + // TODO(yaowu): replace this with aom_memset16() 
for speed + for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) { + uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); + base[row_idx * buf->strides[is_uv] + col_idx] = val; + } + } else { + memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7, + buf->crop_widths[is_uv]); + } + } + } +} -#if CONFIG_LOOP_RESTORATION static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, - int rtile_idx); -#endif - -static struct aom_read_bit_buffer *init_read_bit_buffer( - AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, - const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]); -static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data, - size_t partition_size); -static size_t read_uncompressed_header(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb); - -static int is_compound_reference_allowed(const AV1_COMMON *cm) { -#if CONFIG_ONE_SIDED_COMPOUND // Normative in decoder - return !frame_is_intra_only(cm); -#else - int i; - if (frame_is_intra_only(cm)) return 0; - for (i = 1; i < INTER_REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; - - return 0; -#endif // CONFIG_ONE_SIDED_COMPOUND -} + int runit_idx); static void setup_compound_reference_mode(AV1_COMMON *cm) { -#if CONFIG_EXT_REFS cm->comp_fwd_ref[0] = LAST_FRAME; cm->comp_fwd_ref[1] = LAST2_FRAME; cm->comp_fwd_ref[2] = LAST3_FRAME; @@ -129,1952 +120,1099 @@ static void setup_compound_reference_mode(AV1_COMMON *cm) { cm->comp_bwd_ref[0] = BWDREF_FRAME; cm->comp_bwd_ref[1] = ALTREF2_FRAME; cm->comp_bwd_ref[2] = ALTREF_FRAME; -#else // !CONFIG_EXT_REFS - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { 
- cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } -#endif // CONFIG_EXT_REFS } static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); } -static int decode_unsigned_max(struct aom_read_bit_buffer *rb, int max) { - const int data = aom_rb_read_literal(rb, get_unsigned_bits(max)); - return data > max ? max : data; -} - static TX_MODE read_tx_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { -#if CONFIG_TX64X64 - TX_MODE tx_mode; -#endif - if (cm->all_lossless) return ONLY_4X4; -#if CONFIG_VAR_TX_NO_TX_MODE - (void)rb; - return TX_MODE_SELECT; -#else -#if CONFIG_TX64X64 - tx_mode = aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2); - if (tx_mode == ALLOW_32X32) tx_mode += aom_rb_read_bit(rb); - return tx_mode; -#else - return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2); -#endif // CONFIG_TX64X64 -#endif // CONFIG_VAR_TX_NO_TX_MODE + if (cm->coded_lossless) return ONLY_4X4; + return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST; } -#if !CONFIG_RESTRICT_COMPRESSED_HDR -static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) { - int i; - for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR); - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->zeromv_prob[i], ACCT_STR); - for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->refmv_prob[i], ACCT_STR); - for (i = 0; i < DRL_MODE_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->drl_prob[i], ACCT_STR); -} -#endif - static REFERENCE_MODE read_frame_reference_mode( const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - if (is_compound_reference_allowed(cm)) { -#if CONFIG_REF_ADAPT - return aom_rb_read_bit(rb) ? 
REFERENCE_MODE_SELECT : SINGLE_REFERENCE; -#else - return aom_rb_read_bit(rb) - ? REFERENCE_MODE_SELECT - : (aom_rb_read_bit(rb) ? COMPOUND_REFERENCE : SINGLE_REFERENCE); -#endif // CONFIG_REF_ADAPT - } else { + if (frame_is_intra_only(cm)) { return SINGLE_REFERENCE; + } else { + return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE; } } -#if !CONFIG_RESTRICT_COMPRESSED_HDR -static void read_frame_reference_mode_probs(AV1_COMMON *cm, aom_reader *r) { - FRAME_CONTEXT *const fc = cm->fc; - int i; - - if (cm->reference_mode == REFERENCE_MODE_SELECT) - for (i = 0; i < COMP_INTER_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->comp_inter_prob[i], ACCT_STR); - - if (cm->reference_mode != COMPOUND_REFERENCE) { - for (i = 0; i < REF_CONTEXTS; ++i) { - int j; - for (j = 0; j < (SINGLE_REFS - 1); ++j) { - av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR); - } - } - } - - if (cm->reference_mode != SINGLE_REFERENCE) { -#if CONFIG_EXT_COMP_REFS - for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) - av1_diff_update_prob(r, &fc->comp_ref_type_prob[i], ACCT_STR); - - for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { - int j; - for (j = 0; j < (UNIDIR_COMP_REFS - 1); ++j) - av1_diff_update_prob(r, &fc->uni_comp_ref_prob[i][j], ACCT_STR); - } -#endif // CONFIG_EXT_COMP_REFS - - for (i = 0; i < REF_CONTEXTS; ++i) { - int j; -#if CONFIG_EXT_REFS - for (j = 0; j < (FWD_REFS - 1); ++j) - av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR); - for (j = 0; j < (BWD_REFS - 1); ++j) - av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR); -#else - for (j = 0; j < (COMP_REFS - 1); ++j) - av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR); -#endif // CONFIG_EXT_REFS - } - } -} - -static void update_mv_probs(aom_prob *p, int n, aom_reader *r) { - int i; - for (i = 0; i < n; ++i) av1_diff_update_prob(r, &p[i], ACCT_STR); -} - -static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) { - int i; - if (allow_hp) { - for (i = 0; i < 2; ++i) { 
- nmv_component *const comp_ctx = &ctx->comps[i]; - update_mv_probs(&comp_ctx->class0_hp, 1, r); - update_mv_probs(&comp_ctx->hp, 1, r); - } - } -} -#endif - static void inverse_transform_block(MACROBLOCKD *xd, int plane, -#if CONFIG_LGT_FROM_PRED - PREDICTION_MODE mode, -#endif const TX_TYPE tx_type, const TX_SIZE tx_size, uint8_t *dst, - int stride, int16_t scan_line, int eob) { + int stride, int reduced_tx_set) { struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *const dqcoeff = pd->dqcoeff; - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - xd->mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, stride, eob); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t scan_line = eob_data->max_scan_line; + uint16_t eob = eob_data->eob; + + memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane], + (scan_line + 1) * sizeof(dqcoeff[0])); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride, + eob, reduced_tx_set); memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); } -static int get_block_idx(const MACROBLOCKD *xd, int plane, int row, int col) { - const int bsize = xd->mi[0]->mbmi.sb_type; - const struct macroblockd_plane *pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const uint8_t txh_unit = tx_size_high_unit[tx_size]; - return row * max_blocks_wide + col * txh_unit; -} - -#if CONFIG_PVQ -static int av1_pvq_decode_helper(MACROBLOCKD *xd, tran_low_t *ref_coeff, - tran_low_t *dqcoeff, int16_t 
*quant, int pli, - int bs, TX_TYPE tx_type, int xdec, - PVQ_SKIP_TYPE ac_dc_coded) { - unsigned int flags; // used for daala's stream analyzer. - int off; - const int is_keyframe = 0; - const int has_dc_skip = 1; - int coeff_shift = 3 - av1_get_tx_scale(bs); - int hbd_downshift = 0; - int rounding_mask; - // DC quantizer for PVQ - int pvq_dc_quant; - int lossless = (quant[0] == 0); - const int blk_size = tx_size_wide[bs]; - int eob = 0; - int i; - od_dec_ctx *dec = &xd->daala_dec; - int use_activity_masking = dec->use_activity_masking; - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - - od_coeff ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]; - od_coeff out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]; - - hbd_downshift = xd->bd - 8; - - od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff, - blk_size); - - assert(OD_COEFF_SHIFT >= 4); - if (lossless) - pvq_dc_quant = 1; - else { - if (use_activity_masking) - pvq_dc_quant = - OD_MAXI(1, - (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) * - dec->state.pvq_qm_q4[pli][od_qm_get_index(bs, 0)] >> - 4); - else - pvq_dc_quant = - OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift); - } - - off = od_qm_offset(bs, xdec); - - // copy int16 inputs to int32 - for (i = 0; i < blk_size * blk_size; i++) { - ref_int32[i] = - AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> - hbd_downshift; - } - - od_pvq_decode(dec, ref_int32, out_int32, - OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >> hbd_downshift), - pli, bs, OD_PVQ_BETA[use_activity_masking][pli][bs], - is_keyframe, &flags, ac_dc_coded, dec->state.qm + off, - dec->state.qm_inv + off); - - if (!has_dc_skip || out_int32[0]) { - out_int32[0] = - has_dc_skip + generic_decode(dec->r, &dec->state.adapt->model_dc[pli], - &dec->state.adapt->ex_dc[pli][bs][0], 2, - "dc:mag"); - if (out_int32[0]) out_int32[0] *= aom_read_bit(dec->r, "dc:sign") 
? -1 : 1; - } - out_int32[0] = out_int32[0] * pvq_dc_quant + ref_int32[0]; - - // copy int32 result back to int16 - assert(OD_COEFF_SHIFT > coeff_shift); - rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1; - for (i = 0; i < blk_size * blk_size; i++) { - out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift); - dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >> - (OD_COEFF_SHIFT - coeff_shift); - } - - od_coding_order_to_raster(dqcoeff, blk_size, tx_type, dqcoeff_pvq, blk_size); - - eob = blk_size * blk_size; - - return eob; -} - -static PVQ_SKIP_TYPE read_pvq_skip(AV1_COMMON *cm, MACROBLOCKD *const xd, - int plane, TX_SIZE tx_size) { - // decode ac/dc coded flag. bit0: DC coded, bit1 : AC coded - // NOTE : we don't use 5 symbols for luma here in aom codebase, - // since block partition is taken care of by aom. - // So, only AC/DC skip info is coded - const int ac_dc_coded = aom_read_symbol( - xd->daala_dec.r, - xd->daala_dec.state.adapt->skip_cdf[2 * tx_size + (plane != 0)], 4, - "skip"); - if (ac_dc_coded < 0 || ac_dc_coded > 3) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, - "Invalid PVQ Skip Type"); - } - return ac_dc_coded; -} - -static int av1_pvq_decode_helper2(AV1_COMMON *cm, MACROBLOCKD *const xd, - MB_MODE_INFO *const mbmi, int plane, int row, - int col, TX_SIZE tx_size, TX_TYPE tx_type) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - // transform block size in pixels - int tx_blk_size = tx_size_wide[tx_size]; - int i, j; - tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff; - const int diff_stride = tx_blk_size; - int16_t *pred = pd->pred; - tran_low_t *const dqcoeff = pd->dqcoeff; - uint8_t *dst; - int eob; - const PVQ_SKIP_TYPE ac_dc_coded = read_pvq_skip(cm, xd, plane, tx_size); - - eob = 0; - dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; - - if (ac_dc_coded) { - int xdec = pd->subsampling_x; - int seg_id = mbmi->segment_id; - int16_t *quant; - TxfmParam txfm_param; - // 
ToDo(yaowu): correct this with optimal number from decoding process. - const int max_scan_line = tx_size_2d[tx_size]; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - pred[diff_stride * j + i] = - CONVERT_TO_SHORTPTR(dst)[pd->dst.stride * j + i]; - } else { +static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_reader *const r, int plane, int row, + int col, TX_SIZE tx_size) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (!mbmi->skip) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); #endif - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - pred[diff_stride * j + i] = dst[pd->dst.stride * j + i]; -#if CONFIG_HIGHBITDEPTH - } + av1_read_coeffs_txb_facade(cm, xd, r, row, col, plane, tx_size); +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; #endif - - txfm_param.tx_type = tx_type; - txfm_param.tx_size = tx_size; - txfm_param.lossless = xd->lossless[seg_id]; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - txfm_param.bd = xd->bd; - av1_highbd_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &txfm_param); - } else { -#endif // CONFIG_HIGHBITDEPTH - av1_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &txfm_param); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - quant = &pd->seg_dequant[seg_id][0]; // aom's quantizer - - eob = av1_pvq_decode_helper(xd, pvq_ref_coeff, dqcoeff, quant, plane, - tx_size, tx_type, xdec, ac_dc_coded); - - inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, - max_scan_line, eob); } - - return eob; } -#endif -static void predict_and_reconstruct_intra_block( - AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *const r, - MB_MODE_INFO *const mbmi, int plane, int row, int col, TX_SIZE 
tx_size) { - PLANE_TYPE plane_type = get_plane_type(plane); - const int block_idx = get_block_idx(xd, plane, row, col); -#if CONFIG_PVQ +static void predict_and_reconstruct_intra_block(AV1_COMMON *cm, + MACROBLOCKD *const xd, + aom_reader *const r, int plane, + int row, int col, + TX_SIZE tx_size) { (void)r; -#endif - av1_predict_intra_block_facade(cm, xd, plane, block_idx, col, row, tx_size); + MB_MODE_INFO *mbmi = xd->mi[0]; + PLANE_TYPE plane_type = get_plane_type(plane); + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); if (!mbmi->skip) { -#if !CONFIG_PVQ struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_LV_MAP - int16_t max_scan_line = 0; - int eob; - av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane, - pd->dqcoeff, tx_size, &max_scan_line, &eob); + // tx_type will be read out in av1_read_coeffs_txb_facade - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); -#else // CONFIG_LV_MAP - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); - int16_t max_scan_line = 0; - const int eob = - av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size, - tx_type, &max_scan_line, r, mbmi->segment_id); -#endif // CONFIG_LV_MAP - if (eob) { + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size, + cm->reduced_tx_set_used); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + if (eob_data->eob) { uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]]; - inverse_transform_block(xd, plane, -#if CONFIG_LGT_FROM_PRED - mbmi->mode, -#endif - tx_type, tx_size, dst, pd->dst.stride, - max_scan_line, eob); + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + cm->reduced_tx_set_used); } -#else // !CONFIG_PVQ - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); 
- av1_pvq_decode_helper2(cm, xd, mbmi, plane, row, col, tx_size, tx_type); -#endif // !CONFIG_PVQ - } -#if CONFIG_CFL - if (plane == AOM_PLANE_Y && xd->cfl->store_y) { + } + if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) { cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type); } -#endif // CONFIG_CFL } -#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE +static void inverse_transform_inter_block(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, + const int blk_row, const int blk_col, + const int plane, + const TX_SIZE tx_size) { + (void)r; + PLANE_TYPE plane_type = get_plane_type(plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; + + // tx_type will be read out in av1_read_coeffs_txb_facade + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + + if (plane == 0) + update_txk_array(mbmi->txk_type, mbmi->sb_type, blk_row, blk_col, tx_size, + tx_type); + + uint8_t *dst = + &pd->dst + .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + cm->reduced_tx_set_used); +} + +static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size, + int plane) { + xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size]; + xd->txb_offset[plane] = + xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); +} + static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *r, MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, TX_SIZE tx_size, int *eob_total) { const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); const TX_SIZE plane_tx_size = - plane ? 
uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; // Scale to match transform block unit. const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - if (tx_size == plane_tx_size) { - PLANE_TYPE plane_type = get_plane_type(plane); -#if CONFIG_LV_MAP - int16_t max_scan_line = 0; - int eob; - av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, block, plane, - pd->dqcoeff, tx_size, &max_scan_line, &eob); - // tx_type will be read out in av1_read_coeffs_txb_facade - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, plane_tx_size); -#else // CONFIG_LV_MAP - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, plane_tx_size); - const SCAN_ORDER *sc = get_scan(cm, plane_tx_size, tx_type, mbmi); - int16_t max_scan_line = 0; - const int eob = av1_decode_block_tokens( - cm, xd, plane, sc, blk_col, blk_row, plane_tx_size, tx_type, - &max_scan_line, r, mbmi->segment_id); -#endif // CONFIG_LV_MAP - inverse_transform_block(xd, plane, -#if CONFIG_LGT_FROM_PRED - mbmi->mode, -#endif - tx_type, plane_tx_size, - &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) - << tx_size_wide_log2[0]], - pd->dst.stride, max_scan_line, eob); - *eob_total += eob; + if (tx_size == plane_tx_size || plane) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, plane, tx_size); +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif + 
inverse_transform_inter_block(cm, xd, r, blk_row, blk_col, plane, tx_size); + +#if CONFIG_MISMATCH_DEBUG + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, + pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); +#endif + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + *eob_total += eob_data->eob; + set_cb_buffer_offsets(xd, tx_size, plane); } else { -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; - if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0); -#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); -#endif - const int bsl = tx_size_wide_unit[sub_txs]; - int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + ((i >> 1) * bsl); - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + (i & 0x01) * bsl; -#else - const int offsetr = blk_row + (i >> 1) * bsl; - const int offsetc = blk_col + (i & 0x01) * bsl; -#endif + assert(bsw > 0 && bsh > 0); - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; - decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr, - offsetc, block, sub_txs, eob_total); - block += sub_step; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr, + offsetc, block, sub_txs, eob_total); + block += sub_step; + } } } } -#endif // CONFIG_VAR_TX - -#if !CONFIG_VAR_TX || CONFIG_SUPERTX || CONFIG_COEF_INTERLEAVE || \ - (!CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX) -static int reconstruct_inter_block(AV1_COMMON *cm, MACROBLOCKD *const xd, - aom_reader *const r, int segment_id, - int plane, int row, int col, - TX_SIZE tx_size) { - PLANE_TYPE plane_type = get_plane_type(plane); - int block_idx = get_block_idx(xd, plane, row, col); -#if CONFIG_PVQ - int eob; - (void)r; - (void)segment_id; -#else - struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif - -#if !CONFIG_PVQ -#if CONFIG_LV_MAP - (void)segment_id; - int16_t max_scan_line = 0; - int eob; - av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane, pd->dqcoeff, - tx_size, &max_scan_line, &eob); - // tx_type will be read out in av1_read_coeffs_txb_facade - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); -#else // CONFIG_LV_MAP - int16_t max_scan_line = 0; - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int eob 
= - av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size, - tx_type, &max_scan_line, r, segment_id); -#endif // CONFIG_LV_MAP - uint8_t *dst = - &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]]; - if (eob) - inverse_transform_block(xd, plane, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif - tx_type, tx_size, dst, pd->dst.stride, - max_scan_line, eob); -#else - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size); - eob = av1_pvq_decode_helper2(cm, xd, &xd->mi[0]->mbmi, plane, row, col, - tx_size, tx_type); -#endif - return eob; -} -#endif // !CONFIG_VAR_TX || CONFIG_SUPER_TX static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis) { + const int num_planes = av1_num_planes(cm); + const int offset = mi_row * cm->mi_stride + mi_col; - int x, y; const TileInfo *const tile = &xd->tile; xd->mi = cm->mi_grid_visible + offset; xd->mi[0] = &cm->mi[offset]; // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of // passing bsize from decode_partition(). 
- xd->mi[0]->mbmi.sb_type = bsize; + xd->mi[0]->sb_type = bsize; #if CONFIG_RD_DEBUG - xd->mi[0]->mbmi.mi_row = mi_row; - xd->mi[0]->mbmi.mi_col = mi_col; -#endif -#if CONFIG_CFL - xd->cfl->mi_row = mi_row; - xd->cfl->mi_col = mi_col; + xd->mi[0]->mi_row = mi_row; + xd->mi[0]->mi_col = mi_col; #endif - for (y = 0; y < y_mis; ++y) - for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0]; + xd->cfl.mi_row = mi_row; + xd->cfl.mi_col = mi_col; - set_plane_n4(xd, bw, bh); - set_skip_context(xd, mi_row, mi_col); + assert(x_mis && y_mis); + for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0]; + int idx = cm->mi_stride; + for (int y = 1; y < y_mis; ++y) { + memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0])); + idx += cm->mi_stride; + } -#if CONFIG_VAR_TX - xd->max_tx_size = max_txsize_lookup[bsize]; -#endif + set_plane_n4(xd, bw, bh, num_planes); + set_skip_context(xd, mi_row, mi_col, num_planes); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); + mi_col, 0, num_planes); } -#if CONFIG_SUPERTX -static MB_MODE_INFO *set_offsets_extend(AV1_COMMON *const cm, - MACROBLOCKD *const xd, - const TileInfo *const tile, - BLOCK_SIZE bsize_pred, int mi_row_pred, - int mi_col_pred, int mi_row_ori, - int mi_col_ori) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - const int bw = mi_size_wide[bsize_pred]; - const int bh = mi_size_high[bsize_pred]; - const int offset = mi_row_ori * cm->mi_stride + mi_col_ori; - xd->mi = cm->mi_grid_visible + offset; 
- xd->mi[0] = cm->mi + offset; - set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - - xd->up_available = (mi_row_ori > tile->mi_row_start); - xd->left_available = (mi_col_ori > tile->mi_col_start); +static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); - set_plane_n4(xd, bw, bh); +#if CONFIG_ACCOUNTING + aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); +#endif + set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); + xd->mi[0]->partition = partition; + av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } - return &xd->mi[0]->mbmi; + int reader_corrupted_flag = aom_reader_has_error(r); + aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag); } -#if CONFIG_SUPERTX -static MB_MODE_INFO *set_mb_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int bw, int bh, int x_mis, int y_mis) { - const int offset = mi_row * cm->mi_stride + mi_col; - const TileInfo *const tile = &xd->tile; - int x, y; +typedef struct PadBlock { + int x0; + int x1; + int y0; + int y1; +} PadBlock; - xd->mi = cm->mi_grid_visible + offset; - xd->mi[0] = cm->mi + offset; - xd->mi[0]->mbmi.sb_type = bsize; - for (y = 0; y < y_mis; ++y) - for (x = !y; x < x_mis; 
++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0]; - - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - return &xd->mi[0]->mbmi; -} -#endif +static void highbd_build_mc_border(const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, int x, int y, + int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *ref_row = src - x - y * src_stride; -static void set_offsets_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd, - const TileInfo *const tile, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - const int offset = mi_row * cm->mi_stride + mi_col; + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; - xd->mi = cm->mi_grid_visible + offset; - xd->mi[0] = cm->mi + offset; + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; - set_plane_n4(xd, bw, bh); + if (left > b_w) left = b_w; - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + if (x + b_w > w) right = x + b_w - w; - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); -} + if (right > b_w) right = b_w; -static void set_param_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int txfm, int skip) { - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); - const int offset = mi_row * cm->mi_stride + mi_col; - int x, y; + copy = b_w - left - right; - xd->mi = cm->mi_grid_visible + offset; - xd->mi[0] = cm->mi + offset; + if (left) aom_memset16(dst, ref_row[0], left); - for (y = 0; y < y_mis; ++y) - for (x = 0; x < x_mis; ++x) { - xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip; - xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm; - } -#if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); - set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bw, bh, skip, xd); -#endif -} + if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); -static void set_ref(AV1_COMMON *const cm, MACROBLOCKD *const xd, int idx, - int mi_row, int mi_col) { - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if CONFIG_COMPOUND_SINGLEREF - RefBuffer *ref_buffer = - has_second_ref(mbmi) ? 
&cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME] - : &cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME]; -#else - RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME]; -#endif // CONFIG_COMPOUND_SINGLEREF - xd->block_refs[idx] = ref_buffer; - if (!av1_is_valid_scale(&ref_buffer->sf)) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Invalid scale factors"); - av1_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col, - &ref_buffer->sf); - aom_merge_corrupted_flag(&xd->corrupted, ref_buffer->buf->corrupted); + if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); } -static void dec_predict_b_extend( - AV1Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile, - int block, int mi_row_ori, int mi_col_ori, int mi_row_pred, int mi_col_pred, - int mi_row_top, int mi_col_top, int plane, uint8_t *dst_buf, int dst_stride, - BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, int b_sub8x8, int bextend) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - // (mi_row_top, mi_col_top, bsize_top): region of the top partition size - // block: sub location of sub8x8 blocks - // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8 - // bextend: 1: region to predict is an extension of ori; 0: not - int r = (mi_row_pred - mi_row_top) * MI_SIZE; - int c = (mi_col_pred - mi_col_top) * MI_SIZE; - const int mi_width_top = mi_size_wide[bsize_top]; - const int mi_height_top = mi_size_high[bsize_top]; - MB_MODE_INFO *mbmi; - AV1_COMMON *const cm = &pbi->common; +static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int x, int y, int b_w, int b_h, + int w, int h) { + // Get a pointer to the start of the real data for this row. 
+ const uint8_t *ref_row = src - x - y * src_stride; - if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top || - mi_row_pred >= mi_row_top + mi_height_top || - mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows || - mi_col_pred >= cm->mi_cols) - return; + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; - mbmi = set_offsets_extend(cm, xd, tile, bsize_pred, mi_row_pred, mi_col_pred, - mi_row_ori, mi_col_ori); - set_ref(cm, xd, 0, mi_row_pred, mi_col_pred); - if (has_second_ref(&xd->mi[0]->mbmi) -#if CONFIG_COMPOUND_SINGLEREF - || is_inter_singleref_comp_mode(xd->mi[0]->mbmi.mode) -#endif // CONFIG_COMPOUND_SINGLEREF - ) - set_ref(cm, xd, 1, mi_row_pred, mi_col_pred); - if (!bextend) mbmi->tx_size = max_txsize_lookup[bsize_top]; - - xd->plane[plane].dst.stride = dst_stride; - xd->plane[plane].dst.buf = - dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride + - (c >> xd->plane[plane].subsampling_x); - - if (!b_sub8x8) - av1_build_inter_predictor_sb_extend(&pbi->common, xd, mi_row_ori, - mi_col_ori, mi_row_pred, mi_col_pred, - plane, bsize_pred); - else - av1_build_inter_predictor_sb_sub8x8_extend( - &pbi->common, xd, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred, - plane, bsize_pred, block); + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) memset(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy); + + if (right) memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); } -static void dec_extend_dir(AV1Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int block, - BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, - int mi_row_ori, int mi_col_ori, int mi_row, - int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride, - int dir) { - // dir: 0-lower, 1-upper, 2-left, 3-right - // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int xss = xd->plane[1].subsampling_x; - int yss = xd->plane[1].subsampling_y; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0; - BLOCK_SIZE extend_bsize; - int mi_row_pred, mi_col_pred; - - int wide_unit, high_unit; - int i, j; - int ext_offset = 0; - - if (dir == 0 || dir == 1) { - extend_bsize = - (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss) - ? BLOCK_8X8 - : BLOCK_16X8; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; +static INLINE int update_extend_mc_border_params( + const struct scale_factors *const sf, struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int *x_pad, int *y_pad) { + const int is_scaled = av1_is_scaled(sf); + // Get reference width and height. 
+ int frame_width = pre_buf->width; + int frame_height = pre_buf->height; + + // Do border extension if there is motion or + // width/height is not a multiple of 8 pixels. + if ((!is_intrabc) && (!do_warp) && + (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || + (frame_height & 0x7))) { + if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + block->x0 -= AOM_INTERP_EXTEND - 1; + block->x1 += AOM_INTERP_EXTEND; + *x_pad = 1; } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset)); - mi_col_pred = mi_col; - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, b_sub8x8, 1); - } else if (dir == 2 || dir == 3) { - extend_bsize = - (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss) - ? BLOCK_8X8 - : BLOCK_8X16; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; + if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + block->y0 -= AOM_INTERP_EXTEND - 1; + block->y1 += AOM_INTERP_EXTEND; + *y_pad = 1; } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; + // Skip border extension if block is inside the frame. + if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 || + block->y1 > frame_height - 1) { + return 1; + } + } + return 0; +} - mi_row_pred = mi_row; - mi_col_pred = mi_col + ((dir == 3) ? 
mi_width : -(mi_width + ext_offset)); +static INLINE void extend_mc_border(const struct scale_factors *const sf, + struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock block, + int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int highbd, + uint8_t *mc_buf, uint8_t **pre, + int *src_stride) { + int x_pad = 0, y_pad = 0; + if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block, + subpel_x_mv, subpel_y_mv, do_warp, + is_intrabc, &x_pad, &y_pad)) { + // Get reference block pointer. + const uint8_t *const buf_ptr = + pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + int buf_stride = pre_buf->stride; + const int b_w = block.x1 - block.x0; + const int b_h = block.y1 - block.y0; + + // Extend the border. + if (highbd) { + highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, + block.y0, b_w, b_h, pre_buf->width, + pre_buf->height); + } else { + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); + } + *src_stride = b_w; + *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w + + x_pad * (AOM_INTERP_EXTEND - 1); + } +} - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, b_sub8x8, 1); +static INLINE void dec_calc_subpel_params( + MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, + int plane, const int pre_x, const int pre_y, int x, int y, + struct buf_2d *const pre_buf, SubpelParams *subpel_params, int bw, int bh, + PadBlock *block, int mi_x, int mi_y, MV32 *scaled_mv, int *subpel_x_mv, + int *subpel_y_mv) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = pd->subsampling_x; + int ssy = pd->subsampling_y; + int 
orig_pos_y = (pre_y + y) << SUBPEL_BITS; + orig_pos_y += mv.row * (1 << (1 - ssy)); + int orig_pos_x = (pre_x + x) << SUBPEL_BITS; + orig_pos_x += mv.col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + + // Get reference block top left coordinate. + block->x0 = pos_x >> SCALE_SUBPEL_BITS; + block->y0 = pos_y >> SCALE_SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = + ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1; + block->y1 = + ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1; + + MV temp_mv; + temp_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x, + pd->subsampling_y); + *scaled_mv = av1_scale_mv(&temp_mv, (mi_x + x), (mi_y + y), sf); + scaled_mv->row += SCALE_EXTRA_OFF; + scaled_mv->col += SCALE_EXTRA_OFF; + + *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK; } else { - extend_bsize = BLOCK_8X8; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; + // Get block position in current frame. 
+ int pos_x = (pre_x + x) << SUBPEL_BITS; + int pos_y = (pre_y + y) << SUBPEL_BITS; + + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + + // Get reference block top left coordinate. + pos_x += mv_q4.col; + pos_y += mv_q4.row; + block->x0 = pos_x >> SUBPEL_BITS; + block->y0 = pos_y >> SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1; + block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1; + + scaled_mv->row = mv_q4.row; + scaled_mv->col = mv_q4.col; + *subpel_x_mv = scaled_mv->col & SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SUBPEL_MASK; + } +} + +static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, + MACROBLOCKD *xd, int plane, + const MB_MODE_INFO *mi, + int build_for_obmc, int bw, + int bh, int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + int is_compound = has_second_ref(mi); + int ref; + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + int is_global[2] = { 0, 0 }; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || + (block_size_high[bsize] < 8 && ss_y); + + if (is_intrabc) sub8x8_inter = 0; + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. 
Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + sub8x8_inter = sub8x8_inter && !build_for_obmc; + if (sub8x8_inter) { + for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; + if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; + } } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; + } + + if (sub8x8_inter) { + // block size + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize] >> ss_x; + const int b8_h = block_size_high[plane_bsize] >> ss_y; + assert(!is_compound); + + const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; + + int row = row_start; + int src_stride; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + is_compound = has_second_ref(this_mbmi); + DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]); + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + ConvolveParams conv_params = get_conv_params_no_round( + 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd); + conv_params.use_jnt_comp_avg = 0; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + + ref = 0; + const RefBuffer 
*ref_buf = + &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; + + pd->pre[ref].buf0 = + (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; + pd->pre[ref].buf = + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf->uv_stride, + &ref_buf->sf); + pd->pre[ref].width = ref_buf->buf->uv_crop_width; + pd->pre[ref].height = ref_buf->buf->uv_crop_height; + pd->pre[ref].stride = ref_buf->buf->uv_stride; + + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &ref_buf->sf; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + + const MV mv = this_mbmi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + PadBlock block; + MV32 scaled_mv; + int subpel_x_mv, subpel_y_mv; + int highbd; + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; + + dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, + &subpel_params, bw, bh, &block, mi_x, mi_y, + &scaled_mv, &subpel_x_mv, &subpel_y_mv); + pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + src_stride = pre_buf->stride; + highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, + subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref], + &pre, &src_stride); + conv_params.ref = ref; + conv_params.do_average = ref; + if (is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + } + + av1_make_inter_predictor( + pre, src_stride, dst, dst_buf->stride, &subpel_params, sf, b4_w, + b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, + (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, + plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); - mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? 
mi_height - : -(mi_height + ext_offset)); - mi_col_pred = - mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset)); + ++col; + } + ++row; + } - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, b_sub8x8, 1); + for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; + return; } -} -static void dec_extend_all(AV1Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int block, - BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, - int mi_row_ori, int mi_col_ori, int mi_row, - int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride) { - for (int i = 0; i < 8; ++i) { - dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row_ori, - mi_col_ori, mi_row, mi_col, mi_row_top, mi_col_top, plane, - dst_buf, dst_stride, i); + { + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + uint8_t *pre[2]; + SubpelParams subpel_params[2]; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); + int src_stride[2]; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + PadBlock block; + MV32 scaled_mv; + int subpel_x_mv, subpel_y_mv; + int highbd; + + dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, + &subpel_params[ref], bw, bh, &block, mi_x, mi_y, + &scaled_mv, &subpel_x_mv, &subpel_y_mv); + pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + src_stride[ref] = pre_buf->stride; + highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + int do_warp = (bw >= 8 && bh >= 8 && + av1_allow_warp(mi, &warp_types, + &xd->global_motion[mi->ref_frame[ref]], + build_for_obmc, subpel_params[ref].xs, + subpel_params[ref].ys, NULL)); + do_warp = (do_warp && xd->cur_frame_force_integer_mv == 0); + + extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv, + do_warp, is_intrabc, highbd, xd->mc_buf[ref], &pre[ref], + &src_stride[ref]); + } + + ConvolveParams conv_params = get_conv_params_no_round( + 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd); + av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, + &conv_params.bck_offset, + &conv_params.use_jnt_comp_avg, is_compound); + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? 
&cm->sf_identity : &xd->block_refs[ref]->sf; + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + conv_params.ref = ref; + conv_params.do_average = ref; + if (is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + } + + if (ref && is_masked_compound_type(mi->interinter_comp.type)) + av1_make_masked_inter_predictor( + pre[ref], src_stride[ref], dst, dst_buf->stride, + &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters, + plane, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, ref, xd, cm->allow_warped_motion); + else + av1_make_inter_predictor( + pre[ref], src_stride[ref], dst, dst_buf->stride, + &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters, + &warp_types, mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, + plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); + } } } -static void dec_predict_sb_complex(AV1Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int mi_row, - int mi_col, int mi_row_top, int mi_col_top, - BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, - uint8_t *dst_buf[3], int dst_stride[3]) { - const AV1_COMMON *const cm = &pbi->common; - const int hbs = mi_size_wide[bsize] / 2; - const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - int i; - const int mi_offset = mi_row * cm->mi_stride + mi_col; - uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif +static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int plane_from, + int plane_to) { + 
int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = pd->width; + const int bh = pd->height; - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); - dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); - dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); - dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); - } else { -#endif - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; - dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; - dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; - dst_buf3[0] = tmp_buf3; - dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; - dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; -#if CONFIG_HIGHBITDEPTH + dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); } -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; +} - 
xd->mi = cm->mi_grid_visible + mi_offset; - xd->mi[0] = cm->mi + mi_offset; +static void dec_build_inter_predictors_sby(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0); - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, + { xd->plane[0].dst.stride, 0, 0 } }; + if (!ctx) ctx = &default_ctx; + av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, ctx, bsize); } +} - switch (partition) { - case PARTITION_NONE: - assert(bsize < top_bsize); - for (i = 0; i < MAX_MB_PLANE; i++) { - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, bsize, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } - break; - case PARTITION_HORZ: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; i++) { - // For sub8x8, predict in 8x8 unit - // First half - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, 1, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - - // Second half - dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i]); - } - - // 
weighted average to smooth the boundary - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; i++) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0; - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + mode_offset_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, - i, dst_buf[i], dst_stride[i], top_bsize, bsize, - 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, - mi_row + mode_offset_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } else { -#endif - // First half - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, - mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], 0); - - if (mi_row + hbs < cm->mi_rows) { - // Second half - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i], top_bsize, - subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, - mi_row + hbs, mi_col, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, - mi_row + 
hbs, mi_col, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], 1); - - // weighted average to smooth the boundary - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; - case PARTITION_VERT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; i++) { - // First half - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, 1, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - - // Second half - dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i]); - } - - // Smooth - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; i++) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? 
hbs : 0; - assert(i > 0 && bsize == BLOCK_8X8); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, - mi_col + mode_offset_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, bsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, - mi_col + mode_offset_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } else { -#endif - // First half - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, - mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], 3); - - // Second half - if (mi_col + hbs < cm->mi_cols) { - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, - mi_row, mi_col + hbs, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i], top_bsize, - subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i], 2); - - // Smooth - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; i++) { - dec_predict_b_extend(pbi, xd, tile, 0, 
mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, 1, 0); - dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1); - dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i], top_bsize, BLOCK_8X8, 1, 1); - dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf3[i], - dst_stride3[i], top_bsize, BLOCK_8X8, 1, 1); - if (bsize < top_bsize) { - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i]); - dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf2[i], dst_stride2[i]); - dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf3[i], dst_stride3[i]); - } - } -#if CONFIG_CB4X4 - } else if (bsize == BLOCK_8X8) { - for (i = 0; i < MAX_MB_PLANE; i++) { - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = - CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0; - int mode_offset_col = - CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? 
hbs : 0; - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + mode_offset_row, - mi_col + mode_offset_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, BLOCK_8X8, top_bsize, - mi_row + mode_offset_row, mi_col + mode_offset_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - } else { - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, - mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, 0, 0); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, - mi_row, mi_col + hbs, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i], top_bsize, - subsize, 0, 0); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf2[i], dst_stride2[i], top_bsize, - subsize, 0, 0); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf3[i], dst_stride3[i], - top_bsize, subsize, 0, 0); - - if (bsize < top_bsize) { - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i]); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, - mi_row + hbs, mi_col, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i]); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - 
dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, - mi_row + hbs, mi_col + hbs, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, i, - dst_buf3[i], dst_stride3[i]); - } - } - } -#endif - } else { - dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row_top, - mi_col_top, subsize, top_bsize, dst_buf, - dst_stride); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, subsize, top_bsize, - dst_buf1, dst_stride1); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, subsize, top_bsize, - dst_buf2, dst_stride2); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs, - mi_row_top, mi_col_top, subsize, top_bsize, - dst_buf3, dst_stride3); - } - for (i = 0; i < MAX_MB_PLANE; i++) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - if (handle_chroma_sub8x8) continue; // Skip <4x4 chroma smoothing -#else - if (bsize == BLOCK_8X8 && i != 0) - continue; // Skip <4x4 chroma smoothing -#endif - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - if (mi_row + hbs < cm->mi_rows) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - } else if (mi_row + hbs < 
cm->mi_rows && mi_col < cm->mi_cols) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dst_buf1, dst_stride1); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, - 1); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - break; - case PARTITION_VERT_A: 
- - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dst_buf1, dst_stride1); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, 2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; - case PARTITION_HORZ_B: - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, 
dst_stride, 0); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dst_buf1, dst_stride1); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - break; - case PARTITION_VERT_B: - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, subsize, 0, 0); - if (bsize < top_bsize) - dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride); - else - dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, 3); - - dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dst_buf1, dst_stride1); - - 
dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0); - dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); +static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1, + MAX_MB_PLANE - 1); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { + { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, + { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride } + }; + if (!ctx) ctx = &default_ctx; + av1_build_interintra_predictors_sbuv( + cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, + xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize); } } -static void set_segment_id_supertx(const AV1_COMMON *const cm, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - const struct segmentation *seg = &cm->seg; - const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col); - const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row); - const int mi_offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO 
**const mip = cm->mi_grid_visible + mi_offset; - int r, c; - int seg_id_supertx = MAX_SEGMENTS; - - if (!seg->enabled) { - seg_id_supertx = 0; - } else { - // Find the minimum segment_id - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - seg_id_supertx = - AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx); - assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS); - } - - // Assign the the segment_id back to segment_id_supertx - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx; +static void dec_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BUFFER_SET *ctx, BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); + dec_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); + if (num_planes > 1) + dec_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); } -#endif // CONFIG_SUPERTX -static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif // CONFIG_SUPERTX - int mi_row, int mi_col, aom_reader *r, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif // CONFIG_EXT_PARTITION_TYPES - BLOCK_SIZE bsize) { - AV1_COMMON *const cm = &pbi->common; - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); +static INLINE void dec_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = ctxt->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; -#if CONFIG_ACCOUNTING - aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); -#endif -#if CONFIG_SUPERTX - if 
(supertx_enabled) { - set_mb_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); - } else { - set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); - } -#if CONFIG_EXT_PARTITION_TYPES - xd->mi[0]->mbmi.partition = partition; -#endif - av1_read_mode_info(pbi, xd, supertx_enabled, mi_row, mi_col, r, x_mis, y_mis); -#else - set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); -#if CONFIG_EXT_PARTITION_TYPES - xd->mi[0]->mbmi.partition = partition; -#endif - av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); -#endif // CONFIG_SUPERTX - if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { - const BLOCK_SIZE uv_subsize = - ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; - if (uv_subsize == BLOCK_INVALID) - aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, - "Invalid block size."); + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, + &backup_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = ctxt->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); } +} -#if CONFIG_SUPERTX - xd->mi[0]->mbmi.segment_id_supertx = MAX_SEGMENTS; -#endif // CONFIG_SUPERTX +static void dec_build_prediction_by_above_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // 
prediction block. This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + int this_height = xd->n8_h * MI_SIZE; + int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += (this_height - pred_height) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, mi_col, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + dec_build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; +} - int reader_corrupted_flag = aom_reader_has_error(r); - aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag); +static INLINE void dec_build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = ctxt->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height, + &backup_mbmi, ctxt, num_planes); + mi_x = ctxt->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); + } } -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void 
set_mode_info_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, - int mi_row, int mi_col) { - const int offset = mi_row * cm->mi_stride + mi_col; - xd->mi = cm->mi_grid_visible + offset; - xd->mi[0] = &cm->mi[offset]; +static void dec_build_prediction_by_left_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. + int this_width = xd->n8_w * MI_SIZE; + int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += (this_width - pred_width) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, mi_row, + max_neighbor_obmc[mi_size_high_log2[bsize]], + dec_build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_right_edge -= (this_width - pred_width) * 8; + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; } -static void get_ncobmc_recon(AV1_COMMON *const cm, MACROBLOCKD *xd, int mi_row, - int mi_col, int bsize, int mode) { - uint8_t *pred_buf[4][MAX_MB_PLANE]; - int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - // target block in pxl - int pxl_row = mi_row << MI_SIZE_LOG2; - int pxl_col = mi_col << MI_SIZE_LOG2; +static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col) { + const int num_planes = av1_num_planes(cm); + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); + uint8_t 
*dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int plane; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE, - len); + dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); } else { -#endif // CONFIG_HIGHBITDEPTH - ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE); -#if CONFIG_HIGHBITDEPTH - } -#endif - av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride); - av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride); - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, - pred_stride, 
mode); + dst_buf1[0] = tmp_buf1; + dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; + dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; + dst_buf2[0] = tmp_buf2; + dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; + dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; + } + dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, + dst_width1, dst_height1, dst_stride1); + dec_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, + dst_width2, dst_height2, dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, + dst_buf2, dst_stride2); +} + +static void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (store_cfl_required(cm, xd)) { + cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); } } -static void av1_get_ncobmc_recon(AV1_COMMON *const cm, MACROBLOCKD *const xd, - int bsize, const int mi_row, const int mi_col, - const NCOBMC_MODE modes) { - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; +static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + if (frame < LAST_FRAME) { + assert(is_intrabc_block(mbmi)); + assert(frame == INTRA_FRAME); + assert(ref == 0); + } else { + RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - assert(bsize >= BLOCK_8X8); + xd->block_refs[ref] = ref_buf; + av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf, + num_planes); + } + } - reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, - cm->mi_cols); - get_ncobmc_recon(cm, xd, mi_row, mi_col, bsize, modes); + dec_build_inter_predictors_sb(cm, 
xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) + dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); } -static void recon_ncobmc_intrpl_pred(AV1_COMMON *const cm, - MACROBLOCKD *const xd, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2); - const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize]; - if (mi_width > mi_height) { - // horizontal partition - av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]); - xd->mi += hbs; - av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col + hbs, - mbmi->ncobmc_mode[1]); - } else if (mi_height > mi_width) { - // vertical partition - av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]); - xd->mi += hbs * xd->mi_stride; - av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row + hbs, mi_col, - mbmi->ncobmc_mode[1]); - } else { - av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]); - } - set_mode_info_offsets(cm, xd, mi_row, mi_col); - // restore dst buffer and mode info - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); +static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane, + aom_reader *r) { + (void)r; + Av1ColorMapParam params; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, ¶ms.plane_width, + ¶ms.plane_height, NULL, NULL); + xd->color_index_map_offset[plane] += params.plane_width * params.plane_height; } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT static void decode_token_and_recon_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row, int mi_col, aom_reader *r, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); const int bw = mi_size_wide[bsize]; const int bh = 
mi_size_high[bsize]; const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 - CFL_CTX *const cfl = xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + CFL_CTX *const cfl = &xd->cfl; cfl->is_chroma_reference = is_chroma_reference( mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 if (cm->delta_q_present_flag) { - int i; - for (i = 0; i < MAX_SEGMENTS; i++) { -#if CONFIG_EXT_DELTA_Q + for (int i = 0; i < MAX_SEGMENTS; i++) { const int current_qindex = av1_get_qindex(&cm->seg, i, xd->current_qindex); -#else - const int current_qindex = xd->current_qindex; -#endif // CONFIG_EXT_DELTA_Q - int j; - for (j = 0; j < MAX_MB_PLANE; ++j) { - const int dc_delta_q = j == 0 ? cm->y_dc_delta_q : cm->uv_dc_delta_q; - const int ac_delta_q = j == 0 ? 0 : cm->uv_ac_delta_q; - - xd->plane[j].seg_dequant[i][0] = - av1_dc_quant(current_qindex, dc_delta_q, cm->bit_depth); - xd->plane[j].seg_dequant[i][1] = - av1_ac_quant(current_qindex, ac_delta_q, cm->bit_depth); + for (int j = 0; j < num_planes; ++j) { + const int dc_delta_q = + j == 0 ? cm->y_dc_delta_q + : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q); + const int ac_delta_q = + j == 0 ? 0 : (j == 1 ? 
cm->u_ac_delta_q : cm->v_ac_delta_q); + xd->plane[j].seg_dequant_QTX[i][0] = + av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth); + xd->plane[j].seg_dequant_QTX[i][1] = + av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth); } } } + if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); -#if CONFIG_CB4X4 - if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize); -#else - if (mbmi->skip) { - av1_reset_skip_context(xd, mi_row, mi_col, AOMMAX(BLOCK_8X8, bsize)); - } -#endif - -#if CONFIG_COEF_INTERLEAVE - { - const struct macroblockd_plane *const pd_y = &xd->plane[0]; - const struct macroblockd_plane *const pd_c = &xd->plane[1]; - const TX_SIZE tx_log2_y = mbmi->tx_size; - const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c); - const int tx_sz_y = (1 << tx_log2_y); - const int tx_sz_c = (1 << tx_log2_c); - const int num_4x4_w_y = pd_y->n4_w; - const int num_4x4_h_y = pd_y->n4_h; - const int num_4x4_w_c = pd_c->n4_w; - const int num_4x4_h_c = pd_c->n4_h; - const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, - pd_y->subsampling_x); - const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, - pd_y->subsampling_y); - const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, - pd_c->subsampling_x); - const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, - pd_c->subsampling_y); - - // The max_4x4_w/h may be smaller than tx_sz under some corner cases, - // i.e. when the SB is splitted by tile boundaries. 
- const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_c = tu_num_w_c * tu_num_h_c; - - if (!is_inter_block(mbmi)) { - int tu_idx_c = 0; - int row_y, col_y, row_c, col_c; - int plane; - -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - for (plane = 0; plane <= 1; ++plane) { - if (mbmi->palette_mode_info.palette_size[plane]) - av1_decode_palette_tokens(xd, plane, r); - } -#endif // !CONFIG_PVQ - - for (row_y = 0; row_y < tu_num_h_y; row_y++) { - for (col_y = 0; col_y < tu_num_w_y; col_y++) { - // luma - predict_and_reconstruct_intra_block( - cm, xd, r, mbmi, 0, row_y * tx_sz_y, col_y * tx_sz_y, tx_log2_y); - // chroma - if (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c, - col_c, tx_log2_c); - predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c, - col_c, tx_log2_c); - tu_idx_c++; - } - } - } - - // In 422 case, it's possilbe that Chroma has more TUs than Luma - while (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c, col_c, - tx_log2_c); - predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c, col_c, - tx_log2_c); - tu_idx_c++; - } - } else { - // Prediction - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, - AOMMAX(bsize, BLOCK_8X8)); - - // Reconstruction - if (!mbmi->skip) { - int eobtotal = 0; - int tu_idx_c = 0; - int row_y, col_y, row_c, col_c; - - for (row_y = 0; row_y < tu_num_h_y; row_y++) { - for (col_y = 0; col_y < tu_num_w_y; col_y++) { - // luma - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 
0, - row_y * tx_sz_y, - col_y * tx_sz_y, tx_log2_y); - // chroma - if (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, - 1, row_c, col_c, tx_log2_c); - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, - 2, row_c, col_c, tx_log2_c); - tu_idx_c++; + if (!is_inter_block(mbmi)) { + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = + block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = + block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + read_coeffs_tx_intra_block(cm, xd, r, plane, blk_row, blk_col, + tx_size); + 
predict_and_reconstruct_intra_block(cm, xd, r, plane, blk_row, + blk_col, tx_size); + set_cb_buffer_offsets(xd, tx_size, plane); } } } - - // In 422 case, it's possilbe that Chroma has more TUs than Luma - while (tu_idx_c < tu_num_c) { - row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c; - col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c; - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 1, - row_c, col_c, tx_log2_c); - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 2, - row_c, col_c, tx_log2_c); - tu_idx_c++; - } - - // TODO(CONFIG_COEF_INTERLEAVE owners): bring eob == 0 corner case - // into line with the defaut configuration - if (bsize >= BLOCK_8X8 && eobtotal == 0) mbmi->skip = 1; } } - } -#else // CONFIG_COEF_INTERLEAVE - if (!is_inter_block(mbmi)) { - int plane; - -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - for (plane = 0; plane <= 1; ++plane) { - if (mbmi->palette_mode_info.palette_size[plane]) - av1_decode_palette_tokens(xd, plane, r); - } -#endif // #if !CONFIG_PVQ - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const int stepr = tx_size_high_unit[tx_size]; - const int stepc = tx_size_wide_unit[tx_size]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif - int row, col; - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); -#if CONFIG_CB4X4 + } else { + predict_inter_block(cm, xd, mi_row, mi_col, bsize); +#if CONFIG_MISMATCH_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = 
&xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, pd->subsampling_y)) continue; + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset, + plane, pixel_c, pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } #endif - int blk_row, blk_col; - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + + // Reconstruction + if (!mbmi->skip) { + int eobtotal = 0; + + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + int row, col; + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == + get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); for (row = 0; row < max_blocks_high; row += mu_blocks_high) { - const int unit_height = AOMMIN(mu_blocks_high + row, max_blocks_high); for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { - const int unit_width = AOMMIN(mu_blocks_wide + col, max_blocks_wide); - - for (blk_row = row; blk_row < unit_height; blk_row += stepr) - for (blk_col = col; blk_col < unit_width; blk_col += stepc) - predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane, - blk_row, blk_col, tx_size); - } - } - } - } else { - int ref; - -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + is_inter_anyref_comp_mode(mbmi->mode); ++ref) -#else - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { - const MV_REFERENCE_FRAME frame = -#if CONFIG_COMPOUND_SINGLEREF - has_second_ref(mbmi) ? 
mbmi->ref_frame[ref] : mbmi->ref_frame[0]; -#else - mbmi->ref_frame[ref]; -#endif // CONFIG_COMPOUND_SINGLEREF - if (frame < LAST_FRAME) { -#if CONFIG_INTRABC - assert(is_intrabc_block(mbmi)); - assert(frame == INTRA_FRAME); - assert(ref == 0); -#else - assert(0); -#endif // CONFIG_INTRABC - } else { - RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, - &ref_buf->sf); - } - } - -#if CONFIG_CB4X4 - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); -#else - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, - AOMMAX(bsize, BLOCK_8X8)); -#endif - -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_NCOBMC - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#else - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#endif - } -#endif // CONFIG_MOTION_VAR -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) { - int plane; - recon_ncobmc_intrpl_pred(cm, xd, mi_row, mi_col, bsize); - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane); - } - } -#endif - // Reconstruction - if (!mbmi->skip) { - int eobtotal = 0; - int plane; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, 
plane); - int row, col; - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) - continue; -#endif - -#if CONFIG_VAR_TX - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); - int mu_blocks_wide = - block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; - int mu_blocks_high = - block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; - - mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); - mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); - - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); - const int bh_var_tx = tx_size_high_unit[max_tx_size]; - const int bw_var_tx = tx_size_wide_unit[max_tx_size]; - int block = 0; - int step = - tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - - for (row = 0; row < max_blocks_high; row += mu_blocks_high) { - for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsizec, pd->subsampling_x, pd->subsampling_y); + + const TX_SIZE max_tx_size = + get_vartx_max_txsize(xd, plane_bsize, plane); + const int bh_var_tx = tx_size_high_unit[max_tx_size]; + const int bw_var_tx = tx_size_wide_unit[max_tx_size]; + int block = 0; + int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; int blk_row, blk_col; - const int unit_height = - AOMMIN(mu_blocks_high + row, max_blocks_high); - const int unit_width = - AOMMIN(mu_blocks_wide + col, max_blocks_wide); - for (blk_row = row; blk_row < unit_height; blk_row += bh_var_tx) { - for (blk_col = col; blk_col < unit_width; blk_col += 
bw_var_tx) { + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), + pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), + pd->subsampling_x); + + for (blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += bh_var_tx) { + for (blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += bw_var_tx) { decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, blk_row, blk_col, block, max_tx_size, &eobtotal); @@ -2083,388 +1221,291 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi, } } } -#else - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const int stepr = tx_size_high_unit[tx_size]; - const int stepc = tx_size_wide_unit[tx_size]; - for (row = 0; row < max_blocks_high; row += stepr) - for (col = 0; col < max_blocks_wide; col += stepc) - eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, - plane, row, col, tx_size); -#endif } } + cfl_store_inter_block(cm, xd); } -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 - if (mbmi->uv_mode != UV_CFL_PRED) { -#if CONFIG_DEBUG - if (cfl->is_chroma_reference) { - cfl_clear_sub8x8_val(cfl); - } -#endif - if (!cfl->is_chroma_reference && is_inter_block(mbmi)) { - cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); - } - } -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 -#endif // CONFIG_COEF_INTERLEAVE + + av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize, + set_color_index_map_offset); int reader_corrupted_flag = aom_reader_has_error(r); aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag); } -#if NC_MODE_INFO && CONFIG_MOTION_VAR -static void detoken_and_recon_sb(AV1Decoder *const pbi, MACROBLOCKD *const xd, - int mi_row, int mi_col, aom_reader *r, - BLOCK_SIZE bsize) { - AV1_COMMON *const cm = &pbi->common; - const int hbs = mi_size_wide[bsize] >> 1; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif -#if 
CONFIG_EXT_PARTITION_TYPES - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - PARTITION_TYPE partition; - BLOCK_SIZE subsize; - const int has_rows = (mi_row + hbs) < cm->mi_rows; - const int has_cols = (mi_col + hbs) < cm->mi_cols; +static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, int blk_row, + int blk_col, aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + int is_split = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + assert(tx_size > TX_4X4); + + if (depth == MAX_VARTX_DEPTH) { + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } + } + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); - partition = get_partition(cm, mi_row, mi_col, bsize); - subsize = subsize_lookup[partition][bsize]; + if (is_split) { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + if (sub_txs == TX_4X4) { + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = sub_txs; + } + } + 
mbmi->tx_size = sub_txs; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } - if (!hbs && !unify_bsize) { - xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); - xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize); + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r); + } + } } else { - switch (partition) { - case PARTITION_NONE: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize); - break; - case PARTITION_HORZ: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize); - if (has_rows) - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, - subsize); - break; - case PARTITION_VERT: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize); - if (has_cols) - decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, - subsize); - break; - case PARTITION_SPLIT: - detoken_and_recon_sb(pbi, xd, mi_row, mi_col, r, subsize); - detoken_and_recon_sb(pbi, xd, mi_row, mi_col + hbs, r, subsize); - detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col, r, subsize); - detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions -#endif - case PARTITION_HORZ_A: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, subsize); - break; - case PARTITION_HORZ_B: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, 
subsize); - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r, - bsize2); - break; - case PARTITION_VERT_A: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, subsize); - break; - case PARTITION_VERT_B: - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize); - decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2); - decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r, - bsize2); - break; -#endif - default: assert(0 && "Invalid partition type"); + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } } + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + } +} + +static TX_SIZE read_selected_tx_size(MACROBLOCKD *xd, aom_reader *r) { + // TODO(debargha): Clean up the logic here. This function should only + // be called for intra. 
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int ctx = get_tx_size_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], + max_depths + 1, ACCT_STR); + assert(depth >= 0 && depth <= max_depths); + const TX_SIZE tx_size = depth_to_tx_size(depth, bsize); + return tx_size; +} + +static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter, + int allow_select_inter, aom_reader *r) { + const TX_MODE tx_mode = cm->tx_mode; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + + if (block_signals_txsize(bsize)) { + if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { + const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r); + return coded_tx_size; + } else { + return tx_size_from_tx_mode(bsize, tx_mode); + } + } else { + assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); + return max_txsize_rect_lookup[bsize]; } } -#endif static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif // CONFIG_SUPERTX int mi_row, int mi_col, aom_reader *r, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif // CONFIG_EXT_PARTITION_TYPES - BLOCK_SIZE bsize) { - decode_mbmi_block(pbi, xd, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - mi_row, mi_col, r, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - bsize); + PARTITION_TYPE partition, BLOCK_SIZE bsize) { + decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize); + + av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize, + av1_decode_palette_tokens); + + AV1_COMMON *cm = &pbi->common; + MB_MODE_INFO *mbmi = xd->mi[0]; + int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + 
!mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) { + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int height = block_size_high[bsize] >> tx_size_high_log2[0]; + + for (int idy = 0; idy < height; idy += bh) + for (int idx = 0; idx < width; idx += bw) + read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r); + } else { + mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r); + if (inter_block_tx) + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, + mbmi->skip && is_inter_block(mbmi), xd); + } -#if !(CONFIG_MOTION_VAR && NC_MODE_INFO) -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize); -#endif + decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize); } -static PARTITION_TYPE read_partition(AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, aom_reader *r, - int has_rows, int has_cols, +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + aom_reader *r, int has_rows, int has_cols, BLOCK_SIZE bsize) { -#if CONFIG_UNPOISON_PARTITION_CTX - const int ctx = - partition_plane_context(xd, mi_row, mi_col, has_rows, has_cols, bsize); -#else const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); -#endif - PARTITION_TYPE p; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - aom_cdf_prob *partition_cdf = (ctx >= 0) ? ec_ctx->partition_cdf[ctx] : NULL; + if (!has_rows && !has_cols) return PARTITION_SPLIT; + assert(ctx >= 0); + aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx]; if (has_rows && has_cols) { -#if CONFIG_EXT_PARTITION_TYPES - const int num_partition_types = - (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8]) - ? 
EXT_PARTITION_TYPES - : PARTITION_TYPES; -#else - const int num_partition_types = PARTITION_TYPES; -#endif // CONFIG_EXT_PARTITION_TYPES - p = (PARTITION_TYPE)aom_read_symbol(r, partition_cdf, num_partition_types, - ACCT_STR); + return (PARTITION_TYPE)aom_read_symbol( + r, partition_cdf, partition_cdf_length(bsize), ACCT_STR); } else if (!has_rows && has_cols) { assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_vert_alike(cdf, partition_cdf); + partition_gather_vert_alike(cdf, partition_cdf, bsize); assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); - p = aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; - // gather cols - } else if (has_rows && !has_cols) { + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; + } else { + assert(has_rows && !has_cols); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_horz_alike(cdf, partition_cdf); + partition_gather_horz_alike(cdf, partition_cdf, bsize); assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); - p = aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT; - } else { - p = PARTITION_SPLIT; + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? 
PARTITION_SPLIT : PARTITION_VERT; } - - return p; } -#if CONFIG_SUPERTX -static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, - aom_reader *r) { - if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { - return 1; - } else { - const int ctx = av1_get_skip_context(xd); -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR); -#else - const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR); -#endif - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->skip[ctx][skip]; - return skip; - } -} -#endif // CONFIG_SUPERTX - // TODO(slavarnway): eliminate bsize and subsize in future commits static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif int mi_row, int mi_col, aom_reader *r, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; - const int num_8x8_wh = mi_size_wide[bsize]; - const int hbs = num_8x8_wh >> 1; -#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB - const int qbs = num_8x8_wh >> 2; -#endif -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif + const int bw = mi_size_wide[bsize]; + const int hbs = bw >> 1; PARTITION_TYPE partition; BLOCK_SIZE subsize; -#if CONFIG_EXT_PARTITION_TYPES - const int quarter_step = num_8x8_wh / 4; - int i; -#if !CONFIG_EXT_PARTITION_TYPES_AB - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif -#endif + const int quarter_step = bw / 4; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; -#if CONFIG_SUPERTX - const int read_token = !supertx_enabled; - int skip = 0; - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const TileInfo *const tile = &xd->tile; - int txfm = DCT_DCT; -#endif // CONFIG_SUPERTX if (mi_row >= cm->mi_rows || mi_col >= 
cm->mi_cols) return; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1, tile_tl_idx; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1, + &tile_tl_idx)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = tile_tl_idx + rcol + rrow * rstride; + loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx); + } + } + } + } + partition = (bsize < BLOCK_8X8) ? PARTITION_NONE - : read_partition(cm, xd, mi_row, mi_col, r, + : read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, bsize); - subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition); + subsize = get_partition_subsize(bsize, partition); // Check the bitstream is conformant: if there is subsampling on the // chroma planes, subsize must subsample to a valid block size. 
const struct macroblockd_plane *const pd_u = &xd->plane[1]; - if (get_plane_block_size(subsize, pd_u) == BLOCK_INVALID) { + if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == + BLOCK_INVALID) { aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Block size %dx%d invalid with this subsampling mode", block_size_wide[subsize], block_size_high[subsize]); } -#if CONFIG_PVQ - assert(partition < PARTITION_TYPES); - assert(subsize < BLOCK_SIZES_ALL); -#endif -#if CONFIG_SUPERTX - if (!frame_is_intra_only(cm) && partition != PARTITION_NONE && - bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) { - const int supertx_context = partition_supertx_context_lookup[partition]; - supertx_enabled = aom_read( - r, cm->fc->supertx_prob[supertx_context][supertx_size], ACCT_STR); - if (xd->counts) - xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++; -#if CONFIG_VAR_TX - if (supertx_enabled) xd->supertx_size = supertx_size; -#endif - } -#endif // CONFIG_SUPERTX - -#if CONFIG_SUPERTX -#define DEC_BLOCK_STX_ARG supertx_enabled, -#else #define DEC_BLOCK_STX_ARG -#endif -#if CONFIG_EXT_PARTITION_TYPES #define DEC_BLOCK_EPT_ARG partition, -#else -#define DEC_BLOCK_EPT_ARG -#endif #define DEC_BLOCK(db_r, db_c, db_subsize) \ decode_block(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \ DEC_BLOCK_EPT_ARG(db_subsize)) #define DEC_PARTITION(db_r, db_c, db_subsize) \ decode_partition(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize)) - if (!hbs && !unify_bsize) { - // calculate bmode block dimensions (log 2) - xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); - xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); - DEC_BLOCK(mi_row, mi_col, subsize); - } else { - switch (partition) { - case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; - case PARTITION_HORZ: - DEC_BLOCK(mi_row, mi_col, subsize); - if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); - break; - case PARTITION_VERT: - 
DEC_BLOCK(mi_row, mi_col, subsize); - if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); - break; - case PARTITION_SPLIT: - DEC_PARTITION(mi_row, mi_col, subsize); - DEC_PARTITION(mi_row, mi_col + hbs, subsize); - DEC_PARTITION(mi_row + hbs, mi_col, subsize); - DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - DEC_BLOCK(mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4)); - DEC_BLOCK(mi_row + qbs, mi_col, get_subsize(bsize, PARTITION_HORZ_4)); - DEC_BLOCK(mi_row + hbs, mi_col, subsize); - break; - case PARTITION_HORZ_B: - DEC_BLOCK(mi_row, mi_col, subsize); - DEC_BLOCK(mi_row + hbs, mi_col, get_subsize(bsize, PARTITION_HORZ_4)); - if (mi_row + 3 * qbs < cm->mi_rows) - DEC_BLOCK(mi_row + 3 * qbs, mi_col, - get_subsize(bsize, PARTITION_HORZ_4)); - break; - case PARTITION_VERT_A: - DEC_BLOCK(mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4)); - DEC_BLOCK(mi_row, mi_col + qbs, get_subsize(bsize, PARTITION_VERT_4)); - DEC_BLOCK(mi_row, mi_col + hbs, subsize); - break; - case PARTITION_VERT_B: - DEC_BLOCK(mi_row, mi_col, subsize); - DEC_BLOCK(mi_row, mi_col + hbs, get_subsize(bsize, PARTITION_VERT_4)); - if (mi_col + 3 * qbs < cm->mi_cols) - DEC_BLOCK(mi_row, mi_col + 3 * qbs, - get_subsize(bsize, PARTITION_VERT_4)); - break; -#else - case PARTITION_HORZ_A: - DEC_BLOCK(mi_row, mi_col, bsize2); - DEC_BLOCK(mi_row, mi_col + hbs, bsize2); - DEC_BLOCK(mi_row + hbs, mi_col, subsize); - break; - case PARTITION_HORZ_B: - DEC_BLOCK(mi_row, mi_col, subsize); - DEC_BLOCK(mi_row + hbs, mi_col, bsize2); - DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); - break; - case PARTITION_VERT_A: - DEC_BLOCK(mi_row, mi_col, bsize2); - DEC_BLOCK(mi_row + hbs, mi_col, bsize2); - DEC_BLOCK(mi_row, mi_col + hbs, subsize); - break; - case PARTITION_VERT_B: - DEC_BLOCK(mi_row, mi_col, subsize); - DEC_BLOCK(mi_row, mi_col + hbs, bsize2); - DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); - 
break; -#endif - case PARTITION_HORZ_4: - for (i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && this_mi_row >= cm->mi_rows) break; - DEC_BLOCK(this_mi_row, mi_col, subsize); - } - break; - case PARTITION_VERT_4: - for (i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - DEC_BLOCK(mi_row, this_mi_col, subsize); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0 && "Invalid partition type"); - } + switch (partition) { + case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; + case PARTITION_HORZ: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_VERT: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_SPLIT: + DEC_PARTITION(mi_row, mi_col, subsize); + DEC_PARTITION(mi_row, mi_col + hbs, subsize); + DEC_PARTITION(mi_row + hbs, mi_col, subsize); + DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_HORZ_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_VERT_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_VERT_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; + DEC_BLOCK(this_mi_row, mi_col, subsize); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { 
+ int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; + DEC_BLOCK(mi_row, this_mi_col, subsize); + } + break; + default: assert(0 && "Invalid partition type"); } #undef DEC_PARTITION @@ -2472,219 +1513,13 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, #undef DEC_BLOCK_EPT_ARG #undef DEC_BLOCK_STX_ARG -#if CONFIG_SUPERTX - if (supertx_enabled && read_token) { - uint8_t *dst_buf[3]; - int dst_stride[3], i; - int offset = mi_row * cm->mi_stride + mi_col; - - set_segment_id_supertx(cm, mi_row, mi_col, bsize); - - if (cm->delta_q_present_flag) { - for (i = 0; i < MAX_SEGMENTS; i++) { - int j; - for (j = 0; j < MAX_MB_PLANE; ++j) { - const int dc_delta_q = j == 0 ? cm->y_dc_delta_q : cm->uv_dc_delta_q; - const int ac_delta_q = j == 0 ? 0 : cm->uv_ac_delta_q; - - xd->plane[j].seg_dequant[i][0] = - av1_dc_quant(xd->current_qindex, dc_delta_q, cm->bit_depth); - xd->plane[j].seg_dequant[i][1] = - av1_ac_quant(xd->current_qindex, ac_delta_q, cm->bit_depth); - } - } - } - - xd->mi = cm->mi_grid_visible + offset; - xd->mi[0] = cm->mi + offset; - set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col, - mi_size_wide[bsize], -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - set_skip_context(xd, mi_row, mi_col); - skip = read_skip(cm, xd, xd->mi[0]->mbmi.segment_id_supertx, r); - if (skip) { - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - } else { - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#if CONFIG_EXT_TX - if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > - 1) { - const int eset = - get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); - if (eset > 0) { - const TxSetType tx_set_type = get_ext_tx_set_type( - supertx_size, bsize, 1, cm->reduced_tx_set_used); - const int packed_sym = - aom_read_symbol(r, ec_ctx->inter_ext_tx_cdf[eset][supertx_size], - av1_num_ext_tx_set[tx_set_type], 
ACCT_STR); - txfm = av1_ext_tx_inv[tx_set_type][packed_sym]; -#if CONFIG_ENTROPY_STATS - if (xd->counts) ++xd->counts->inter_ext_tx[eset][supertx_size][txfm]; -#endif // CONFIG_ENTROPY_STATS - } - } -#else - if (supertx_size < TX_32X32) { - txfm = aom_read_symbol(r, ec_ctx->inter_ext_tx_cdf[supertx_size], - TX_TYPES, ACCT_STR); -#if CONFIG_ENTROPY_STATS - if (xd->counts) ++xd->counts->inter_ext_tx[supertx_size][txfm]; -#endif // CONFIG_ENTROPY_STATS - } -#endif // CONFIG_EXT_TX - } - - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - for (i = 0; i < MAX_MB_PLANE; i++) { - dst_buf[i] = xd->plane[i].dst.buf; - dst_stride[i] = xd->plane[i].dst.stride; - } - dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col, bsize, - bsize, dst_buf, dst_stride); - - if (!skip) { - int eobtotal = 0; - MB_MODE_INFO *mbmi; - set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col); - mbmi = &xd->mi[0]->mbmi; - mbmi->tx_type = txfm; - assert(mbmi->segment_id_supertx != MAX_SEGMENTS); - for (i = 0; i < MAX_MB_PLANE; ++i) { - const struct macroblockd_plane *const pd = &xd->plane[i]; - int row, col; - const TX_SIZE tx_size = av1_get_tx_size(i, xd); - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int stepr = tx_size_high_unit[tx_size]; - const int stepc = tx_size_wide_unit[tx_size]; - const int max_blocks_wide = max_block_wide(xd, plane_bsize, i); - const int max_blocks_high = max_block_high(xd, plane_bsize, i); - - for (row = 0; row < max_blocks_high; row += stepr) - for (col = 0; col < max_blocks_wide; col += stepc) - eobtotal += reconstruct_inter_block( - cm, xd, r, mbmi->segment_id_supertx, i, row, col, tx_size); - } - if ((unify_bsize || !(subsize < BLOCK_8X8)) && eobtotal == 0) skip = 1; - } - set_param_topblock(cm, xd, bsize, mi_row, mi_col, txfm, skip); - } -#endif // CONFIG_SUPERTX - -#if CONFIG_EXT_PARTITION_TYPES update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - // 
update partition context - if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_LPF_SB - if (bsize == cm->sb_size) { - int filt_lvl; - if (mi_row == 0 && mi_col == 0) { - filt_lvl = aom_read_literal(r, 6, ACCT_STR); - cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0; - cm->mi_grid_visible[0]->mbmi.delta = 0; - cm->mi_grid_visible[0]->mbmi.sign = 0; - } else { - int prev_mi_row, prev_mi_col; - if (mi_col - MAX_MIB_SIZE < 0) { - prev_mi_row = mi_row - MAX_MIB_SIZE; - prev_mi_col = mi_col; - } else { - prev_mi_row = mi_row; - prev_mi_col = mi_col - MAX_MIB_SIZE; - } - - MB_MODE_INFO *curr_mbmi = - &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi; - MB_MODE_INFO *prev_mbmi = - &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi; - const uint8_t prev_lvl = prev_mbmi->filt_lvl; - - const int reuse_ctx = prev_mbmi->reuse_sb_lvl; - const int reuse_prev_lvl = aom_read_symbol( - r, xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2, ACCT_STR); - curr_mbmi->reuse_sb_lvl = reuse_prev_lvl; - - if (reuse_prev_lvl) { - filt_lvl = prev_lvl; - curr_mbmi->delta = 0; - curr_mbmi->sign = 0; - } else { - const int delta_ctx = prev_mbmi->delta; - unsigned int delta = aom_read_symbol( - r, xd->tile_ctx->lpf_delta_cdf[delta_ctx], DELTA_RANGE, ACCT_STR); - curr_mbmi->delta = delta; - delta *= LPF_STEP; - - if (delta) { - const int sign_ctx = prev_mbmi->sign; - const int sign = aom_read_symbol( - r, xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2, ACCT_STR); - curr_mbmi->sign = sign; - filt_lvl = sign ? 
prev_lvl + delta : prev_lvl - delta; - } else { - filt_lvl = prev_lvl; - curr_mbmi->sign = 0; - } - } - } - - av1_loop_filter_sb_level_init(cm, mi_row, mi_col, filt_lvl); - } -#endif - -#if CONFIG_CDEF - if (bsize == cm->sb_size) { - int width_step = mi_size_wide[BLOCK_64X64]; - int height_step = mi_size_wide[BLOCK_64X64]; - int w, h; - for (h = 0; (h < mi_size_high[cm->sb_size]) && (mi_row + h < cm->mi_rows); - h += height_step) { - for (w = 0; (w < mi_size_wide[cm->sb_size]) && (mi_col + w < cm->mi_cols); - w += width_step) { - if (!cm->all_lossless && !sb_all_skip(cm, mi_row + h, mi_col + w)) - cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)] - ->mbmi.cdef_strength = - aom_read_literal(r, cm->cdef_bits, ACCT_STR); - else - cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)] - ->mbmi.cdef_strength = -1; - } - } - } -#endif // CONFIG_CDEF -#if CONFIG_LOOP_RESTORATION - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { - int rcol0, rcol1, rrow0, rrow1, nhtiles; - if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, - &rcol0, &rcol1, &rrow0, &rrow1, - &nhtiles)) { - for (int rrow = rrow0; rrow < rrow1; ++rrow) { - for (int rcol = rcol0; rcol < rcol1; ++rcol) { - int rtile_idx = rcol + rrow * nhtiles; - loop_restoration_read_sb_coeffs(cm, xd, r, plane, rtile_idx); - } - } - } - } -#endif } static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end, const size_t read_size, struct aom_internal_error_info *error_info, - aom_reader *r, -#if CONFIG_ANS && ANS_MAX_SYMBOLS - int window_size, -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - aom_decrypt_cb decrypt_cb, void *decrypt_state) { + aom_reader *r, uint8_t allow_update_cdf) { // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. 
@@ -2692,117 +1527,147 @@ static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end, aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); -#if CONFIG_ANS && ANS_MAX_SYMBOLS - r->window_size = window_size; -#endif - if (aom_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) + if (aom_reader_init(r, data, read_size)) aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); + + r->allow_update_cdf = allow_update_cdf; } static void setup_segmentation(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { struct segmentation *const seg = &cm->seg; - int i, j; seg->update_map = 0; seg->update_data = 0; seg->temporal_update = 0; seg->enabled = aom_rb_read_bit(rb); - if (!seg->enabled) return; + if (!seg->enabled) { + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols)); - // Segmentation map update - if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + memset(seg, 0, sizeof(*seg)); + segfeatures_copy(&cm->cur_frame->seg, seg); + return; + } + if (cm->seg.enabled && cm->prev_frame && + (cm->mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + // Read update flags + if (cm->primary_ref_frame == PRIMARY_REF_NONE) { + // These frames can't use previous frames, so must signal map + features seg->update_map = 1; + seg->temporal_update = 0; + seg->update_data = 1; } else { seg->update_map = aom_rb_read_bit(rb); - } - if (seg->update_map) { - if (frame_is_intra_only(cm) || cm->error_resilient_mode) { - seg->temporal_update = 0; - } else { + if (seg->update_map) { seg->temporal_update = aom_rb_read_bit(rb); + } else { + seg->temporal_update = 0; } + seg->update_data = aom_rb_read_bit(rb); } // Segmentation data update - seg->update_data = aom_rb_read_bit(rb); if (seg->update_data) { - 
seg->abs_delta = aom_rb_read_bit(rb); - av1_clearall_segfeatures(seg); - for (i = 0; i < MAX_SEGMENTS; i++) { - for (j = 0; j < SEG_LVL_MAX; j++) { + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { int data = 0; const int feature_enabled = aom_rb_read_bit(rb); if (feature_enabled) { av1_enable_segfeature(seg, i, j); - data = decode_unsigned_max(rb, av1_seg_feature_data_max(j)); - if (av1_is_segfeature_signed(j)) - data = aom_rb_read_bit(rb) ? -data : data; + + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + + if (av1_is_segfeature_signed(j)) { + data = aom_rb_read_inv_signed_literal(rb, ubits); + } else { + data = aom_rb_read_literal(rb, ubits); + } + + data = clamp(data, data_min, data_max); } av1_set_segdata(seg, i, j, data); } } + calculate_segdata(seg); + } else if (cm->prev_frame) { + segfeatures_copy(seg, &cm->prev_frame->seg); } + segfeatures_copy(&cm->cur_frame->seg, seg); } -#if CONFIG_LOOP_RESTORATION static void decode_restoration_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - int p; - RestorationInfo *rsi = &cm->rst_info[0]; - if (aom_rb_read_bit(rb)) { - rsi->frame_restoration_type = - aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; - } else { - rsi->frame_restoration_type = - aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE; - } - for (p = 1; p < MAX_MB_PLANE; ++p) { - rsi = &cm->rst_info[p]; + assert(!cm->all_lossless); + const int num_planes = av1_num_planes(cm); + if (cm->allow_intrabc) return; + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; if (aom_rb_read_bit(rb)) { rsi->frame_restoration_type = aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; } else { - rsi->frame_restoration_type = RESTORE_NONE; + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? 
RESTORE_SWITCHABLE : RESTORE_NONE; + } + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; } } + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; - cm->rst_info[0].restoration_tilesize = RESTORATION_TILESIZE_MAX; - cm->rst_info[1].restoration_tilesize = RESTORATION_TILESIZE_MAX; - cm->rst_info[2].restoration_tilesize = RESTORATION_TILESIZE_MAX; - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - rsi = &cm->rst_info[0]; - rsi->restoration_tilesize >>= aom_rb_read_bit(rb); - if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) { - rsi->restoration_tilesize >>= aom_rb_read_bit(rb); + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = sb_size; + + RestorationInfo *rsi = &cm->rst_info[0]; + + if (sb_size == 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + if (rsi->restoration_unit_size > 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); } - } - int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); - if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { - cm->rst_info[1].restoration_tilesize = - cm->rst_info[0].restoration_tilesize >> (aom_rb_read_bit(rb) * s); } else { - cm->rst_info[1].restoration_tilesize = cm->rst_info[0].restoration_tilesize; + const int size = RESTORATION_UNITSIZE_MAX; + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = size; } - cm->rst_info[2].restoration_tilesize = cm->rst_info[1].restoration_tilesize; - cm->rst_info[0].procunit_width = cm->rst_info[0].procunit_height = - RESTORATION_PROC_UNIT_SIZE; - cm->rst_info[1].procunit_width = cm->rst_info[2].procunit_width = - 
RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_x; - cm->rst_info[1].procunit_height = cm->rst_info[2].procunit_height = - RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_y; + if (num_planes > 1) { + int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + if (s && !chroma_none) { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); + } else { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size; + } + cm->rst_info[2].restoration_unit_size = + cm->rst_info[1].restoration_unit_size; + } } static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info, WienerInfo *ref_wiener_info, aom_reader *rb) { + memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter)); + memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter)); + if (wiener_win == WIENER_WIN) wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = aom_read_primitive_refsubexpfin( @@ -2860,75 +1725,109 @@ static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info, static void read_sgrproj_filter(SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info, aom_reader *rb) { sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR); - sgrproj_info->xqd[0] = - aom_read_primitive_refsubexpfin( - rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + - SGRPROJ_PRJ_MIN0; - sgrproj_info->xqd[1] = - aom_read_primitive_refsubexpfin( - rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + - SGRPROJ_PRJ_MIN1; + const sgr_params_type *params = &sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + sgrproj_info->xqd[0] = 0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } else if 
(params->r[1] == 0) { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0], + SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); + } else { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, - int rtile_idx) { - const RestorationInfo *rsi = cm->rst_info + plane; + int runit_idx) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationUnitInfo *rui = &rsi->unit_info[runit_idx]; if (rsi->frame_restoration_type == RESTORE_NONE) return; + assert(!cm->all_lossless); + const int wiener_win = (plane > 0) ? 
WIENER_WIN_CHROMA : WIENER_WIN; WienerInfo *wiener_info = xd->wiener_info + plane; SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { - assert(plane == 0); - rsi->restoration_type[rtile_idx] = - aom_read_tree(r, av1_switchable_restore_tree, - cm->fc->switchable_restore_prob, ACCT_STR); - - if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) { - read_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - r); - } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) { - read_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, r); + rui->restoration_type = + aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES, ACCT_STR); + switch (rui->restoration_type) { + case RESTORE_WIENER: + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + break; + case RESTORE_SGRPROJ: + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + break; + default: assert(rui->restoration_type == RESTORE_NONE); break; } } else if (rsi->frame_restoration_type == RESTORE_WIENER) { - if (aom_read(r, RESTORE_NONE_WIENER_PROB, ACCT_STR)) { - rsi->restoration_type[rtile_idx] = RESTORE_WIENER; - read_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - r); + if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_WIENER; + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); } else { - rsi->restoration_type[rtile_idx] = RESTORE_NONE; + rui->restoration_type = RESTORE_NONE; } } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { - if (aom_read(r, RESTORE_NONE_SGRPROJ_PROB, ACCT_STR)) { - rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ; - read_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, r); + if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_SGRPROJ; + read_sgrproj_filter(&rui->sgrproj_info, 
sgrproj_info, r); } else { - rsi->restoration_type[rtile_idx] = RESTORE_NONE; + rui->restoration_type = RESTORE_NONE; } } } -#endif // CONFIG_LOOP_RESTORATION static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); struct loopfilter *lf = &cm->lf; -#if !CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL + if (cm->allow_intrabc || cm->coded_lossless) { + // write default deltas to frame buffer + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + return; + } + assert(!cm->coded_lossless); + if (cm->prev_frame) { + // write deltas to frame buffer + memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } else { + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); + } lf->filter_level[0] = aom_rb_read_literal(rb, 6); lf->filter_level[1] = aom_rb_read_literal(rb, 6); - if (lf->filter_level[0] || lf->filter_level[1]) { - lf->filter_level_u = aom_rb_read_literal(rb, 6); - lf->filter_level_v = aom_rb_read_literal(rb, 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + lf->filter_level_u = aom_rb_read_literal(rb, 6); + lf->filter_level_v = aom_rb_read_literal(rb, 6); + } } -#else - lf->filter_level = aom_rb_read_literal(rb, 6); -#endif -#endif // CONFIG_LPF_SB lf->sharpness_level = aom_rb_read_literal(rb, 3); // Read in loop filter deltas applied at the MB level based on mode or ref @@ -2939,38 +1838,33 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { if (lf->mode_ref_delta_enabled) { lf->mode_ref_delta_update = aom_rb_read_bit(rb); if (lf->mode_ref_delta_update) { - int i; - - for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) + for (int i = 0; i < REF_FRAMES; i++) if (aom_rb_read_bit(rb)) lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); - for (i = 0; i < 
MAX_MODE_LF_DELTAS; i++) + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) if (aom_rb_read_bit(rb)) lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); } } + + // write deltas to frame buffer + memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS); } -#if CONFIG_CDEF static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - int i; -#if CONFIG_CDEF_SINGLEPASS + const int num_planes = av1_num_planes(cm); + if (cm->allow_intrabc) return; cm->cdef_pri_damping = cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3; -#else - cm->cdef_pri_damping = aom_rb_read_literal(rb, 1) + 5; - cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3; -#endif cm->cdef_bits = aom_rb_read_literal(rb, 2); cm->nb_cdef_strengths = 1 << cm->cdef_bits; - for (i = 0; i < cm->nb_cdef_strengths; i++) { + for (int i = 0; i < cm->nb_cdef_strengths; i++) { cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS); - cm->cdef_uv_strengths[i] = cm->subsampling_x == cm->subsampling_y - ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) - : 0; + cm->cdef_uv_strengths[i] = + num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0; } } -#endif // CONFIG_CDEF static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { return aom_rb_read_bit(rb) ? 
aom_rb_read_inv_signed_literal(rb, 6) : 0; @@ -2978,66 +1872,74 @@ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { static void setup_quantization(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); cm->y_dc_delta_q = read_delta_q(rb); - cm->uv_dc_delta_q = read_delta_q(rb); - cm->uv_ac_delta_q = read_delta_q(rb); + if (num_planes > 1) { + int diff_uv_delta = 0; + if (cm->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); + cm->u_dc_delta_q = read_delta_q(rb); + cm->u_ac_delta_q = read_delta_q(rb); + if (diff_uv_delta) { + cm->v_dc_delta_q = read_delta_q(rb); + cm->v_ac_delta_q = read_delta_q(rb); + } else { + cm->v_dc_delta_q = cm->u_dc_delta_q; + cm->v_ac_delta_q = cm->u_ac_delta_q; + } + } cm->dequant_bit_depth = cm->bit_depth; -#if CONFIG_AOM_QM cm->using_qmatrix = aom_rb_read_bit(rb); if (cm->using_qmatrix) { - cm->min_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS); - cm->max_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS); + cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); + cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); + if (!cm->separate_uv_delta_q) + cm->qm_v = cm->qm_u; + else + cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); } else { - cm->min_qmlevel = 0; - cm->max_qmlevel = 0; + cm->qm_y = 0; + cm->qm_u = 0; + cm->qm_v = 0; } -#endif } // Build y/uv dequant values based on segmentation. static void setup_segmentation_dequant(AV1_COMMON *const cm) { -#if CONFIG_AOM_QM const int using_qm = cm->using_qmatrix; - const int minqm = cm->min_qmlevel; - const int maxqm = cm->max_qmlevel; -#endif // When segmentation is disabled, only the first value is used. The // remaining are don't cares. const int max_segments = cm->seg.enabled ? 
MAX_SEGMENTS : 1; for (int i = 0; i < max_segments; ++i) { const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex); - cm->y_dequant[i][0] = av1_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth); - cm->y_dequant[i][1] = av1_ac_quant(qindex, 0, cm->bit_depth); - cm->uv_dequant[i][0] = - av1_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth); - cm->uv_dequant[i][1] = - av1_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth); -#if CONFIG_AOM_QM + cm->y_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth); + cm->u_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth); + cm->u_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth); + cm->v_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth); + cm->v_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth); const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; + cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 && + cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0; // NB: depends on base index so there is only 1 set per frame // No quant weighting when lossless or signalled not using QM - const int qmlevel = (lossless || using_qm == 0) - ? NUM_QM_LEVELS - 1 - : aom_get_qmlevel(cm->base_qindex, minqm, maxqm); + int qmlevel = (lossless || using_qm == 0) ? 
NUM_QM_LEVELS - 1 : cm->qm_y; for (int j = 0; j < TX_SIZES_ALL; ++j) { - cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1); - cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0); - cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1); - cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0); + cm->y_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_Y, j); } -#endif // CONFIG_AOM_QM -#if CONFIG_NEW_QUANT - for (int dq = 0; dq < QUANT_PROFILES; dq++) { - for (int b = 0; b < COEF_BANDS; ++b) { - av1_get_dequant_val_nuq(cm->y_dequant[i][b != 0], b, - cm->y_dequant_nuq[i][dq][b], NULL, dq); - av1_get_dequant_val_nuq(cm->uv_dequant[i][b != 0], b, - cm->uv_dequant_nuq[i][dq][b], NULL, dq); - } + qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_u; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + cm->u_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_U, j); + } + qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_v; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + cm->v_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_V, j); } -#endif // CONFIG_NEW_QUANT } } @@ -3047,23 +1949,21 @@ static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) { } static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { -#if CONFIG_FRAME_SUPERRES cm->render_width = cm->superres_upscaled_width; cm->render_height = cm->superres_upscaled_height; -#else - cm->render_width = cm->width; - cm->render_height = cm->height; -#endif // CONFIG_FRAME_SUPERRES if (aom_rb_read_bit(rb)) - av1_read_frame_size(rb, &cm->render_width, &cm->render_height); + av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height); } -#if CONFIG_FRAME_SUPERRES // TODO(afergs): make "struct aom_read_bit_buffer *const rb"? 
static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb, int *width, int *height) { cm->superres_upscaled_width = *width; cm->superres_upscaled_height = *height; + + const SequenceHeader *const seq_params = &cm->seq_params; + if (!seq_params->enable_superres) return; + if (aom_rb_read_bit(rb)) { cm->superres_scale_denominator = (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS); @@ -3077,7 +1977,6 @@ static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb, cm->superres_scale_denominator = SCALE_NUMERATOR; } } -#endif // CONFIG_FRAME_SUPERRES static void resize_context_buffers(AV1_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT @@ -3111,24 +2010,34 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) { cm->cur_frame->height = cm->height; } -static void setup_frame_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { +static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, + struct aom_read_bit_buffer *rb) { int width, height; BufferPool *const pool = cm->buffer_pool; - av1_read_frame_size(rb, &width, &height); -#if CONFIG_FRAME_SUPERRES + + if (frame_size_override_flag) { + int num_bits_width = cm->seq_params.num_bits_width; + int num_bits_height = cm->seq_params.num_bits_height; + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + if (width > cm->seq_params.max_frame_width || + height > cm->seq_params.max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame dimensions are larger than the maximum values"); + } + } else { + width = cm->seq_params.max_frame_width; + height = cm->seq_params.max_frame_height; + } + setup_superres(cm, rb, &width, &height); -#endif // CONFIG_FRAME_SUPERRES - setup_render_size(cm, rb); resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); lock_buffer_pool(pool); if (aom_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, - 
cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, + cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { unlock_buffer_pool(pool); @@ -3140,25 +2049,22 @@ static void setup_frame_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; - pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; -#if CONFIG_COLORSPACE_HEADERS - pool->frame_bufs[cm->new_fb_idx].buf.transfer_function = - cm->transfer_function; + pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries; + pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics = + cm->transfer_characteristics; + pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients = + cm->matrix_coefficients; + pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome; pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position = cm->chroma_sample_position; -#endif pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; } -static void setup_sb_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - (void)rb; -#if CONFIG_EXT_PARTITION - set_sb_size(cm, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); -#else - set_sb_size(cm, BLOCK_64X64); -#endif // CONFIG_EXT_PARTITION +static void setup_sb_size(SequenceHeader *seq_params, + struct aom_read_bit_buffer *rb) { + set_sb_size(seq_params, aom_rb_read_bit(rb) ? 
BLOCK_128X128 : BLOCK_64X64); } static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, @@ -3172,29 +2078,30 @@ static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, static void setup_frame_size_with_refs(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { int width, height; - int found = 0, i; + int found = 0; int has_valid_ref_frame = 0; BufferPool *const pool = cm->buffer_pool; - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { if (aom_rb_read_bit(rb)) { YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; width = buf->y_crop_width; height = buf->y_crop_height; cm->render_width = buf->render_width; cm->render_height = buf->render_height; -#if CONFIG_FRAME_SUPERRES setup_superres(cm, rb, &width, &height); -#endif // CONFIG_FRAME_SUPERRES + resize_context_buffers(cm, width, height); found = 1; break; } } if (!found) { - av1_read_frame_size(rb, &width, &height); -#if CONFIG_FRAME_SUPERRES + int num_bits_width = cm->seq_params.num_bits_width; + int num_bits_height = cm->seq_params.num_bits_height; + + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); setup_superres(cm, rb, &width, &height); -#endif // CONFIG_FRAME_SUPERRES + resize_context_buffers(cm, width, height); setup_render_size(cm, rb); } @@ -3204,7 +2111,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, // Check to make sure at least one of frames that this frame references // has valid dimensions. 
- for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, @@ -3213,7 +2120,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, if (!has_valid_ref_frame) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth, ref_frame->buf->subsampling_x, @@ -3223,16 +2130,11 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, "Referenced frame has incompatible color format"); } - resize_context_buffers(cm, width, height); - lock_buffer_pool(pool); if (aom_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, - cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, + cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { unlock_buffer_pool(pool); @@ -3244,33 +2146,19 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; - pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; -#if CONFIG_COLORSPACE_HEADERS - pool->frame_bufs[cm->new_fb_idx].buf.transfer_function = - cm->transfer_function; + pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries; + pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics = + cm->transfer_characteristics; + pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients = + 
cm->matrix_coefficients; + pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome; pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position = cm->chroma_sample_position; -#endif pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; } -static void read_tile_group_range(AV1Decoder *pbi, - struct aom_read_bit_buffer *const rb) { - AV1_COMMON *const cm = &pbi->common; - const int num_bits = cm->log2_tile_rows + cm->log2_tile_cols; - const int num_tiles = - cm->tile_rows * cm->tile_cols; // Note: May be < (1<tg_start = aom_rb_read_literal(rb, num_bits); - pbi->tg_size = 1 + aom_rb_read_literal(rb, num_bits); - if (pbi->tg_start + pbi->tg_size > num_tiles) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Tile group extends past last tile in frame"); -} - -#if CONFIG_MAX_TILE - // Same function as av1_read_uniform but reading from uncompresses header wb static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { const int l = get_unsigned_bits(n); @@ -3285,11 +2173,10 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { static void read_tile_info_max_tile(AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { - int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int width_sb = width_mi >> MAX_MIB_SIZE_LOG2; - int height_sb = height_mi >> MAX_MIB_SIZE_LOG2; - int start_sb, size_sb, i; + int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int width_sb = width_mi >> cm->seq_params.mib_size_log2; + int height_sb = height_mi >> cm->seq_params.mib_size_log2; av1_get_tile_limits(cm); cm->uniform_tile_spacing_flag = aom_rb_read_bit(rb); @@ -3304,8 
+2191,11 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm, cm->log2_tile_cols++; } } else { + int i; + int start_sb; for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) { - size_sb = 1 + rb_read_uniform(rb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB)); + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(width_sb, cm->max_tile_width_sb)); cm->tile_col_start_sb[i] = start_sb; start_sb += size_sb; width_sb -= size_sb; @@ -3325,8 +2215,10 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm, cm->log2_tile_rows++; } } else { + int i; + int start_sb; for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) { - size_sb = + const int size_sb = 1 + rb_read_uniform(rb, AOMMIN(height_sb, cm->max_tile_height_sb)); cm->tile_row_start_sb[i] = start_sb; start_sb += size_sb; @@ -3337,110 +2229,61 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm, } av1_calculate_tile_rows(cm); } -#endif -static void read_tile_info(AV1Decoder *const pbi, - struct aom_read_bit_buffer *const rb) { - AV1_COMMON *const cm = &pbi->common; -#if CONFIG_EXT_TILE +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) { cm->single_tile_decoding = 0; if (cm->large_scale_tile) { struct loopfilter *lf = &cm->lf; // Figure out single_tile_decoding by loopfilter_level. - cm->single_tile_decoding = (!lf->filter_level) ? 
1 : 0; -// Read the tile width/height -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128) { - cm->tile_width = aom_rb_read_literal(rb, 5) + 1; - cm->tile_height = aom_rb_read_literal(rb, 5) + 1; - } else { -#endif // CONFIG_EXT_PARTITION - cm->tile_width = aom_rb_read_literal(rb, 6) + 1; - cm->tile_height = aom_rb_read_literal(rb, 6) + 1; -#if CONFIG_EXT_PARTITION - } -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - - cm->tile_width <<= cm->mib_size_log2; - cm->tile_height <<= cm->mib_size_log2; - - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); - - // Get the number of tiles - cm->tile_cols = 1; - while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; - - cm->tile_rows = 1; - while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; - - if (cm->tile_cols * cm->tile_rows > 1) { - // Read the number of bytes used to store tile size - pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; - pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; - } - -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles = 0; -#endif - } else { -#endif // CONFIG_EXT_TILE - -#if CONFIG_MAX_TILE - read_tile_info_max_tile(cm, rb); -#else - int min_log2_tile_cols, max_log2_tile_cols, max_ones; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - // columns - max_ones = max_log2_tile_cols - min_log2_tile_cols; - cm->log2_tile_cols = min_log2_tile_cols; - while (max_ones-- && aom_rb_read_bit(rb)) cm->log2_tile_cols++; - - if (cm->log2_tile_cols > 6) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Invalid number of tile columns"); - - // rows - cm->log2_tile_rows = aom_rb_read_bit(rb); - if (cm->log2_tile_rows) cm->log2_tile_rows += aom_rb_read_bit(rb); + const int no_loopfilter = !(lf->filter_level[0] || 
lf->filter_level[1]); + const int no_cdef = cm->cdef_bits == 0 && cm->cdef_strengths[0] == 0 && + cm->cdef_uv_strengths[0] == 0; + const int no_restoration = + cm->rst_info[0].frame_restoration_type == RESTORE_NONE && + cm->rst_info[1].frame_restoration_type == RESTORE_NONE && + cm->rst_info[2].frame_restoration_type == RESTORE_NONE; + assert(IMPLIES(cm->coded_lossless, no_loopfilter && no_cdef)); + assert(IMPLIES(cm->all_lossless, no_restoration)); + cm->single_tile_decoding = no_loopfilter && no_cdef && no_restoration; + } +} - cm->tile_width = - get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols); - cm->tile_height = - get_tile_size(cm->mi_rows, cm->log2_tile_rows, &cm->tile_rows); +static void read_tile_info(AV1Decoder *const pbi, + struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; -#endif // CONFIG_MAX_TILE -#if CONFIG_DEPENDENT_HORZTILES - if (cm->tile_rows > 1) - cm->dependent_horz_tiles = aom_rb_read_bit(rb); - else - cm->dependent_horz_tiles = 0; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES + read_tile_info_max_tile(cm, rb); + cm->context_update_tile_id = 0; + if (cm->tile_rows * cm->tile_cols > 1) { + // tile to use for cdf update + cm->context_update_tile_id = + aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); // tile size magnitude pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE +} -// each tile group header is in its own tile group OBU -#if !CONFIG_OBU - // Store an index to the location of the tile group information - pbi->tg_size_bit_offset = rb->bit_offset; - read_tile_group_range(pbi, rb); -#endif +#if EXT_TILE_DEBUG +static void read_ext_tile_info(AV1Decoder *const pbi, + struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + // This information is stored as a separate byte. 
+ int mod = rb->bit_offset % CHAR_BIT; + if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod); + assert(rb->bit_offset % CHAR_BIT == 0); + + if (cm->tile_cols * cm->tile_rows > 1) { + // Read the number of bytes used to store tile size + pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } } +#endif // EXT_TILE_DEBUG -static int mem_get_varsize(const uint8_t *src, int sz) { +static size_t mem_get_varsize(const uint8_t *src, int sz) { switch (sz) { case 1: return src[0]; case 2: return mem_get_le16(src); @@ -3450,14 +2293,14 @@ static int mem_get_varsize(const uint8_t *src, int sz) { } } -#if CONFIG_EXT_TILE +#if EXT_TILE_DEBUG // Reads the next tile returning its size and adjusting '*data' accordingly -// based on 'is_last'. +// based on 'is_last'. On return, '*data' is updated to point to the end of the +// raw tile buffer in the bit stream. static void get_ls_tile_buffer( const uint8_t *const data_end, struct aom_internal_error_info *error_info, - const uint8_t **data, aom_decrypt_cb decrypt_cb, void *decrypt_state, - TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int tile_size_bytes, - int col, int row, int tile_copy_mode) { + const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], + int tile_size_bytes, int col, int row, int tile_copy_mode) { size_t size; size_t copy_size = 0; @@ -3466,15 +2309,7 @@ static void get_ls_tile_buffer( if (!read_is_valid(*data, tile_size_bytes, data_end)) aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - if (decrypt_cb) { - uint8_t be_data[4]; - decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes); - - // Only read number of bytes in cm->tile_size_bytes. 
- size = mem_get_varsize(be_data, tile_size_bytes); - } else { - size = mem_get_varsize(*data, tile_size_bytes); - } + size = mem_get_varsize(*data, tile_size_bytes); // If tile_copy_mode = 1, then the top bit of the tile header indicates copy // mode. @@ -3486,6 +2321,8 @@ static void get_ls_tile_buffer( copy_data = tile_buffers[row - offset][col].data; copy_size = tile_buffers[row - offset][col].size; size = 0; + } else { + size += AV1_MIN_TILE_SIZE_BYTES; } *data += tile_size_bytes; @@ -3503,30 +2340,31 @@ static void get_ls_tile_buffer( } *data += size; - - tile_buffers[row][col].raw_data_end = *data; } -static void get_ls_tile_buffers( +// Returns the end of the last tile buffer +// (tile_buffers[cm->tile_rows - 1][cm->tile_cols - 1]). +static const uint8_t *get_ls_tile_buffers( AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { AV1_COMMON *const cm = &pbi->common; const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; const int have_tiles = tile_cols * tile_rows > 1; + const uint8_t *raw_data_end; // The end of the last tile buffer if (!have_tiles) { const size_t tile_size = data_end - data; tile_buffers[0][0].data = data; tile_buffers[0][0].size = tile_size; - tile_buffers[0][0].raw_data_end = NULL; + raw_data_end = NULL; } else { // We locate only the tile buffers that are required, which are the ones // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always // need the last (bottom right) tile buffer, as we need to know where the // end of the compressed frame buffer is for proper superframe decoding. 
- const uint8_t *tile_col_data_end[MAX_TILE_COLS]; + const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL }; const uint8_t *const data_start = data; const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); @@ -3543,12 +2381,11 @@ static void get_ls_tile_buffers( const int tile_copy_mode = ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0; - size_t tile_col_size; - int r, c; - // Read tile column sizes for all columns (we need the last tile buffer) - for (c = 0; c < tile_cols; ++c) { + for (int c = 0; c < tile_cols; ++c) { const int is_last = c == tile_cols - 1; + size_t tile_col_size; + if (!is_last) { tile_col_size = mem_get_varsize(data, tile_col_size_bytes); data += tile_col_size_bytes; @@ -3563,7 +2400,7 @@ static void get_ls_tile_buffers( data = data_start; // Read the required tile sizes. - for (c = tile_cols_start; c < tile_cols_end; ++c) { + for (int c = tile_cols_start; c < tile_cols_end; ++c) { const int is_last = c == tile_cols - 1; if (c > 0) data = tile_col_data_end[c - 1]; @@ -3571,40 +2408,45 @@ static void get_ls_tile_buffers( if (!is_last) data += tile_col_size_bytes; // Get the whole of the last column, otherwise stop at the required tile. - for (r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) { - tile_buffers[r][c].col = c; - + for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) { get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, - pbi->decrypt_cb, pbi->decrypt_state, tile_buffers, - tile_size_bytes, c, r, tile_copy_mode); + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } // If we have not read the last column, then read it to get the last tile. 
if (tile_cols_end != tile_cols) { - c = tile_cols - 1; + const int c = tile_cols - 1; data = tile_col_data_end[c - 1]; - for (r = 0; r < tile_rows; ++r) { - tile_buffers[r][c].col = c; - + for (int r = 0; r < tile_rows; ++r) { get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, - pbi->decrypt_cb, pbi->decrypt_state, tile_buffers, - tile_size_bytes, c, r, tile_copy_mode); + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } + raw_data_end = data; } + return raw_data_end; +} +#endif // EXT_TILE_DEBUG + +static const uint8_t *get_ls_single_tile_buffer( + AV1Decoder *pbi, const uint8_t *data, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0); + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data; + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size = + (size_t)pbi->coded_tile_data_size; + return data + pbi->coded_tile_data_size; } -#endif // CONFIG_EXT_TILE // Reads the next tile returning its size and adjusting '*data' accordingly // based on 'is_last'. 
static void get_tile_buffer(const uint8_t *const data_end, const int tile_size_bytes, int is_last, struct aom_internal_error_info *error_info, - const uint8_t **data, aom_decrypt_cb decrypt_cb, - void *decrypt_state, TileBufferDec *const buf) { + const uint8_t **data, TileBufferDec *const buf) { size_t size; if (!is_last) { @@ -3612,13 +2454,7 @@ static void get_tile_buffer(const uint8_t *const data_end, aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - if (decrypt_cb) { - uint8_t be_data[4]; - decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes); - size = mem_get_varsize(be_data, tile_size_bytes); - } else { - size = mem_get_varsize(*data, tile_size_bytes); - } + size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES; *data += tile_size_bytes; if (size > (size_t)(data_end - *data)) @@ -3637,140 +2473,123 @@ static void get_tile_buffer(const uint8_t *const data_end, static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], - int startTile, int endTile) { + int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; - int r, c; const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; int tc = 0; int first_tile_in_tg = 0; - struct aom_read_bit_buffer rb_tg_hdr; - uint8_t clear_data[MAX_AV1_HEADER_SIZE]; -#if !CONFIG_OBU - const size_t hdr_size = pbi->uncomp_hdr_size + pbi->first_partition_size; - const int tg_size_bit_offset = pbi->tg_size_bit_offset; -#else - const int tg_size_bit_offset = 0; -#endif - -#if CONFIG_DEPENDENT_HORZTILES - int tile_group_start_col = 0; - int tile_group_start_row = 0; -#endif - for (r = 0; r < tile_rows; ++r) { - for (c = 0; c < tile_cols; ++c, ++tc) { + for (int r = 0; r < tile_rows; ++r) { + for (int c = 0; c < tile_cols; ++c, ++tc) { TileBufferDec *const buf = &tile_buffers[r][c]; -#if CONFIG_OBU - const int is_last = (tc == endTile); + + const int 
is_last = (tc == end_tile); const size_t hdr_offset = 0; -#else - const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); - const size_t hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0; -#endif - if (tc < startTile || tc > endTile) continue; + if (tc < start_tile || tc > end_tile) continue; if (data + hdr_offset >= data_end) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Data ended before all tiles were read."); - buf->col = c; - if (hdr_offset) { - init_read_bit_buffer(pbi, &rb_tg_hdr, data, data_end, clear_data); - rb_tg_hdr.bit_offset = tg_size_bit_offset; - read_tile_group_range(pbi, &rb_tg_hdr); -#if CONFIG_DEPENDENT_HORZTILES - tile_group_start_row = r; - tile_group_start_col = c; -#endif - } first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0; data += hdr_offset; get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, - &pbi->common.error, &data, pbi->decrypt_cb, - pbi->decrypt_state, buf); -#if CONFIG_DEPENDENT_HORZTILES - cm->tile_group_start_row[r][c] = tile_group_start_row; - cm->tile_group_start_col[r][c] = tile_group_start_col; -#endif + &pbi->common.error, &data, buf); } } } -#if CONFIG_PVQ -static void daala_dec_init(AV1_COMMON *const cm, daala_dec_ctx *daala_dec, - aom_reader *r) { - daala_dec->r = r; +static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer, + const int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane]; + xd->plane[plane].eob_data = cb_buffer->eob_data[plane]; + xd->cb_offset[plane] = 0; + xd->txb_offset[plane] = 0; + } + xd->plane[0].color_index_map = cb_buffer->color_index_map[0]; + xd->plane[1].color_index_map = cb_buffer->color_index_map[1]; + xd->color_index_map_offset[0] = 0; + xd->color_index_map_offset[1] = 0; +} + +static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, + TileInfo tile_info, const int mi_row) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = 
av1_num_planes(cm); + av1_zero_left_context(&td->xd); - // TODO(yushin) : activity masking info needs be signaled by a bitstream - daala_dec->use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING; + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(&td->xd, &td->cb_buffer_base, num_planes); - if (daala_dec->use_activity_masking) - daala_dec->qm = OD_HVS_QM; - else - daala_dec->qm = OD_FLAT_QM; + decode_partition(pbi, &td->xd, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size); + } +} - od_init_qm(daala_dec->state.qm, daala_dec->state.qm_inv, - daala_dec->qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); +static int check_trailing_bits_after_symbol_coder(aom_reader *r) { + uint32_t nb_bits = aom_reader_tell(r); + uint32_t nb_bytes = (nb_bits + 7) >> 3; - if (daala_dec->use_activity_masking) { - int pli; - int use_masking = daala_dec->use_activity_masking; - int segment_id = 0; - int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + const uint8_t *p_begin = aom_reader_find_begin(r); + const uint8_t *p_end = aom_reader_find_end(r); - for (pli = 0; pli < MAX_MB_PLANE; pli++) { - int i; - int q; + // It is legal to have no padding bytes (nb_bytes == p_end - p_begin). 
+ if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1; + const uint8_t *p = p_begin + nb_bytes; - q = qindex; - if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) { - od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][0][pli], NULL); - } else { - i = 0; - while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL && - q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q - << OD_COEFF_SHIFT) { - i++; - } - od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][i][pli], - &OD_DEFAULT_QMS[use_masking][i + 1][pli]); - } - } + // aom_reader_tell() returns 1 for a newly initialized decoder, and the + // return value only increases as values are decoded. So nb_bits > 0, and + // thus p > p_begin. Therefore accessing p[-1] is safe. + uint8_t last_byte = p[-1]; + uint8_t pattern = 128 >> ((nb_bits - 1) & 7); + if ((last_byte & (2 * pattern - 1)) != pattern) return -1; + + // Make sure that all padding bytes are zero as required by the spec. 
+ while (p < p_end) { + if (*p != 0) return -1; + p++; } + return 0; } -#endif // #if CONFIG_PVQ -#if CONFIG_LOOPFILTERING_ACROSS_TILES -static void dec_setup_across_tile_boundary_info( - const AV1_COMMON *const cm, const TileInfo *const tile_info) { - if (tile_info->mi_row_start >= tile_info->mi_row_end || - tile_info->mi_col_start >= tile_info->mi_col_end) - return; +static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row, + int tile_col) { + TileInfo tile_info; + + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + av1_tile_set_row(&tile_info, cm, tile_row); + av1_tile_set_col(&tile_info, cm, tile_col); + av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end, + tile_row); + av1_reset_loop_restoration(&td->xd, num_planes); - if (!cm->loop_filter_across_tiles_enabled) { - av1_setup_across_tile_boundary_info(cm, tile_info); + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + decode_tile_sb_row(pbi, td, tile_info, mi_row); } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 
1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); } -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, int startTile, - int endTile) { + const uint8_t *data_end, int start_tile, + int end_tile) { AV1_COMMON *const cm = &pbi->common; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; const int n_tiles = tile_cols * tile_rows; TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; -#if CONFIG_EXT_TILE const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); const int single_row = pbi->dec_tile_row >= 0; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; -#endif // CONFIG_EXT_TILE int tile_rows_start; int tile_rows_end; int tile_cols_start; @@ -3778,8 +2597,9 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, int inv_col_order; int inv_row_order; int tile_row, tile_col; + uint8_t allow_update_cdf; + const uint8_t *raw_data_end = NULL; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { tile_rows_start = single_row ? dec_tile_row : 0; tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; @@ -3787,46 +2607,38 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, tile_cols_end = single_col ? 
tile_cols_start + 1 : tile_cols; inv_col_order = pbi->inv_tile_order && !single_col; inv_row_order = pbi->inv_tile_order && !single_row; + allow_update_cdf = 0; } else { -#endif // CONFIG_EXT_TILE tile_rows_start = 0; tile_rows_end = tile_rows; tile_cols_start = 0; tile_cols_end = tile_cols; inv_col_order = pbi->inv_tile_order; inv_row_order = pbi->inv_tile_order; -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - - if (cm->lf.filter_level && !cm->skip_loop_filter && - pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, - aom_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (AVxWorkerHook)av1_loop_filter_worker; - if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, - "Loop filter thread creation failed"); - } + allow_update_cdf = 1; } - if (cm->lf.filter_level && !cm->skip_loop_filter) { - LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; - // Be sure to sync as we might be resuming after a failed frame decode. - winterface->sync(&pbi->lf_worker); - av1_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm, - pbi->mb.plane); - } + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * cm->tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * cm->tile_cols + tile_cols_end - 1 < start_tile) + return data; + + allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update; assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) - get_ls_tile_buffers(pbi, data, data_end, tile_buffers); +#if EXT_TILE_DEBUG + if (cm->large_scale_tile && !pbi->ext_tile_debug) + raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers); + else if (cm->large_scale_tile && pbi->ext_tile_debug) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); else -#endif // CONFIG_EXT_TILE - get_tile_buffers(pbi, data, data_end, tile_buffers, startTile, endTile); +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { aom_free(pbi->tile_data); @@ -3839,754 +2651,874 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, aom_accounting_reset(&pbi->accounting); } #endif - // Load all tile information into tile_data. + // Load all tile information into thread_data. for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; + for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { - const TileBufferDec *const buf = &tile_buffers[tile_row][tile_col]; - TileData *const td = pbi->tile_data + tile_cols * tile_row + tile_col; + const int col = inv_col_order ? 
tile_cols - 1 - tile_col : tile_col; + ThreadData *const td = &pbi->td; + TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col; + const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; - if (tile_row * cm->tile_cols + tile_col < startTile || - tile_row * cm->tile_cols + tile_col > endTile) + if (row * cm->tile_cols + col < start_tile || + row * cm->tile_cols + col > end_tile) continue; - td->cm = cm; td->xd = pbi->mb; td->xd.corrupted = 0; - td->xd.counts = - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD - ? &cm->counts - : NULL; + td->xd.mc_buf[0] = pbi->td.mc_buf[0]; + td->xd.mc_buf[1] = pbi->td.mc_buf[1]; + td->bit_reader = &tile_data->bit_reader; av1_zero(td->dqcoeff); -#if CONFIG_PVQ - av1_zero(td->pvq_ref_coeff); -#endif - av1_tile_init(&td->xd.tile, td->cm, tile_row, tile_col); - setup_bool_decoder(buf->data, data_end, buf->size, &cm->error, - &td->bit_reader, -#if CONFIG_ANS && ANS_MAX_SYMBOLS - 1 << cm->ans_window_size_log2, -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - pbi->decrypt_cb, pbi->decrypt_state); + av1_tile_init(&td->xd.tile, cm, row, col); + setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, + &cm->error, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { - td->bit_reader.accounting = &pbi->accounting; + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); } else { - td->bit_reader.accounting = NULL; + td->bit_reader->accounting = NULL; } #endif - av1_init_macroblockd(cm, &td->xd, -#if CONFIG_PVQ - td->pvq_ref_coeff, -#endif -#if CONFIG_CFL - &td->cfl, -#endif - td->dqcoeff); + av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + av1_init_above_context(cm, &td->xd, row); // Initialise the tile context from the frame context - td->tctx = *cm->fc; - td->xd.tile_ctx = &td->tctx; - -#if CONFIG_PVQ - daala_dec_init(cm, &td->xd.daala_dec, &td->bit_reader); - 
td->xd.daala_dec.state.adapt = &td->tctx.pvq_context; -#endif + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; - td->xd.plane[0].color_index_map = td->color_index_map[0]; - td->xd.plane[1].color_index_map = td->color_index_map[1]; -#if CONFIG_MRC_TX - td->xd.mrc_mask = td->mrc_mask; -#endif // CONFIG_MRC_TX + // decode tile + decode_tile(pbi, &pbi->td, row, col); + aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted); + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); } } - for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { - const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; - int mi_row = 0; - TileInfo tile_info; - - av1_tile_set_row(&tile_info, cm, row); - - for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { - const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col; - TileData *const td = pbi->tile_data + tile_cols * row + col; + if (cm->large_scale_tile) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; - if (tile_row * cm->tile_cols + tile_col < startTile || - tile_row * cm->tile_cols + tile_col > endTile) - continue; + return aom_reader_find_end(&tile_data->bit_reader); +} -#if CONFIG_ACCOUNTING - if (pbi->acct_enabled) { - td->bit_reader.accounting->last_tell_frac = - aom_reader_tell_frac(&td->bit_reader); - } -#endif +static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { + TileJobsDec *cur_job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tile_mt_info->job_mutex); - av1_tile_set_col(&tile_info, cm, col); + if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) { + cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued; + 
tile_mt_info->jobs_dequeued++; + } -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); - if (!cm->dependent_horz_tiles || tile_row == 0 || - tile_info.tg_horz_boundary) { - av1_zero_above_context(cm, tile_info.mi_col_start, - tile_info.mi_col_end); - } + pthread_mutex_unlock(tile_mt_info->job_mutex); #else - av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end); + (void)tile_mt_info; #endif -#if CONFIG_LOOP_RESTORATION - for (int p = 0; p < MAX_MB_PLANE; ++p) { - set_default_wiener(td->xd.wiener_info + p); - set_default_sgrproj(td->xd.sgrproj_info + p); - } -#endif // CONFIG_LOOP_RESTORATION - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - dec_setup_across_tile_boundary_info(cm, &tile_info); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES + return cur_job_info; +} - for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->mib_size) { - int mi_col; +static int tile_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; - av1_zero_left_context(&td->xd); + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; + return 0; + } + allow_update_cdf = cm->large_scale_tile ? 
0 : 1; + allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update; - for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->mib_size) { -#if CONFIG_NCOBMC_ADAPT_WEIGHT - alloc_ncobmc_pred_buffer(&td->xd); - set_sb_mi_boundaries(cm, &td->xd, mi_row, mi_col); -#endif - decode_partition(pbi, &td->xd, -#if CONFIG_SUPERTX - 0, -#endif // CONFIG_SUPERTX - mi_row, mi_col, &td->bit_reader, cm->sb_size); -#if NC_MODE_INFO && CONFIG_MOTION_VAR - detoken_and_recon_sb(pbi, &td->xd, mi_row, mi_col, &td->bit_reader, - cm->sb_size); -#endif -#if CONFIG_NCOBMC_ADAPT_WEIGHT - free_ncobmc_pred_buffer(&td->xd); -#endif - } - aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted); - if (pbi->mb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Failed to decode tile data"); - } - } + assert(cm->tile_cols > 0); + while (1) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); -#if !CONFIG_OBU - assert(mi_row > 0); -#endif + if (cur_job_info != NULL && !td->xd.corrupted) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + volatile int tile_row = tile_data->tile_info.tile_row; + volatile int tile_col = tile_data->tile_info.tile_col; -// when Parallel deblocking is enabled, deblocking should not -// be interleaved with decoding. Instead, deblocking should be done -// after the entire frame is decoded. -#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING && !CONFIG_CB4X4 - // Loopfilter one tile row. - // Note: If out-of-order tile decoding is used(for example, inv_row_order - // = 1), the loopfiltering has be done after all tile rows are decoded. 
- if (!inv_row_order && cm->lf.filter_level && !cm->skip_loop_filter) { - LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; - const int lf_start = AOMMAX(0, tile_info.mi_row_start - cm->mib_size); - const int lf_end = tile_info.mi_row_end - cm->mib_size; - - // Delay the loopfilter if the first tile row is only - // a single superblock high. - if (lf_end <= 0) continue; - - // Decoding has completed. Finish up the loop filter in this thread. - if (tile_info.mi_row_end >= cm->mi_rows) continue; - - winterface->sync(&pbi->lf_worker); - lf_data->start = lf_start; - lf_data->stop = lf_end; - if (pbi->max_threads > 1) { - winterface->launch(&pbi->lf_worker); + td->xd = pbi->mb; + td->xd.corrupted = 0; + td->xd.mc_buf[0] = td->mc_buf[0]; + td->xd.mc_buf[1] = td->mc_buf[1]; + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->dqcoeff); + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + setup_bool_decoder(tile_buffer->data, thread_data->data_end, + tile_buffer->size, &cm->error, td->bit_reader, + allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); } else { - winterface->execute(&pbi->lf_worker); + td->bit_reader->accounting = NULL; } - } -#endif // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING - - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock row. - if (cm->frame_parallel_decode) - av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2); - } +#endif + av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + av1_init_above_context(cm, &td->xd, tile_row); -#if CONFIG_VAR_TX || CONFIG_CB4X4 -// Loopfilter the whole frame. 
-#if CONFIG_LPF_SB - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, - cm->lf.filter_level, 0, 0, 0, 0); -#else -#if CONFIG_LOOPFILTER_LEVEL - if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, - cm->lf.filter_level[0], cm->lf.filter_level[1], 0, 0); - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, - cm->lf.filter_level_u, cm->lf.filter_level_u, 1, 0); - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, - cm->lf.filter_level_v, cm->lf.filter_level_v, 2, 0); - } -#else -#if CONFIG_OBU - if (endTile == cm->tile_rows * cm->tile_cols - 1) + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + tile_data->bit_reader.accounting->last_tell_frac = + aom_reader_tell_frac(&tile_data->bit_reader); + } #endif - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, - cm->lf.filter_level, 0, 0); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB -#else -#if CONFIG_PARALLEL_DEBLOCKING - // Loopfilter all rows in the frame in the frame. - if (cm->lf.filter_level && !cm->skip_loop_filter) { - LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; - winterface->sync(&pbi->lf_worker); - lf_data->start = 0; - lf_data->stop = cm->mi_rows; - winterface->execute(&pbi->lf_worker); - } -#else - // Loopfilter remaining rows in the frame. 
- if (cm->lf.filter_level && !cm->skip_loop_filter) { - LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; - winterface->sync(&pbi->lf_worker); - lf_data->start = lf_data->stop; - lf_data->stop = cm->mi_rows; - winterface->execute(&pbi->lf_worker); - } -#endif // CONFIG_PARALLEL_DEBLOCKING -#endif // CONFIG_VAR_TX - if (cm->frame_parallel_decode) - av1_frameworker_broadcast(pbi->cur_buf, INT_MAX); - -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - if (n_tiles == 1) { -#if CONFIG_ANS - return data_end; -#else - // Find the end of the single tile buffer - return aom_reader_find_end(&pbi->tile_data->bit_reader); -#endif // CONFIG_ANS + // decode tile + decode_tile(pbi, td, tile_row, tile_col); } else { - // Return the end of the last tile buffer - return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end; + break; } - } else { -#endif // CONFIG_EXT_TILE -#if CONFIG_ANS - return data_end; -#else -#if !CONFIG_OBU - { - // Get last tile data. - TileData *const td = pbi->tile_data + tile_cols * tile_rows - 1; - return aom_reader_find_end(&td->bit_reader); - } -#else - TileData *const td = pbi->tile_data + endTile; - return aom_reader_find_end(&td->bit_reader); -#endif -#endif // CONFIG_ANS -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE + return !td->xd.corrupted; } -static int tile_worker_hook(TileWorkerData *const tile_data, - const TileInfo *const tile) { - AV1Decoder *const pbi = tile_data->pbi; - const AV1_COMMON *const cm = &pbi->common; - int mi_row, mi_col; +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileJobsDec *const buf1 = (const TileJobsDec *)a; + const TileJobsDec *const buf2 = (const TileJobsDec *)b; + return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size)); +} - if (setjmp(tile_data->error_info.jmp)) { - tile_data->error_info.setjmp = 0; - aom_merge_corrupted_flag(&tile_data->xd.corrupted, 1); - return 0; +static void enqueue_tile_jobs(AV1Decoder *pbi, 
AV1_COMMON *cm, + int tile_rows_start, int tile_rows_end, + int tile_cols_start, int tile_cols_end, + int startTile, int endTile) { + AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info; + TileJobsDec *tile_job_queue = tile_mt_info->job_queue; + tile_mt_info->jobs_enqueued = 0; + tile_mt_info->jobs_dequeued = 0; + + for (int row = tile_rows_start; row < tile_rows_end; row++) { + for (int col = tile_cols_start; col < tile_cols_end; col++) { + if (row * cm->tile_cols + col < startTile || + row * cm->tile_cols + col > endTile) + continue; + tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col]; + tile_job_queue->tile_data = pbi->tile_data + row * cm->tile_cols + col; + tile_job_queue++; + tile_mt_info->jobs_enqueued++; + } } +} + +static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm, + int tile_rows, int tile_cols) { + tile_mt_info->alloc_tile_rows = tile_rows; + tile_mt_info->alloc_tile_cols = tile_cols; + int num_tiles = tile_rows * tile_cols; +#if CONFIG_MULTITHREAD + { + CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex, + aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles)); - tile_data->error_info.setjmp = 1; - tile_data->xd.error_info = &tile_data->error_info; -#if CONFIG_DEPENDENT_HORZTILES - if (!cm->dependent_horz_tiles || tile->tg_horz_boundary) { - av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end); + for (int i = 0; i < num_tiles; i++) { + pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL); + } } -#else - av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end); #endif + CHECK_MEM_ERROR(cm, tile_mt_info->job_queue, + aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); +} - for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += cm->mib_size) { - av1_zero_left_context(&tile_data->xd); - - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += cm->mib_size) { - decode_partition(pbi, &tile_data->xd, -#if CONFIG_SUPERTX - 0, -#endif - mi_row, mi_col, 
&tile_data->bit_reader, cm->sb_size); -#if NC_MODE_INFO && CONFIG_MOTION_VAR - detoken_and_recon_sb(pbi, &tile_data->xd, mi_row, mi_col, - &tile_data->bit_reader, cm->sb_size); -#endif - } +void av1_free_mc_tmp_buf(void *td, int use_highbd) { + ThreadData *thread_data = (ThreadData *)td; + int ref; + for (ref = 0; ref < 2; ref++) { + if (use_highbd) + aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref])); + else + aom_free(thread_data->mc_buf[ref]); + thread_data->mc_buf[ref] = NULL; } - return !tile_data->xd.corrupted; + thread_data->mc_buf_size = 0; } -// sorts in descending order -static int compare_tile_buffers(const void *a, const void *b) { - const TileBufferDec *const buf1 = (const TileBufferDec *)a; - const TileBufferDec *const buf2 = (const TileBufferDec *)b; - return (int)(buf2->size - buf1->size); +static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size, + int use_highbd) { + ThreadData *thread_data = (ThreadData *)td; + + for (int ref = 0; ref < 2; ref++) { + if (use_highbd) { + uint16_t *hbd_mc_buf; + CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size)); + thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf); + } else { + CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref], + (uint8_t *)aom_memalign(16, buf_size)); + } + } + thread_data->mc_buf_size = buf_size; } static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end) { + const uint8_t *data_end, int start_tile, + int end_tile) { AV1_COMMON *const cm = &pbi->common; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; - const int num_workers = AOMMIN(pbi->max_threads & ~1, tile_cols); + const int n_tiles = tile_cols * tile_rows; TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; -#if CONFIG_EXT_TILE const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); const int single_row = pbi->dec_tile_row >= 
0; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; -#endif // CONFIG_EXT_TILE int tile_rows_start; int tile_rows_end; int tile_cols_start; int tile_cols_end; - int tile_row, tile_col; - int i; + int tile_count_tg; + int num_workers; + int worker_idx; + const uint8_t *raw_data_end = NULL; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { tile_rows_start = single_row ? dec_tile_row : 0; tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; tile_cols_start = single_col ? dec_tile_col : 0; tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; } else { -#endif // CONFIG_EXT_TILE tile_rows_start = 0; tile_rows_end = tile_rows; tile_cols_start = 0; tile_cols_end = tile_cols; -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE + tile_count_tg = end_tile - start_tile + 1; + num_workers = AOMMIN(pbi->max_threads, tile_count_tg); -#if !CONFIG_ANS - int final_worker = -1; -#endif // !CONFIG_ANS + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. + (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); - - assert(tile_cols * tile_rows > 1); - - // TODO(jzern): See if we can remove the restriction of passing in max - // threads to the decoder. 
- if (pbi->num_tile_workers == 0) { - const int num_threads = pbi->max_threads & ~1; + assert(tile_count_tg > 0); + assert(num_workers > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + // Create workers and thread_data + if (pbi->num_workers == 0) { + const int num_threads = pbi->max_threads; CHECK_MEM_ERROR(cm, pbi->tile_workers, aom_malloc(num_threads * sizeof(*pbi->tile_workers))); - // Ensure tile data offsets will be properly aligned. This may fail on - // platforms without DECLARE_ALIGNED(). - assert((sizeof(*pbi->tile_worker_data) % 16) == 0); - CHECK_MEM_ERROR( - cm, pbi->tile_worker_data, - aom_memalign(32, num_threads * sizeof(*pbi->tile_worker_data))); - CHECK_MEM_ERROR(cm, pbi->tile_worker_info, - aom_malloc(num_threads * sizeof(*pbi->tile_worker_info))); - for (i = 0; i < num_threads; ++i) { - AVxWorker *const worker = &pbi->tile_workers[i]; - ++pbi->num_tile_workers; + CHECK_MEM_ERROR(cm, pbi->thread_data, + aom_malloc(num_threads * sizeof(*pbi->thread_data))); + + for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + ++pbi->num_workers; winterface->init(worker); - if (i < num_threads - 1 && !winterface->reset(worker)) { + if (worker_idx < num_threads - 1 && !winterface->reset(worker)) { aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Tile decoder thread creation failed"); } + + if (worker_idx < num_threads - 1) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + } else { + // Main thread acts as a worker and uses the thread data in pbi + thread_data->td = &pbi->td; + } + } + } + const int use_highbd = cm->use_highbitdepth ? 
1 : 0; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td->mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(thread_data->td, use_highbd); + allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); } } + // get tile size in tile group +#if EXT_TILE_DEBUG + if (cm->large_scale_tile) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + aom_free(pbi->tile_data); + CHECK_MEM_ERROR(cm, pbi->tile_data, + aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); + pbi->allocated_tiles = n_tiles; + } + // Reset tile decoding hook - for (i = 0; i < num_workers; ++i) { - AVxWorker *const worker = &pbi->tile_workers[i]; + for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; winterface->sync(worker); - worker->hook = (AVxWorkerHook)tile_worker_hook; - worker->data1 = &pbi->tile_worker_data[i]; - worker->data2 = &pbi->tile_worker_info[i]; + + worker->hook = tile_worker_hook; + worker->data1 = thread_data; + worker->data2 = pbi; + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + } + } + + if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || + pbi->tile_mt_info.alloc_tile_rows != tile_rows) { + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + } + 
enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile); + qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, + sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); + + { + const int base = tile_count_tg / num_workers; + const int remain = tile_count_tg % num_workers; + int tile_start = start_tile; + int corrupted = 0; + + for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + // compute number of tiles assign to each worker + const int count = base + (remain + worker_idx) / num_workers; + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; + + thread_data->data_end = data_end; + tile_start += count; + + worker->had_error = 0; + if (worker_idx == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; worker_idx > 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; + aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); + } + + pbi->mb.corrupted = corrupted; } - // Initialize thread frame counts. 
- if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - for (i = 0; i < num_workers; ++i) { - TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1; - av1_zero(twd->counts); + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (cm->large_scale_tile) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); } + // Return the end of the last tile buffer + return raw_data_end; } + TileDataDec *const tile_data = pbi->tile_data + end_tile; -// Load tile data into tile_buffers -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) - get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + return aom_reader_find_end(&tile_data->bit_reader); +} + +static void error_handler(void *data) { + AV1_COMMON *const cm = (AV1_COMMON *)data; + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +// Reads the high_bitdepth and twelve_bit fields in color_config() and sets +// cm->bit_depth based on the values of those fields and cm->profile. Reports +// errors by calling rb->error_handler() or aom_internal_error(). +static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + const int high_bitdepth = aom_rb_read_bit(rb); + if (cm->profile == PROFILE_2 && high_bitdepth) { + const int twelve_bit = aom_rb_read_bit(rb); + cm->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; + } else if (cm->profile <= PROFILE_2) { + cm->bit_depth = high_bitdepth ? 
AOM_BITS_10 : AOM_BITS_8; + } else { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Unsupported profile/bit-depth combination"); + } +} + +void av1_read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + aom_film_grain_t *pars = &cm->film_grain_params; + + pars->apply_grain = aom_rb_read_bit(rb); + if (!pars->apply_grain) { + memset(pars, 0, sizeof(*pars)); + return; + } + + pars->random_seed = aom_rb_read_literal(rb, 16); + if (cm->frame_type == INTER_FRAME) + pars->update_parameters = aom_rb_read_bit(rb); else -#endif // CONFIG_EXT_TILE - get_tile_buffers(pbi, data, data_end, tile_buffers, 0, - cm->tile_rows * cm->tile_cols - 1); + pars->update_parameters = 1; + + if (!pars->update_parameters) { + // inherit parameters from a previous reference frame + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); + int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx]; + if (buf_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid Film grain reference idx"); + } + if (!frame_bufs[buf_idx].film_grain_params_present) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Film grain reference parameters not available"); + } + uint16_t random_seed = pars->random_seed; + *pars = frame_bufs[buf_idx].film_grain_params; // inherit paramaters + pars->random_seed = random_seed; // with new random seed + return; + } - for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { - // Sort the buffers in this tile row based on size in descending order. - qsort(&tile_buffers[tile_row][tile_cols_start], - tile_cols_end - tile_cols_start, sizeof(tile_buffers[0][0]), - compare_tile_buffers); - - // Rearrange the tile buffers in this tile row such that per-tile group - // the largest, and presumably the most difficult tile will be decoded in - // the main thread. 
This should help minimize the number of instances - // where the main thread is waiting for a worker to complete. - { - int group_start; - for (group_start = tile_cols_start; group_start < tile_cols_end; - group_start += num_workers) { - const int group_end = AOMMIN(group_start + num_workers, tile_cols); - const TileBufferDec largest = tile_buffers[tile_row][group_start]; - memmove(&tile_buffers[tile_row][group_start], - &tile_buffers[tile_row][group_start + 1], - (group_end - group_start - 1) * sizeof(tile_buffers[0][0])); - tile_buffers[tile_row][group_end - 1] = largest; - } + // Scaling functions parameters + pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 + if (pars->num_y_points > 14) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain luma scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_y_points; i++) { + pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); + if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); + } + + if (!cm->seq_params.monochrome) + pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); + + if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || + ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cb_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cb scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cb_points; i++) { + pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cb[i - 1][0] >= 
pars->scaling_points_cb[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); } - for (tile_col = tile_cols_start; tile_col < tile_cols_end;) { - // Launch workers for individual columns - for (i = 0; i < num_workers && tile_col < tile_cols_end; - ++i, ++tile_col) { - TileBufferDec *const buf = &tile_buffers[tile_row][tile_col]; - AVxWorker *const worker = &pbi->tile_workers[i]; - TileWorkerData *const twd = (TileWorkerData *)worker->data1; - TileInfo *const tile_info = (TileInfo *)worker->data2; - - twd->pbi = pbi; - twd->xd = pbi->mb; - twd->xd.corrupted = 0; - twd->xd.counts = - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD - ? &twd->counts - : NULL; - av1_zero(twd->dqcoeff); - av1_tile_init(tile_info, cm, tile_row, buf->col); - av1_tile_init(&twd->xd.tile, cm, tile_row, buf->col); - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - dec_setup_across_tile_boundary_info(cm, tile_info); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - - setup_bool_decoder(buf->data, data_end, buf->size, &cm->error, - &twd->bit_reader, -#if CONFIG_ANS && ANS_MAX_SYMBOLS - 1 << cm->ans_window_size_log2, -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - pbi->decrypt_cb, pbi->decrypt_state); - av1_init_macroblockd(cm, &twd->xd, -#if CONFIG_PVQ - twd->pvq_ref_coeff, -#endif -#if CONFIG_CFL - &twd->cfl, -#endif - twd->dqcoeff); -#if CONFIG_PVQ - daala_dec_init(cm, &twd->xd.daala_dec, &twd->bit_reader); - twd->xd.daala_dec.state.adapt = &twd->tctx.pvq_context; -#endif - // Initialise the tile context from the frame context - twd->tctx = *cm->fc; - twd->xd.tile_ctx = &twd->tctx; - twd->xd.plane[0].color_index_map = twd->color_index_map[0]; - twd->xd.plane[1].color_index_map = twd->color_index_map[1]; - - worker->had_error = 0; - if (i == num_workers - 1 || tile_col == tile_cols_end - 1) { - winterface->execute(worker); - } else { - 
winterface->launch(worker); - } + pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cr_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cr scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cr_points; i++) { + pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); + } -#if !CONFIG_ANS - if (tile_row == tile_rows - 1 && buf->col == tile_cols - 1) { - final_worker = i; - } -#endif // !CONFIG_ANS - } + if ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || + ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "In YCbCr 4:2:0, film grain shall be applied " + "to both chroma components or neither."); + } + + pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value + + // AR coefficients + // Only sent if the corresponsing scaling function has + // more than 0 points + + pars->ar_coeff_lag = aom_rb_read_literal(rb, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 
128; + + pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value - // Sync all workers - for (; i > 0; --i) { - AVxWorker *const worker = &pbi->tile_workers[i - 1]; - // TODO(jzern): The tile may have specific error data associated with - // its aom_internal_error_info which could be propagated to the main - // info in cm. Additionally once the threads have been synced and an - // error is detected, there's no point in continuing to decode tiles. - pbi->mb.corrupted |= !winterface->sync(worker); - } - } - } + pars->grain_scale_shift = aom_rb_read_literal(rb, 2); - // Accumulate thread frame counts. - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - for (i = 0; i < num_workers; ++i) { - TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1; - av1_accumulate_frame_counts(&cm->counts, &twd->counts); - } + if (pars->num_cb_points) { + pars->cb_mult = aom_rb_read_literal(rb, 8); + pars->cb_luma_mult = aom_rb_read_literal(rb, 8); + pars->cb_offset = aom_rb_read_literal(rb, 9); } -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - // Return the end of the last tile buffer - return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end; - } else { -#endif // CONFIG_EXT_TILE -#if CONFIG_ANS - return data_end; -#else - assert(final_worker != -1); - { - TileWorkerData *const twd = - (TileWorkerData *)pbi->tile_workers[final_worker].data1; - return aom_reader_find_end(&twd->bit_reader); - } -#endif // CONFIG_ANS -#if CONFIG_EXT_TILE + if (pars->num_cr_points) { + pars->cr_mult = aom_rb_read_literal(rb, 8); + pars->cr_luma_mult = aom_rb_read_literal(rb, 8); + pars->cr_offset = aom_rb_read_literal(rb, 9); } -#endif // CONFIG_EXT_TILE -} -static void error_handler(void *data) { - AV1_COMMON *const cm = (AV1_COMMON *)data; - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); + pars->overlap_flag = aom_rb_read_bit(rb); + + pars->clip_to_restricted_range = aom_rb_read_bit(rb); } -static void 
read_bitdepth_colorspace_sampling(AV1_COMMON *cm, - struct aom_read_bit_buffer *rb, - int allow_lowbitdepth) { - if (cm->profile >= PROFILE_2) { - cm->bit_depth = aom_rb_read_bit(rb) ? AOM_BITS_12 : AOM_BITS_10; +static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { + av1_read_film_grain_params(cm, rb); } else { - cm->bit_depth = AOM_BITS_8; + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } + cm->film_grain_params.bit_depth = cm->bit_depth; + memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, + sizeof(aom_film_grain_t)); +} + +void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + int allow_lowbitdepth) { + av1_read_bitdepth(cm, rb); -#if CONFIG_HIGHBITDEPTH cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; -#else - (void)allow_lowbitdepth; -#endif -#if CONFIG_COLORSPACE_HEADERS - cm->color_space = aom_rb_read_literal(rb, 5); - cm->transfer_function = aom_rb_read_literal(rb, 5); -#else - cm->color_space = aom_rb_read_literal(rb, 3); -#endif - if (cm->color_space != AOM_CS_SRGB) { + // monochrome bit (not needed for PROFILE_1) + const int is_monochrome = cm->profile != PROFILE_1 ? 
aom_rb_read_bit(rb) : 0; + cm->seq_params.monochrome = is_monochrome; + int color_description_present_flag = aom_rb_read_bit(rb); + if (color_description_present_flag) { + cm->color_primaries = aom_rb_read_literal(rb, 8); + cm->transfer_characteristics = aom_rb_read_literal(rb, 8); + cm->matrix_coefficients = aom_rb_read_literal(rb, 8); + } else { + cm->color_primaries = AOM_CICP_CP_UNSPECIFIED; + cm->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + cm->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + } + if (is_monochrome) { // [16,235] (including xvycc) vs [0,255] range cm->color_range = aom_rb_read_bit(rb); - if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - cm->subsampling_x = aom_rb_read_bit(rb); - cm->subsampling_y = aom_rb_read_bit(rb); - if (cm->subsampling_x == 1 && cm->subsampling_y == 1) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "4:2:0 color not supported in profile 1 or 3"); - if (aom_rb_read_bit(rb)) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Reserved bit set"); + cm->subsampling_y = cm->subsampling_x = 1; + cm->chroma_sample_position = AOM_CSP_UNKNOWN; + cm->separate_uv_delta_q = 0; + return; + } + if (cm->color_primaries == AOM_CICP_CP_BT_709 && + cm->transfer_characteristics == AOM_CICP_TC_SRGB && + cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { // it would be better + // to remove this + // dependency too + cm->subsampling_y = cm->subsampling_x = 0; + cm->color_range = 1; // assume full color-range + if (!(cm->profile == PROFILE_1 || + (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12))) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "sRGB colorspace not compatible with specified profile"); + } + } else { + // [16,235] (including xvycc) vs [0,255] range + cm->color_range = aom_rb_read_bit(rb); + if (cm->profile == PROFILE_0) { + // 420 only + cm->subsampling_x = cm->subsampling_y = 1; + } else if (cm->profile == PROFILE_1) { + // 444 only + cm->subsampling_x = 
cm->subsampling_y = 0; } else { - cm->subsampling_y = cm->subsampling_x = 1; + assert(cm->profile == PROFILE_2); + if (cm->bit_depth == AOM_BITS_12) { + cm->subsampling_x = aom_rb_read_bit(rb); + if (cm->subsampling_x) + cm->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 + else + cm->subsampling_y = 0; // 444 + } else { + // 422 + cm->subsampling_x = 1; + cm->subsampling_y = 0; + } } -#if CONFIG_COLORSPACE_HEADERS - if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { + if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY && + (cm->subsampling_x || cm->subsampling_y)) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); + } + if (cm->subsampling_x && cm->subsampling_y) { cm->chroma_sample_position = aom_rb_read_literal(rb, 2); } -#endif - } else { - if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. - // 4:2:2 or 4:4:0 chroma sampling is not allowed. 
- cm->subsampling_y = cm->subsampling_x = 0; - if (aom_rb_read_bit(rb)) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Reserved bit set"); - } else { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "4:4:4 color not supported in profile 0 or 2"); + } + cm->separate_uv_delta_q = aom_rb_read_bit(rb); +} + +void av1_read_timing_info_header(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + cm->timing_info.num_units_in_display_tick = aom_rb_read_unsigned_literal( + rb, 32); // Number of units in a display tick + cm->timing_info.time_scale = + aom_rb_read_unsigned_literal(rb, 32); // Time scale + if (cm->timing_info.num_units_in_display_tick == 0 || + cm->timing_info.time_scale == 0) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "num_units_in_display_tick and time_scale must be greater than 0."); + } + cm->timing_info.equal_picture_interval = + aom_rb_read_bit(rb); // Equal picture interval bit + if (cm->timing_info.equal_picture_interval) { + cm->timing_info.num_ticks_per_picture = + aom_rb_read_uvlc(rb) + 1; // ticks per picture + if (cm->timing_info.num_ticks_per_picture == 0) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1."); } } } -#if CONFIG_REFERENCE_BUFFER -void read_sequence_header(SequenceHeader *seq_params, - struct aom_read_bit_buffer *rb) { - /* Placeholder for actually reading from the bitstream */ - seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); - if (seq_params->frame_id_numbers_present_flag) { - seq_params->frame_id_length_minus7 = aom_rb_read_literal(rb, 4); - seq_params->delta_frame_id_length_minus2 = aom_rb_read_literal(rb, 4); +void av1_read_decoder_model_info(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + cm->buffer_model.encoder_decoder_buffer_delay_length = + aom_rb_read_literal(rb, 5) + 1; + cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal( + rb, 32); // Number of units 
in a decoding tick + cm->buffer_model.buffer_removal_delay_length = aom_rb_read_literal(rb, 5) + 1; + cm->buffer_model.frame_presentation_delay_length = + aom_rb_read_literal(rb, 5) + 1; +} + +void av1_read_op_parameters_info(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb, int op_num) { + // The cm->op_params array has MAX_NUM_OPERATING_POINTS + 1 elements. + if (op_num > MAX_NUM_OPERATING_POINTS) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support %d decoder model operating points", + op_num + 1); } + + cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_literal( + rb, cm->buffer_model.encoder_decoder_buffer_delay_length); + + cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_literal( + rb, cm->buffer_model.encoder_decoder_buffer_delay_length); + + cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb); } -#endif // CONFIG_REFERENCE_BUFFER -static void read_compound_tools(AV1_COMMON *cm, - struct aom_read_bit_buffer *rb) { - (void)cm; - (void)rb; -#if CONFIG_INTERINTRA - if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) { - cm->allow_interintra_compound = aom_rb_read_bit(rb); - } else { - cm->allow_interintra_compound = 0; - } -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm)) { -#else // !CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) { -#endif // CONFIG_COMPOUND_SINGLEREF - cm->allow_masked_compound = aom_rb_read_bit(rb); +static void av1_read_tu_pts_info(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb) { + cm->tu_presentation_delay = + aom_rb_read_literal(rb, cm->buffer_model.frame_presentation_delay_length); +} + +void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + // rb->error_handler may be triggered during aom_rb_read_bit(), raising + // internal errors and immediate decoding termination. 
We use a local variable + // to store the info. as we decode. At the end, if no errors have occurred, + // cm->seq_params is updated. + SequenceHeader sh = cm->seq_params; + SequenceHeader *const seq_params = &sh; + int num_bits_width = aom_rb_read_literal(rb, 4) + 1; + int num_bits_height = aom_rb_read_literal(rb, 4) + 1; + int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; + int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + if (seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = 0; } else { - cm->allow_masked_compound = 0; + seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); + } + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. + seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2; + seq_params->frame_id_length = + aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; + if (seq_params->frame_id_length > 16) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame_id_length"); } -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -} -#if CONFIG_VAR_REFS -static void check_valid_ref_frames(AV1_COMMON *cm) { - MV_REFERENCE_FRAME ref_frame; - // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other - // reference frames: Current encoder invalid ALTREF when ALTREF - // is the same as LAST, but invalid all the other references - // when they are the same as ALTREF. 
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME]; - - if (ref_buf->idx != INVALID_IDX) { - ref_buf->is_valid = 1; - - MV_REFERENCE_FRAME ref; - for (ref = LAST_FRAME; ref < ref_frame; ++ref) { - RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME]; - if (buf->is_valid && buf->idx == ref_buf->idx) { - if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) { - ref_buf->is_valid = 0; - break; - } else { - buf->is_valid = 0; - } - } + setup_sb_size(seq_params, rb); + + seq_params->enable_filter_intra = aom_rb_read_bit(rb); + seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb); + + if (seq_params->reduced_still_picture_hdr) { + seq_params->enable_interintra_compound = 0; + seq_params->enable_masked_compound = 0; + seq_params->enable_warped_motion = 0; + seq_params->enable_dual_filter = 0; + seq_params->enable_order_hint = 0; + seq_params->enable_jnt_comp = 0; + seq_params->enable_ref_frame_mvs = 0; + seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + seq_params->order_hint_bits_minus_1 = -1; + } else { + seq_params->enable_interintra_compound = aom_rb_read_bit(rb); + seq_params->enable_masked_compound = aom_rb_read_bit(rb); + seq_params->enable_warped_motion = aom_rb_read_bit(rb); + seq_params->enable_dual_filter = aom_rb_read_bit(rb); + + seq_params->enable_order_hint = aom_rb_read_bit(rb); + seq_params->enable_jnt_comp = + seq_params->enable_order_hint ? aom_rb_read_bit(rb) : 0; + seq_params->enable_ref_frame_mvs = + seq_params->enable_order_hint ? 
aom_rb_read_bit(rb) : 0; + + if (aom_rb_read_bit(rb)) { + seq_params->force_screen_content_tools = + 2; // SELECT_SCREEN_CONTENT_TOOLS + } else { + seq_params->force_screen_content_tools = aom_rb_read_bit(rb); + } + + if (seq_params->force_screen_content_tools > 0) { + if (aom_rb_read_bit(rb)) { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } else { + seq_params->force_integer_mv = aom_rb_read_bit(rb); } } else { - ref_buf->is_valid = 0; + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV } + seq_params->order_hint_bits_minus_1 = + seq_params->enable_order_hint ? aom_rb_read_literal(rb, 3) : -1; } + + seq_params->enable_superres = aom_rb_read_bit(rb); + seq_params->enable_cdef = aom_rb_read_bit(rb); + seq_params->enable_restoration = aom_rb_read_bit(rb); + cm->seq_params = *seq_params; } -#endif // CONFIG_VAR_REFS -#if CONFIG_GLOBAL_MOTION static int read_global_motion_params(WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_read_bit_buffer *rb, int allow_hp) { TransformationType type = aom_rb_read_bit(rb); if (type != IDENTITY) { -#if GLOBAL_TRANS_TYPES > 4 - type += aom_rb_read_literal(rb, GLOBAL_TYPE_BITS); -#else if (aom_rb_read_bit(rb)) type = ROTZOOM; else type = aom_rb_read_bit(rb) ? 
TRANSLATION : AFFINE; -#endif // GLOBAL_TRANS_TYPES > 4 } - int trans_bits; - int trans_dec_factor; - int trans_prec_diff; *params = default_warp_params; params->wmtype = type; - switch (type) { - case HOMOGRAPHY: - case HORTRAPEZOID: - case VERTRAPEZOID: - if (type != HORTRAPEZOID) - params->wmmat[6] = - aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)) * - GM_ROW3HOMO_DECODE_FACTOR; - if (type != VERTRAPEZOID) - params->wmmat[7] = - aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)) * - GM_ROW3HOMO_DECODE_FACTOR; - case AFFINE: - case ROTZOOM: - params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS)) * - GM_ALPHA_DECODE_FACTOR + - (1 << WARPEDMODEL_PREC_BITS); - if (type != VERTRAPEZOID) - params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * - GM_ALPHA_DECODE_FACTOR; - if (type >= AFFINE) { - if (type != HORTRAPEZOID) - params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * - GM_ALPHA_DECODE_FACTOR; - params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( - rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS)) * - GM_ALPHA_DECODE_FACTOR + - (1 << WARPEDMODEL_PREC_BITS); - } else { - params->wmmat[4] = -params->wmmat[3]; - params->wmmat[5] = params->wmmat[2]; - } - // fallthrough intended - case TRANSLATION: - trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp - : GM_ABS_TRANS_BITS; - trans_dec_factor = (type == TRANSLATION) - ? 
GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) - : GM_TRANS_DECODE_FACTOR; - trans_prec_diff = (type == TRANSLATION) - ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp - : GM_TRANS_PREC_DIFF; - params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( - rb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[0] >> trans_prec_diff)) * - trans_dec_factor; - params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( - rb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[1] >> trans_prec_diff)) * - trans_dec_factor; - case IDENTITY: break; - default: assert(0); + + if (type >= ROTZOOM) { + params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + } + + if (type >= AFFINE) { + params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + } else { + params->wmmat[4] = -params->wmmat[3]; + params->wmmat[5] = params->wmmat[2]; + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_dec_factor = + (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) + : GM_TRANS_DECODE_FACTOR; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff)) * + trans_dec_factor; + params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff)) * + trans_dec_factor; } + if (params->wmtype <= AFFINE) { int good_shear_params = get_shear_params(params); if (!good_shear_params) return 0; @@ -4596,16 +3528,18 @@ static int read_global_motion_params(WarpedMotionParams *params, } static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - int frame; - for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const WarpedMotionParams *ref_params = - cm->error_resilient_mode ? &default_warp_params - : &cm->prev_frame->global_motion[frame]; + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; int good_params = read_global_motion_params( &cm->global_motion[frame], ref_params, rb, cm->allow_high_precision_mv); - if (!good_params) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Invalid shear parameters for global motion."); + if (!good_params) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected global motion shear params from aomenc\n"); +#endif + cm->global_motion[frame].invalid = 1; + } // TODO(sarahparker, debargha): The logic in the commented out code below // does not work currently and causes mismatches when resize is on. 
Fix it @@ -4631,252 +3565,397 @@ static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { */ } memcpy(cm->cur_frame->global_motion, cm->global_motion, - TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams)); + REF_FRAMES * sizeof(WarpedMotionParams)); } -#endif // CONFIG_GLOBAL_MOTION -static size_t read_uncompressed_header(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb) { +static void show_existing_frame_reset(AV1Decoder *const pbi, + int existing_frame_idx) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + + assert(cm->show_existing_frame); + + cm->frame_type = KEY_FRAME; + + pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->frame_refs[i].idx = INVALID_IDX; + cm->frame_refs[i].buf = NULL; + } + + if (pbi->need_resync) { + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + pbi->need_resync = 0; + } + + cm->cur_frame->intra_only = 1; + + if (cm->seq_params.frame_id_numbers_present_flag) { + /* If bitmask is set, update reference frame id values and + mark frames as valid for reference. + Note that the displayed frame be valid for referencing + in order to have been selected. + */ + int refresh_frame_flags = pbi->refresh_frame_flags; + int display_frame_id = cm->ref_frame_id[existing_frame_idx]; + for (int i = 0; i < REF_FRAMES; i++) { + if ((refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = display_frame_id; + cm->valid_for_referencing[i] = 1; + } + } + } + + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + // Generate next_ref_frame_map. 
+ lock_buffer_pool(pool); + int ref_index = 0; + for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; + ++frame_bufs[cm->new_fb_idx].ref_count; + } else { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + } + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + ++ref_index; + } + + for (; ref_index < REF_FRAMES; ++ref_index) { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + } + unlock_buffer_pool(pool); + pbi->hold_ref_buf = 1; + + // Reload the adapted CDFs from when we originally coded this keyframe + *cm->fc = cm->frame_contexts[existing_frame_idx]; +} + +static int read_uncompressed_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = pool->frame_bufs; - int i, mask, ref_index = 0; - size_t sz; + + if (!pbi->sequence_header_ready) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No sequence header"); + } cm->last_frame_type = cm->frame_type; cm->last_intra_only = cm->intra_only; -#if CONFIG_EXT_REFS // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; -#endif // CONFIG_EXT_REFS -#if !CONFIG_OBU - if (aom_rb_read_literal(rb, 2) != AOM_FRAME_MARKER) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Invalid frame marker"); + if (cm->seq_params.reduced_still_picture_hdr) { + cm->show_existing_frame = 0; + cm->show_frame = 1; + cm->frame_type = KEY_FRAME; + cm->error_resilient_mode = 1; + } else { + cm->show_existing_frame = aom_rb_read_bit(rb); + cm->reset_decoder_state = 0; + + if 
(cm->show_existing_frame) { + // Show an existing frame directly. + const int existing_frame_idx = aom_rb_read_literal(rb, 3); + const int frame_to_show = cm->ref_frame_map[existing_frame_idx]; + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) { + av1_read_tu_pts_info(cm, rb); + } + if (cm->seq_params.frame_id_numbers_present_flag) { + int frame_id_length = cm->seq_params.frame_id_length; + int display_frame_id = aom_rb_read_literal(rb, frame_id_length); + /* Compare display_frame_id with ref_frame_id and check valid for + * referencing */ + if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || + cm->valid_for_referencing[existing_frame_idx] == 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + lock_buffer_pool(pool); + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + } + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + cm->reset_decoder_state = + frame_bufs[frame_to_show].frame_type == KEY_FRAME; + unlock_buffer_pool(pool); - cm->profile = av1_read_profile(rb); + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->show_frame = 1; - const BITSTREAM_PROFILE MAX_SUPPORTED_PROFILE = - CONFIG_HIGHBITDEPTH ? 
MAX_PROFILES : PROFILE_2; + if (!frame_bufs[frame_to_show].showable_frame) { + aom_merge_corrupted_flag(&xd->corrupted, 1); + } + if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0; - if (cm->profile >= MAX_SUPPORTED_PROFILE) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Unsupported bitstream profile"); -#endif + cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params; -#if CONFIG_EXT_TILE - cm->large_scale_tile = aom_rb_read_literal(rb, 1); -#if CONFIG_REFERENCE_BUFFER - if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0; -#endif // CONFIG_REFERENCE_BUFFER -#endif // CONFIG_EXT_TILE + if (cm->reset_decoder_state) { + show_existing_frame_reset(pbi, existing_frame_idx); + } else { + pbi->refresh_frame_flags = 0; + } - cm->show_existing_frame = aom_rb_read_bit(rb); + return 0; + } - if (cm->show_existing_frame) { - // Show an existing frame directly. - const int existing_frame_idx = aom_rb_read_literal(rb, 3); - const int frame_to_show = cm->ref_frame_map[existing_frame_idx]; -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7; - int display_frame_id = aom_rb_read_literal(rb, frame_id_length); - /* Compare display_frame_id with ref_frame_id and check valid for - * referencing */ - if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || - cm->valid_for_referencing[existing_frame_idx] == 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Reference buffer frame ID mismatch"); + cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits + cm->show_frame = aom_rb_read_bit(rb); + if (cm->seq_params.still_picture && + (cm->frame_type != KEY_FRAME || !cm->show_frame)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Still pictures must be coded as shown keyframes"); } -#endif - lock_buffer_pool(pool); - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count 
< 1) { - unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a decoded frame", - frame_to_show); + cm->showable_frame = cm->frame_type != KEY_FRAME; + if (cm->show_frame) { + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) + av1_read_tu_pts_info(cm, rb); + } else { + // See if this frame can be used as show_existing_frame in future + cm->showable_frame = aom_rb_read_bit(rb); } - ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); + cm->cur_frame->showable_frame = cm->showable_frame; + cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME; + cm->error_resilient_mode = + frame_is_sframe(cm) || (cm->frame_type == KEY_FRAME && cm->show_frame) + ? 1 + : aom_rb_read_bit(rb); + } -#if CONFIG_LOOPFILTER_LEVEL - cm->lf.filter_level[0] = 0; - cm->lf.filter_level[1] = 0; -#else - cm->lf.filter_level = 0; -#endif - cm->show_frame = 1; - pbi->refresh_frame_flags = 0; + cm->disable_cdf_update = aom_rb_read_bit(rb); + if (cm->seq_params.force_screen_content_tools == 2) { + cm->allow_screen_content_tools = aom_rb_read_bit(rb); + } else { + cm->allow_screen_content_tools = cm->seq_params.force_screen_content_tools; + } - if (cm->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; + if (cm->allow_screen_content_tools) { + if (cm->seq_params.force_integer_mv == 2) { + cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb); + } else { + cm->cur_frame_force_integer_mv = cm->seq_params.force_integer_mv; } - - return 0; + } else { + cm->cur_frame_force_integer_mv = 0; } -#if !CONFIG_OBU - cm->frame_type = (FRAME_TYPE)aom_rb_read_bit(rb); - cm->show_frame = aom_rb_read_bit(rb); - if (cm->frame_type != KEY_FRAME) - cm->intra_only = cm->show_frame ? 
0 : aom_rb_read_bit(rb); -#else - cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits - cm->show_frame = aom_rb_read_bit(rb); - cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME; -#endif - cm->error_resilient_mode = aom_rb_read_bit(rb); -#if CONFIG_REFERENCE_BUFFER -#if !CONFIG_OBU - if (frame_is_intra_only(cm)) read_sequence_header(&cm->seq_params, rb); -#endif // !CONFIG_OBU - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int prev_frame_id = 0; - if (cm->frame_type != KEY_FRAME) { - prev_frame_id = cm->current_frame_id; - } - cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); + cm->frame_refs_short_signaling = 0; + int frame_size_override_flag = 0; + cm->allow_intrabc = 0; + cm->primary_ref_frame = PRIMARY_REF_NONE; - if (cm->frame_type != KEY_FRAME) { - int diff_frame_id; - if (cm->current_frame_id > prev_frame_id) { - diff_frame_id = cm->current_frame_id - prev_frame_id; - } else { - diff_frame_id = - (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; + if (!cm->seq_params.reduced_still_picture_hdr) { + if (cm->seq_params.frame_id_numbers_present_flag) { + int frame_id_length = cm->seq_params.frame_id_length; + int diff_len = cm->seq_params.delta_frame_id_length; + int prev_frame_id = 0; + int have_prev_frame_id = !pbi->decoding_first_frame && + !(cm->frame_type == KEY_FRAME && cm->show_frame); + if (have_prev_frame_id) { + prev_frame_id = cm->current_frame_id; } - /* Check current_frame_id for conformance */ - if (prev_frame_id == cm->current_frame_id || - diff_frame_id >= (1 << (frame_id_length - 1))) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Invalid value of current_frame_id"); + cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); + + if (have_prev_frame_id) { + int diff_frame_id; + if (cm->current_frame_id > prev_frame_id) { + diff_frame_id = 
cm->current_frame_id - prev_frame_id; + } else { + diff_frame_id = + (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; + } + /* Check current_frame_id for conformance */ + if (prev_frame_id == cm->current_frame_id || + diff_frame_id >= (1 << (frame_id_length - 1))) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of current_frame_id"); + } } - } - /* Check if some frames need to be marked as not valid for referencing */ - for (i = 0; i < REF_FRAMES; i++) { - if (cm->frame_type == KEY_FRAME) { - cm->valid_for_referencing[i] = 0; - } else if (cm->current_frame_id - (1 << diff_len) > 0) { - if (cm->ref_frame_id[i] > cm->current_frame_id || - cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) - cm->valid_for_referencing[i] = 0; - } else { - if (cm->ref_frame_id[i] > cm->current_frame_id && - cm->ref_frame_id[i] < - (1 << frame_id_length) + cm->current_frame_id - (1 << diff_len)) + /* Check if some frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME && cm->show_frame) { cm->valid_for_referencing[i] = 0; + } else if (cm->current_frame_id - (1 << diff_len) > 0) { + if (cm->ref_frame_id[i] > cm->current_frame_id || + cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) + cm->valid_for_referencing[i] = 0; + } else { + if (cm->ref_frame_id[i] > cm->current_frame_id && + cm->ref_frame_id[i] < (1 << frame_id_length) + + cm->current_frame_id - (1 << diff_len)) + cm->valid_for_referencing[i] = 0; + } + } + } + + frame_size_override_flag = + frame_is_sframe(cm) ? 
1 : aom_rb_read_literal(rb, 1); + + cm->frame_offset = + aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1); + cm->current_video_frame = cm->frame_offset; + + if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) { + cm->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS); + } + } + + if (cm->seq_params.decoder_model_info_present_flag) { + cm->buffer_removal_delay_present = aom_rb_read_bit(rb); + if (cm->buffer_removal_delay_present) { + for (int op_num = 0; + op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) { + if (cm->op_params[op_num].decoder_model_param_present_flag) { + if ((((cm->seq_params.operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1) && + ((cm->seq_params.operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1)) || + cm->seq_params.operating_point_idc[op_num] == 0) { + cm->op_frame_timing[op_num].buffer_removal_delay = + aom_rb_read_literal( + rb, cm->buffer_model.buffer_removal_delay_length); + } else { + cm->op_frame_timing[op_num].buffer_removal_delay = 0; + } + } else { + cm->op_frame_timing[op_num].buffer_removal_delay = 0; + } } } } -#endif // CONFIG_REFERENCE_BUFFER if (cm->frame_type == KEY_FRAME) { -#if !CONFIG_OBU - read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth); -#endif - pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + if (!cm->show_frame) // unshown keyframe (forward keyframe) + pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + else // shown keyframe + pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { cm->frame_refs[i].idx = INVALID_IDX; cm->frame_refs[i].buf = NULL; -#if CONFIG_VAR_REFS - cm->frame_refs[i].is_valid = 0; -#endif // CONFIG_VAR_REFS } - - setup_frame_size(cm, rb); - setup_sb_size(cm, rb); - if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); pbi->need_resync = 0; } -#if CONFIG_ANS && 
ANS_MAX_SYMBOLS - cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - cm->allow_screen_content_tools = aom_rb_read_bit(rb); -#if CONFIG_AMVR - if (cm->allow_screen_content_tools) { - if (aom_rb_read_bit(rb)) { - cm->seq_mv_precision_level = 2; - } else { - cm->seq_mv_precision_level = aom_rb_read_bit(rb) ? 0 : 1; - } - } else { - cm->seq_mv_precision_level = 0; - } -#endif -#if CONFIG_TEMPMV_SIGNALING - cm->use_prev_frame_mvs = 0; -#endif } else { - if (cm->intra_only) cm->allow_screen_content_tools = aom_rb_read_bit(rb); -#if CONFIG_TEMPMV_SIGNALING - if (cm->intra_only || cm->error_resilient_mode) cm->use_prev_frame_mvs = 0; -#endif -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING -// The only way to reset all frame contexts to their default values is with a -// keyframe. -#else - if (cm->error_resilient_mode) { - cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL; - } else { - if (cm->intra_only) { - cm->reset_frame_context = aom_rb_read_bit(rb) - ? RESET_FRAME_CONTEXT_ALL - : RESET_FRAME_CONTEXT_CURRENT; - } else { - cm->reset_frame_context = aom_rb_read_bit(rb) - ? RESET_FRAME_CONTEXT_CURRENT - : RESET_FRAME_CONTEXT_NONE; - if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) - cm->reset_frame_context = aom_rb_read_bit(rb) - ? 
RESET_FRAME_CONTEXT_ALL - : RESET_FRAME_CONTEXT_CURRENT; - } - } -#endif - if (cm->intra_only) { -#if !CONFIG_OBU - read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth); -#endif - pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); - setup_frame_size(cm, rb); - setup_sb_size(cm, rb); + if (pbi->refresh_frame_flags == 0xFF) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Intra only frames cannot have refresh flags 0xFF"); + } if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); pbi->need_resync = 0; } -#if CONFIG_ANS && ANS_MAX_SYMBOLS - cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8; -#endif } else if (pbi->need_resync != 1) { /* Skip if need resync */ -#if CONFIG_OBU - pbi->refresh_frame_flags = (cm->frame_type == S_FRAME) - ? ~(1 << REF_FRAMES) - : aom_rb_read_literal(rb, REF_FRAMES); -#else - pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); -#endif - -#if CONFIG_EXT_REFS + pbi->refresh_frame_flags = + frame_is_sframe(cm) ? 
0xFF : aom_rb_read_literal(rb, REF_FRAMES); if (!pbi->refresh_frame_flags) { // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame // will not be used as a reference cm->is_reference_frame = 0; } -#endif // CONFIG_EXT_REFS + } + } + + if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) { + // Read all ref frame order hints if error_resilient_mode == 1 + if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Read order hint from bit stream + unsigned int frame_offset = + aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1); + // Get buffer index + int buf_idx = cm->ref_frame_map[ref_idx]; + assert(buf_idx < FRAME_BUFFERS); + if (buf_idx == -1 || + frame_offset != frame_bufs[buf_idx].cur_frame_offset) { + if (buf_idx >= 0) { + lock_buffer_pool(pool); + decrease_ref_count(buf_idx, frame_bufs, pool); + unlock_buffer_pool(pool); + } + // If no corresponding buffer exists, allocate a new buffer with all + // pixels set to neutral grey. 
+ buf_idx = get_free_fb(cm); + if (buf_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &frame_bufs[buf_idx].buf, cm->seq_params.max_frame_width, + cm->seq_params.max_frame_height, cm->subsampling_x, + cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, + &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb, + pool->cb_priv)) { + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + set_planes_to_neutral_grey(cm, &frame_bufs[buf_idx].buf, 0); + + cm->ref_frame_map[ref_idx] = buf_idx; + frame_bufs[buf_idx].cur_frame_offset = frame_offset; + } + } + } + } + + if (cm->frame_type == KEY_FRAME) { + setup_frame_size(cm, frame_size_override_flag, rb); + + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + cm->allow_intrabc = aom_rb_read_bit(rb); + cm->allow_ref_frame_mvs = 0; + cm->prev_frame = NULL; + } else { + cm->allow_ref_frame_mvs = 0; + + if (cm->intra_only) { + cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + setup_frame_size(cm, frame_size_override_flag, rb); + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + cm->allow_intrabc = aom_rb_read_bit(rb); + + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + + // Frame refs short signaling is off when error resilient mode is on. 
+ if (cm->seq_params.enable_order_hint) + cm->frame_refs_short_signaling = aom_rb_read_bit(rb); - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); - const int idx = cm->ref_frame_map[ref]; + if (cm->frame_refs_short_signaling) { + // == LAST_FRAME == + const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const int lst_idx = cm->ref_frame_map[lst_ref]; + + // == GOLDEN_FRAME == + const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const int gld_idx = cm->ref_frame_map[gld_ref]; // Most of the time, streams start with a keyframe. In that case, // ref_frame_map will have been filled in at that point and will not @@ -4884,146 +3963,136 @@ static size_t read_uncompressed_header(AV1Decoder *pbi, // with an intra-only frame, so long as they don't then signal a // reference to a slot that hasn't been set yet. That's what we are // checking here. - if (idx == -1) + if (lst_idx == -1) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + if (gld_idx == -1) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); - RefBuffer *const ref_frame = &cm->frame_refs[i]; - ref_frame->idx = idx; - ref_frame->buf = &frame_bufs[idx].buf; -#if CONFIG_FRAME_SIGN_BIAS -#if CONFIG_OBU - // NOTE: For the scenario of (cm->frame_type != S_FRAME), - // ref_frame_sign_bias will be reset based on frame offsets. + av1_set_frame_refs(cm, lst_ref, gld_ref); + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + int ref = 0; + if (!cm->frame_refs_short_signaling) { + ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const int idx = cm->ref_frame_map[ref]; + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any -1's. 
However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. + if (idx == -1) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + + RefBuffer *const ref_frame = &cm->frame_refs[i]; + ref_frame->idx = idx; + ref_frame->buf = &frame_bufs[idx].buf; + ref_frame->map_idx = ref; + } else { + ref = cm->frame_refs[i].map_idx; + } + cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; -#endif // CONFIG_OBU -#else // !CONFIG_FRAME_SIGN_BIAS -#if CONFIG_OBU - cm->ref_frame_sign_bias[LAST_FRAME + i] = - (cm->frame_type == S_FRAME) ? 0 : aom_rb_read_bit(rb); -#else // !CONFIG_OBU - cm->ref_frame_sign_bias[LAST_FRAME + i] = aom_rb_read_bit(rb); -#endif // CONFIG_OBU -#endif // CONFIG_FRAME_SIGN_BIAS -#if CONFIG_REFERENCE_BUFFER + if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = aom_rb_read_literal(rb, diff_len); + int frame_id_length = cm->seq_params.frame_id_length; + int diff_len = cm->seq_params.delta_frame_id_length; + int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); int ref_frame_id = - ((cm->current_frame_id - (delta_frame_id_minus1 + 1) + + ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + (1 << frame_id_length)) % (1 << frame_id_length)); - /* Compare values derived from delta_frame_id_minus1 and - * refresh_frame_flags. Also, check valid for referencing */ + // Compare values derived from delta_frame_id_minus_1 and + // refresh_frame_flags. 
Also, check valid for referencing if (ref_frame_id != cm->ref_frame_id[ref] || cm->valid_for_referencing[ref] == 0) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } -#endif // CONFIG_REFERENCE_BUFFER } -#if CONFIG_VAR_REFS - check_valid_ref_frames(cm); -#endif // CONFIG_VAR_REFS - -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { + if (!cm->error_resilient_mode && frame_size_override_flag) { setup_frame_size_with_refs(cm, rb); } else { - setup_frame_size(cm, rb); + setup_frame_size(cm, frame_size_override_flag, rb); } -#else - setup_frame_size_with_refs(cm, rb); -#endif -#if CONFIG_AMVR - if (cm->seq_mv_precision_level == 2) { - cm->cur_frame_mv_precision_level = aom_rb_read_bit(rb) ? 0 : 1; + if (cm->cur_frame_force_integer_mv) { + cm->allow_high_precision_mv = 0; } else { - cm->cur_frame_mv_precision_level = cm->seq_mv_precision_level; + cm->allow_high_precision_mv = aom_rb_read_bit(rb); } -#endif - cm->allow_high_precision_mv = aom_rb_read_bit(rb); cm->interp_filter = read_frame_interp_filter(rb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) - cm->use_prev_frame_mvs = aom_rb_read_bit(rb); + cm->switchable_motion_mode = aom_rb_read_bit(rb); + } + + cm->prev_frame = get_prev_frame(cm); + if (cm->primary_ref_frame != PRIMARY_REF_NONE && + cm->frame_refs[cm->primary_ref_frame].idx < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame containing this frame's initial " + "frame context is unavailable."); + } + + if (!cm->intra_only && pbi->need_resync != 1) { + if (frame_might_allow_ref_frame_mvs(cm)) + cm->allow_ref_frame_mvs = aom_rb_read_bit(rb); else - cm->use_prev_frame_mvs = 0; -#endif - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->allow_ref_frame_mvs = 0; + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefBuffer *const ref_buf = &cm->frame_refs[i]; -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &ref_buf->sf, 
ref_buf->buf->y_crop_width, - ref_buf->buf->y_crop_height, cm->width, cm->height, - cm->use_highbitdepth); -#else av1_setup_scale_factors_for_frame( &ref_buf->sf, ref_buf->buf->y_crop_width, ref_buf->buf->y_crop_height, cm->width, cm->height); -#endif + if ((!av1_is_valid_scale(&ref_buf->sf))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); } } } -#if CONFIG_FRAME_MARKER - if (cm->show_frame == 0) { - cm->frame_offset = cm->current_video_frame + aom_rb_read_literal(rb, 4); - } else { - cm->frame_offset = cm->current_video_frame; - } av1_setup_frame_buf_refs(cm); -#if CONFIG_FRAME_SIGN_BIAS -#if CONFIG_OBU - if (cm->frame_type != S_FRAME) -#endif // CONFIG_OBU - av1_setup_frame_sign_bias(cm); -#define FRAME_SIGN_BIAS_DEBUG 0 -#if FRAME_SIGN_BIAS_DEBUG - { - printf("\n\nDECODER: Frame=%d, show_frame=%d:", cm->current_video_frame, - cm->show_frame); - MV_REFERENCE_FRAME ref_frame; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - printf(" sign_bias[%d]=%d", ref_frame, - cm->ref_frame_sign_bias[ref_frame]); - } - printf("\n"); - } -#endif // FRAME_SIGN_BIAS_DEBUG -#undef FRAME_SIGN_BIAS_DEBUG -#endif // CONFIG_FRAME_SIGN_BIAS -#endif // CONFIG_FRAME_MARKER + av1_setup_frame_sign_bias(cm); -#if CONFIG_TEMPMV_SIGNALING cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only; -#endif + cm->cur_frame->frame_type = cm->frame_type; -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { /* If bitmask is set, update reference frame id values and mark frames as valid for reference */ - int refresh_frame_flags = - cm->frame_type == KEY_FRAME ? 
0xFF : pbi->refresh_frame_flags; - for (i = 0; i < REF_FRAMES; i++) { + int refresh_frame_flags = pbi->refresh_frame_flags; + for (int i = 0; i < REF_FRAMES; i++) { if ((refresh_frame_flags >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; cm->valid_for_referencing[i] = 1; } } } -#endif // CONFIG_REFERENCE_BUFFER + + const int might_bwd_adapt = + !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update); + if (might_bwd_adapt) { + cm->refresh_frame_context = aom_rb_read_bit(rb) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + } else { + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + } get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; - get_frame_new_buffer(cm)->color_space = cm->color_space; -#if CONFIG_COLORSPACE_HEADERS - get_frame_new_buffer(cm)->transfer_function = cm->transfer_function; + get_frame_new_buffer(cm)->color_primaries = cm->color_primaries; + get_frame_new_buffer(cm)->transfer_characteristics = + cm->transfer_characteristics; + get_frame_new_buffer(cm)->matrix_coefficients = cm->matrix_coefficients; + get_frame_new_buffer(cm)->monochrome = cm->seq_params.monochrome; get_frame_new_buffer(cm)->chroma_sample_position = cm->chroma_sample_position; -#endif get_frame_new_buffer(cm)->color_range = cm->color_range; get_frame_new_buffer(cm)->render_width = cm->render_width; get_frame_new_buffer(cm)->render_height = cm->render_height; @@ -5034,22 +4103,10 @@ static size_t read_uncompressed_header(AV1Decoder *pbi, " state"); } - if (!cm->error_resilient_mode) { - cm->refresh_frame_context = aom_rb_read_bit(rb) - ? REFRESH_FRAME_CONTEXT_FORWARD - : REFRESH_FRAME_CONTEXT_BACKWARD; - } else { - cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD; - } -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - // This flag will be overridden by the call to av1_setup_past_independence - // below, forcing the use of context 0 for those frame types. 
- cm->frame_context_idx = aom_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); -#endif - // Generate next_ref_frame_map. lock_buffer_pool(pool); - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + int ref_index = 0; + for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; ++frame_bufs[cm->new_fb_idx].ref_count; @@ -5072,461 +4129,185 @@ static size_t read_uncompressed_header(AV1Decoder *pbi, unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; - if (frame_is_intra_only(cm) || cm->error_resilient_mode) - av1_setup_past_independence(cm); + if (cm->allow_intrabc) { + // Set parameters corresponding to no filtering. + struct loopfilter *lf = &cm->lf; + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } - setup_loopfilter(cm, rb); + read_tile_info(pbi, rb); setup_quantization(cm, rb); xd->bd = (int)cm->bit_depth; -#if CONFIG_Q_ADAPT_PROBS - av1_default_coef_probs(cm); - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) { - for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; - } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (cm->frame_refs[0].idx <= 0) { - cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc; - } -#else - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || + cm->num_allocated_above_context_mi_col < cm->mi_cols || + cm->num_allocated_above_contexts < cm->tile_rows) { + av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + if 
(av1_alloc_above_context_buffers(cm, cm->tile_rows)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); } -#endif // CONFIG_Q_ADAPT_PROBS - setup_segmentation(cm, rb); + if (cm->primary_ref_frame == PRIMARY_REF_NONE) { + av1_setup_past_independence(cm); + } - { - struct segmentation *const seg = &cm->seg; - int segment_quantizer_active = 0; - for (i = 0; i < MAX_SEGMENTS; i++) { - if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { - segment_quantizer_active = 1; - } - } + setup_segmentation(cm, rb); - cm->delta_q_res = 1; -#if CONFIG_EXT_DELTA_Q - cm->delta_lf_res = 1; - cm->delta_lf_present_flag = 0; -#if CONFIG_LOOPFILTER_LEVEL - cm->delta_lf_multi = 0; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif - if (segment_quantizer_active == 0 && cm->base_qindex > 0) { - cm->delta_q_present_flag = aom_rb_read_bit(rb); - } else { - cm->delta_q_present_flag = 0; - } - if (cm->delta_q_present_flag) { - xd->prev_qindex = cm->base_qindex; - cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2); -#if CONFIG_EXT_DELTA_Q - assert(!segment_quantizer_active); - cm->delta_lf_present_flag = aom_rb_read_bit(rb); - if (cm->delta_lf_present_flag) { - xd->prev_delta_lf_from_base = 0; - cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2); -#if CONFIG_LOOPFILTER_LEVEL - cm->delta_lf_multi = aom_rb_read_bit(rb); - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - } -#endif // CONFIG_EXT_DELTA_Q + cm->delta_q_res = 1; + cm->delta_lf_res = 1; + cm->delta_lf_present_flag = 0; + cm->delta_lf_multi = 0; + cm->delta_q_present_flag = cm->base_qindex > 0 ? 
aom_rb_read_bit(rb) : 0; + if (cm->delta_q_present_flag) { + xd->current_qindex = cm->base_qindex; + cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2); + if (!cm->allow_intrabc) cm->delta_lf_present_flag = aom_rb_read_bit(rb); + if (cm->delta_lf_present_flag) { + cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2); + cm->delta_lf_multi = aom_rb_read_bit(rb); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } -#if CONFIG_AMVR - xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level; -#endif - for (i = 0; i < MAX_SEGMENTS; ++i) { + xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv; + + for (int i = 0; i < MAX_SEGMENTS; ++i) { const int qindex = cm->seg.enabled ? av1_get_qindex(&cm->seg, i, cm->base_qindex) : cm->base_qindex; xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; + cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 && + cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0; xd->qindex[i] = qindex; } - cm->all_lossless = all_lossless(cm, xd); + cm->coded_lossless = is_coded_lossless(cm, xd); + cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm); setup_segmentation_dequant(cm); -#if CONFIG_CDEF - if (!cm->all_lossless) { - setup_cdef(cm, rb); + if (cm->coded_lossless) { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; } -#endif -#if CONFIG_LOOP_RESTORATION - decode_restoration_mode(cm, rb); -#endif // CONFIG_LOOP_RESTORATION - cm->tx_mode = read_tx_mode(cm, rb); - cm->reference_mode = read_frame_reference_mode(cm, rb); - if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm); - read_compound_tools(cm, rb); - -#if CONFIG_EXT_TX - cm->reduced_tx_set_used = aom_rb_read_bit(rb); -#endif // CONFIG_EXT_TX - -#if CONFIG_ADAPT_SCAN - cm->use_adapt_scan = aom_rb_read_bit(rb); - // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan - // switches from 1 to 0 - if (cm->use_adapt_scan == 0) av1_init_scan_order(cm); -#endif // 
CONFIG_ADAPT_SCAN - -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - // NOTE(zoeliu): As cm->prev_frame can take neither a frame of - // show_exisiting_frame=1, nor can it take a frame not used as - // a reference, it is probable that by the time it is being - // referred to, the frame buffer it originally points to may - // already get expired and have been reassigned to the current - // newly coded frame. Hence, we need to check whether this is - // the case, and if yes, we have 2 choices: - // (1) Simply disable the use of previous frame mvs; or - // (2) Have cm->prev_frame point to one reference frame buffer, - // e.g. LAST_FRAME. - if (!dec_is_ref_frame_buf(pbi, cm->prev_frame)) { - // Reassign the LAST_FRAME buffer to cm->prev_frame. - cm->prev_frame = - cm->frame_refs[LAST_FRAME - LAST_FRAME].idx != INVALID_IDX - ? &cm->buffer_pool - ->frame_bufs[cm->frame_refs[LAST_FRAME - LAST_FRAME].idx] - : NULL; - } -#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - -#if CONFIG_TEMPMV_SIGNALING - if (cm->use_prev_frame_mvs && !frame_can_use_prev_frame_mvs(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Frame wrongly requests previous frame MVs"); + if (cm->coded_lossless || !cm->seq_params.enable_cdef) { + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->cdef_uv_strengths[0] = 0; } -#else - cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame && -#if CONFIG_FRAME_SUPERRES - cm->width == cm->last_width && - cm->height == cm->last_height && -#else - cm->width == cm->prev_frame->buf.y_crop_width && - cm->height == cm->prev_frame->buf.y_crop_height && -#endif // CONFIG_FRAME_SUPERRES - !cm->last_intra_only && cm->last_show_frame && - (cm->last_frame_type != KEY_FRAME); -#endif // CONFIG_TEMPMV_SIGNALING - -#if CONFIG_GLOBAL_MOTION - if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); -#endif - - read_tile_info(pbi, rb); - if (use_compressed_header(cm)) { - sz = aom_rb_read_literal(rb, 16); - if (sz == 0) - 
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Invalid header size"); - } else { - sz = 0; + if (cm->all_lossless || !cm->seq_params.enable_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; } - return sz; -} + setup_loopfilter(cm, rb); -#if CONFIG_SUPERTX -static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) { - int i, j; - if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) { - for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - av1_diff_update_prob(r, &fc->supertx_prob[i][j], ACCT_STR); - } - } + if (!cm->coded_lossless && cm->seq_params.enable_cdef) { + setup_cdef(cm, rb); + } + if (!cm->all_lossless && cm->seq_params.enable_restoration) { + decode_restoration_mode(cm, rb); } -} -#endif // CONFIG_SUPERTX - -static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data, - size_t partition_size) { -#if CONFIG_RESTRICT_COMPRESSED_HDR - (void)pbi; - (void)data; - (void)partition_size; - return 0; -#else - AV1_COMMON *const cm = &pbi->common; -#if CONFIG_SUPERTX - MACROBLOCKD *const xd = &pbi->mb; -#endif - aom_reader r; -#if !CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *const fc = cm->fc; - int i; -#endif - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - r.window_size = 1 << cm->ans_window_size_log2; -#endif - if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb, - pbi->decrypt_state)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate bool decoder 0"); -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (cm->tx_mode == TX_MODE_SELECT) - av1_diff_update_prob(&r, &fc->quarter_tx_size_prob, ACCT_STR); -#endif + cm->tx_mode = read_tx_mode(cm, rb); + cm->reference_mode = read_frame_reference_mode(cm, rb); + if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm); -#if CONFIG_LV_MAP && !LV_MAP_PROB - 
av1_read_txb_probs(fc, cm->tx_mode, &r, &cm->counts); -#endif // CONFIG_LV_MAP && !LV_MAP_PROB - -#if !CONFIG_NEW_MULTISYMBOL -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT) - for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) - av1_diff_update_prob(&r, &fc->txfm_partition_prob[i], ACCT_STR); -#endif // CONFIG_VAR_TX - for (i = 0; i < SKIP_CONTEXTS; ++i) - av1_diff_update_prob(&r, &fc->skip_probs[i], ACCT_STR); -#endif + av1_setup_skip_mode_allowed(cm); + cm->skip_mode_flag = cm->is_skip_mode_allowed ? aom_rb_read_bit(rb) : 0; - if (!frame_is_intra_only(cm)) { -#if !CONFIG_NEW_MULTISYMBOL - read_inter_mode_probs(fc, &r); -#endif + if (frame_might_allow_warped_motion(cm)) + cm->allow_warped_motion = aom_rb_read_bit(rb); + else + cm->allow_warped_motion = 0; -#if CONFIG_INTERINTRA - if (cm->reference_mode != COMPOUND_REFERENCE && - cm->allow_interintra_compound) { -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { - if (is_interintra_allowed_bsize_group(i)) { - av1_diff_update_prob(&r, &fc->interintra_prob[i], ACCT_STR); - } - } -#endif -#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL -#if CONFIG_EXT_PARTITION_TYPES - int block_sizes_to_update = BLOCK_SIZES_ALL; -#else - int block_sizes_to_update = BLOCK_SIZES; -#endif - for (i = 0; i < block_sizes_to_update; i++) { - if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) { - av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i], ACCT_STR); - } - } -#endif // CONFIG_WEDGE - } -#endif // CONFIG_INTERINTRA + cm->reduced_tx_set_used = aom_rb_read_bit(rb); -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR); -#endif + if (cm->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame wrongly requests reference frame MVs"); + } -#if !CONFIG_NEW_MULTISYMBOL - read_frame_reference_mode_probs(cm, &r); -#endif + if 
(!frame_is_intra_only(cm)) read_global_motion(cm, rb); -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++) - av1_diff_update_prob(&r, &fc->comp_inter_mode_prob[i], ACCT_STR); -#endif // CONFIG_COMPOUND_SINGLEREF + cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + read_film_grain(cm, rb); -#if !CONFIG_NEW_MULTISYMBOL -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level == 0) { -#endif - for (i = 0; i < NMV_CONTEXTS; ++i) - read_mv_probs(&fc->nmvc[i], cm->allow_high_precision_mv, &r); -#if CONFIG_AMVR - } -#endif -#endif -#if CONFIG_SUPERTX - if (!xd->lossless[0]) read_supertx_probs(fc, &r); -#endif +#if EXT_TILE_DEBUG + if (pbi->ext_tile_debug && cm->large_scale_tile) { + read_ext_tile_info(pbi, rb); + av1_set_single_tile_decoding_mode(cm); } - - return aom_reader_has_error(&r); -#endif // CONFIG_RESTRICT_COMPRESSED_HDR -} - -#ifdef NDEBUG -#define debug_check_frame_counts(cm) (void)0 -#else // !NDEBUG -// Counts should only be incremented when frame_parallel_decoding_mode and -// error_resilient_mode are disabled. 
-static void debug_check_frame_counts(const AV1_COMMON *const cm) { - FRAME_COUNTS zero_counts; - av1_zero(zero_counts); - assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD || - cm->error_resilient_mode); - assert(!memcmp(cm->counts.partition, zero_counts.partition, - sizeof(cm->counts.partition))); - assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp, - sizeof(cm->counts.switchable_interp))); - assert(!memcmp(cm->counts.inter_compound_mode, - zero_counts.inter_compound_mode, - sizeof(cm->counts.inter_compound_mode))); -#if CONFIG_INTERINTRA - assert(!memcmp(cm->counts.interintra, zero_counts.interintra, - sizeof(cm->counts.interintra))); -#if CONFIG_WEDGE - assert(!memcmp(cm->counts.wedge_interintra, zero_counts.wedge_interintra, - sizeof(cm->counts.wedge_interintra))); -#endif // CONFIG_WEDGE -#endif // CONFIG_INTERINTRA - assert(!memcmp(cm->counts.compound_interinter, - zero_counts.compound_interinter, - sizeof(cm->counts.compound_interinter))); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - assert(!memcmp(cm->counts.motion_mode, zero_counts.motion_mode, - sizeof(cm->counts.motion_mode))); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - assert(!memcmp(cm->counts.ncobmc_mode, zero_counts.ncobmc_mode, - sizeof(cm->counts.ncobmc_mode))); -#endif - assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter, - sizeof(cm->counts.intra_inter))); -#if CONFIG_COMPOUND_SINGLEREF - assert(!memcmp(cm->counts.comp_inter_mode, zero_counts.comp_inter_mode, - sizeof(cm->counts.comp_inter_mode))); -#endif // CONFIG_COMPOUND_SINGLEREF - assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter, - sizeof(cm->counts.comp_inter))); -#if CONFIG_EXT_COMP_REFS - assert(!memcmp(cm->counts.comp_ref_type, zero_counts.comp_ref_type, - sizeof(cm->counts.comp_ref_type))); - assert(!memcmp(cm->counts.uni_comp_ref, zero_counts.uni_comp_ref, - sizeof(cm->counts.uni_comp_ref))); -#endif 
// CONFIG_EXT_COMP_REFS - assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref, - sizeof(cm->counts.single_ref))); - assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref, - sizeof(cm->counts.comp_ref))); -#if CONFIG_EXT_REFS - assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref, - sizeof(cm->counts.comp_bwdref))); -#endif // CONFIG_EXT_REFS - assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size, - sizeof(cm->counts.tx_size))); - assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip))); - assert( - !memcmp(&cm->counts.mv[0], &zero_counts.mv[0], sizeof(cm->counts.mv[0]))); - assert( - !memcmp(&cm->counts.mv[1], &zero_counts.mv[1], sizeof(cm->counts.mv[0]))); +#endif // EXT_TILE_DEBUG + return 0; } -#endif // NDEBUG -static struct aom_read_bit_buffer *init_read_bit_buffer( +struct aom_read_bit_buffer *av1_init_read_bit_buffer( AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, - const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]) { + const uint8_t *data_end) { rb->bit_offset = 0; rb->error_handler = error_handler; rb->error_handler_data = &pbi->common; - if (pbi->decrypt_cb) { - const int n = (int)AOMMIN(MAX_AV1_HEADER_SIZE, data_end - data); - pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n); - rb->bit_buffer = clear_data; - rb->bit_buffer_end = clear_data + n; - } else { - rb->bit_buffer = data; - rb->bit_buffer_end = data_end; - } + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; return rb; } -//------------------------------------------------------------------------------ - -void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width, - int *height) { - *width = aom_rb_read_literal(rb, 16) + 1; - *height = aom_rb_read_literal(rb, 16) + 1; +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height) { + *width = aom_rb_read_literal(rb, num_bits_width) + 1; + *height = aom_rb_read_literal(rb, 
num_bits_height) + 1; } BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) { - int profile = aom_rb_read_bit(rb); - profile |= aom_rb_read_bit(rb) << 1; - if (profile > 2) profile += aom_rb_read_bit(rb); + int profile = aom_rb_read_literal(rb, PROFILE_BITS); return (BITSTREAM_PROFILE)profile; } -static void make_update_tile_list_dec(AV1Decoder *pbi, int tile_rows, - int tile_cols, FRAME_CONTEXT *ec_ctxs[]) { - int i; - for (i = 0; i < tile_rows * tile_cols; ++i) - ec_ctxs[i] = &pbi->tile_data[i].tctx; -} - -#if CONFIG_FRAME_SUPERRES void superres_post_decode(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; - if (av1_superres_unscaled(cm)) return; + if (!av1_superres_scaled(cm)) return; + assert(!cm->all_lossless); lock_buffer_pool(pool); av1_superres_upscale(cm, pool); unlock_buffer_pool(pool); } -#endif // CONFIG_FRAME_SUPERRES - -static void dec_setup_frame_boundary_info(AV1_COMMON *const cm) { -// Note: When LOOPFILTERING_ACROSS_TILES is enabled, we need to clear the -// boundary information every frame, since the tile boundaries may -// change every frame (particularly when dependent-horztiles is also -// enabled); when it is disabled, the only information stored is the frame -// boundaries, which only depend on the frame size. 
-#if !CONFIG_LOOPFILTERING_ACROSS_TILES - if (cm->width != cm->last_width || cm->height != cm->last_height) -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES - { - int row, col; - for (row = 0; row < cm->mi_rows; ++row) { - MODE_INFO *mi = cm->mi + row * cm->mi_stride; - for (col = 0; col < cm->mi_cols; ++col) { - mi->mbmi.boundary_info = 0; - mi++; - } - } - av1_setup_frame_boundary_info(cm); - } -} -size_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end) { +int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &pbi->mb; - struct aom_read_bit_buffer rb; - uint8_t clear_data[MAX_AV1_HEADER_SIZE]; - size_t first_partition_size; - YV12_BUFFER_CONFIG *new_fb; -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - RefBuffer *last_fb_ref_buf = &cm->frame_refs[LAST_FRAME - LAST_FRAME]; -#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - -#if CONFIG_ADAPT_SCAN - av1_deliver_eob_threshold(cm, xd); -#endif + #if CONFIG_BITSTREAM_DEBUG bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame); #endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif -#if CONFIG_GLOBAL_MOTION - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { cm->global_motion[i] = default_warp_params; cm->cur_frame->global_motion[i] = default_warp_params; } xd->global_motion = cm->global_motion; -#endif // CONFIG_GLOBAL_MOTION - first_partition_size = read_uncompressed_header( - pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); + read_uncompressed_header(pbi, rb); + + if (trailing_bits_present) av1_check_trailing_bits(pbi, rb); -#if CONFIG_EXT_TILE // If cm->single_tile_decoding = 0, the independent 
decoding of a single tile // or a section of a frame is not allowed. if (!cm->single_tile_decoding && @@ -5534,268 +4315,160 @@ size_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, const uint8_t *data, pbi->dec_tile_row = -1; pbi->dec_tile_col = -1; } -#endif // CONFIG_EXT_TILE - pbi->first_partition_size = first_partition_size; - pbi->uncomp_hdr_size = aom_rb_bytes_read(&rb); - new_fb = get_frame_new_buffer(cm); + pbi->uncomp_hdr_size = aom_rb_bytes_read(rb); + YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm); xd->cur_buf = new_fb; -#if CONFIG_INTRABC -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &xd->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, - xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, - cm->use_highbitdepth); -#else - av1_setup_scale_factors_for_frame( - &xd->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, - xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_INTRABC + if (av1_allow_intrabc(cm)) { + av1_setup_scale_factors_for_frame( + &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, + xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); + } if (cm->show_existing_frame) { // showing a frame directly - *p_data_end = data + aom_rb_bytes_read(&rb); + *p_data_end = data + aom_rb_bytes_read(rb); + if (cm->reset_decoder_state) { + // Use the default frame context values. 
+ *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; + if (!cm->fc->initialized) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + } return 0; } - data += aom_rb_bytes_read(&rb); - if (first_partition_size) - if (!read_is_valid(data, first_partition_size, data_end)) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt header length"); - cm->setup_mi(cm); -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - // NOTE(zoeliu): As cm->prev_frame can take neither a frame of - // show_exisiting_frame=1, nor can it take a frame not used as - // a reference, it is probable that by the time it is being - // referred to, the frame buffer it originally points to may - // already get expired and have been reassigned to the current - // newly coded frame. Hence, we need to check whether this is - // the case, and if yes, we have 2 choices: - // (1) Simply disable the use of previous frame mvs; or - // (2) Have cm->prev_frame point to one reference frame buffer, - // e.g. LAST_FRAME. - if (!dec_is_ref_frame_buf(pbi, cm->prev_frame)) { - // Reassign the LAST_FRAME buffer to cm->prev_frame. - cm->prev_frame = last_fb_ref_buf->idx != INVALID_IDX - ? 
&cm->buffer_pool->frame_bufs[last_fb_ref_buf->idx] - : NULL; - } -#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - -#if CONFIG_TEMPMV_SIGNALING - if (cm->use_prev_frame_mvs && !frame_can_use_prev_frame_mvs(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Frame wrongly requests previous frame MVs"); - } -#else - cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame && -#if CONFIG_FRAME_SUPERRES - cm->width == cm->last_width && - cm->height == cm->last_height && -#else - cm->width == cm->prev_frame->buf.y_crop_width && - cm->height == cm->prev_frame->buf.y_crop_height && -#endif // CONFIG_FRAME_SUPERRES - !cm->last_intra_only && cm->last_show_frame && - (cm->last_frame_type != KEY_FRAME); -#endif // CONFIG_TEMPMV_SIGNALING - -#if CONFIG_MFMV + cm->current_frame_seg_map = cm->cur_frame->seg_map; + av1_setup_motion_field(cm); -#endif // CONFIG_MFMV - av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (cm->error_resilient_mode || frame_is_intra_only(cm)) { + av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes); + if (cm->primary_ref_frame == PRIMARY_REF_NONE) { // use the default frame context values *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; - cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; } else { - *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx]; - cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx]; + *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx]; } -#else - *cm->fc = cm->frame_contexts[cm->frame_context_idx]; - cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx]; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING if (!cm->fc->initialized) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); - av1_zero(cm->counts); - xd->corrupted = 0; - if (first_partition_size) { - new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); - if 
(new_fb->corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data header is corrupted."); - } - return first_partition_size; + return 0; } -void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end, int startTile, - int endTile, int initialize_flag) { +// Once-per-frame initialization +static void setup_frame_info(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - int context_updated = 0; -#if CONFIG_LOOP_RESTORATION if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || cm->rst_info[1].frame_restoration_type != RESTORE_NONE || cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { av1_alloc_restoration_buffers(cm); } -#endif - -#if !CONFIG_LOOPFILTER_LEVEL - if (cm->lf.filter_level && !cm->skip_loop_filter) { - av1_loop_filter_frame_init(cm, cm->lf.filter_level, cm->lf.filter_level); + const int use_highbd = cm->use_highbitdepth ? 1 : 0; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + if (pbi->td.mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(&pbi->td, use_highbd); + allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); } -#endif +} - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. 
- if (cm->frame_parallel_decode && initialize_flag && - cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) { - AVxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) { - context_updated = 1; -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->frame_contexts[cm->new_fb_idx] = *cm->fc; -#else - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING - } - av1_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - av1_frameworker_signal_stats(worker); - av1_frameworker_unlock_stats(worker); - } - - dec_setup_frame_boundary_info(cm); - - if (pbi->max_threads > 1 && !CONFIG_CB4X4 && -#if CONFIG_EXT_TILE - pbi->dec_tile_col < 0 && // Decoding all columns -#endif // CONFIG_EXT_TILE - cm->tile_cols > 1) { - // Multi-threaded tile decoder - *p_data_end = - decode_tiles_mt(pbi, data + pbi->first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { -// If multiple threads are used to decode tiles, then we use those -// threads to do parallel loopfiltering. -#if CONFIG_LOOPFILTER_LEVEL - av1_loop_filter_frame_mt( - (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, pbi->mb.plane, - cm->lf.filter_level[0], cm->lf.filter_level[1], 0, 0, - pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); -#else - av1_loop_filter_frame_mt((YV12_BUFFER_CONFIG *)xd->cur_buf, cm, - pbi->mb.plane, cm->lf.filter_level, 0, 0, - pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); -#endif // CONFIG_LOOPFILTER_LEVEL - } - } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Decode failed. 
Frame data is corrupted."); - } - } else { -#if CONFIG_OBU - *p_data_end = decode_tiles(pbi, data, data_end, startTile, endTile); -#else - *p_data_end = decode_tiles( - pbi, data + pbi->uncomp_hdr_size + pbi->first_partition_size, data_end, - startTile, endTile); -#endif - } +void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const int tile_count_tg = end_tile - start_tile + 1; - if (endTile != cm->tile_rows * cm->tile_cols - 1) { - return; - } + if (initialize_flag) setup_frame_info(pbi); -#if CONFIG_STRIPED_LOOP_RESTORATION - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm); - } -#endif + if (pbi->max_threads > 1 && tile_count_tg > 1 && !cm->large_scale_tile) + *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); + else + *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); -#if CONFIG_CDEF - if (!cm->skip_loop_filter && !cm->all_lossless) { - av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb); + const int num_planes = av1_num_planes(cm); + // If the bit stream is monochrome, set the U and V buffers to a constant. 
+ if (num_planes < 3) set_planes_to_neutral_grey(cm, xd->cur_buf, 1); + + if (end_tile != cm->tile_rows * cm->tile_cols - 1) { + return; } -#endif // CONFIG_CDEF -#if CONFIG_FRAME_SUPERRES - superres_post_decode(pbi); -#endif // CONFIG_FRAME_SUPERRES + if (!cm->allow_intrabc && !cm->single_tile_decoding) { + if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { +#if LOOP_FILTER_BITMASK + av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0, + num_planes, 0); +#else + if (pbi->num_workers > 1) { + av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0, + num_planes, 0, pbi->tile_workers, + pbi->num_workers, &pbi->lf_row_sync); + } else { + av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0, + num_planes, 0); + } +#endif + } -#if CONFIG_LOOP_RESTORATION - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - aom_extend_frame_borders((YV12_BUFFER_CONFIG *)xd->cur_buf); - av1_loop_restoration_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm, - cm->rst_info, 7, 0, NULL); + const int do_loop_restoration = + cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE; + const int do_cdef = + !cm->skip_loop_filter && !cm->coded_lossless && + (cm->cdef_bits || cm->cdef_strengths[0] || cm->cdef_uv_strengths[0]); + const int do_superres = av1_superres_scaled(cm); + const int optimized_loop_restoration = !do_cdef && !do_superres; + + if (!optimized_loop_restoration) { + if (do_loop_restoration) + av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 0); + + if (do_cdef) av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb); + + superres_post_decode(pbi); + + if (do_loop_restoration) { + av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1); + if (pbi->num_workers > 1) { + 
av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } else { + // In no cdef and no superres case. Provide an optimized version of + // loop_restoration_filter. + if (do_loop_restoration) { + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } } -#endif // CONFIG_LOOP_RESTORATION if (!xd->corrupted) { if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&pbi->tile_data[0].tctx)); - aom_cdf_prob **cdf_ptrs = - aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&pbi->tile_data[0].tctx.partition_cdf[0][0])); - make_update_tile_list_dec(pbi, cm->tile_rows, cm->tile_cols, tile_ctxs); -#if CONFIG_LV_MAP - av1_adapt_coef_probs(cm); -#endif // CONFIG_LV_MAP -#if CONFIG_SYMBOLRATE - av1_dump_symbol_rate(cm); -#endif - av1_adapt_intra_frame_probs(cm); - av1_average_tile_coef_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); - av1_average_tile_intra_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); -#if CONFIG_PVQ - av1_average_tile_pvq_cdfs(pbi->common.fc, tile_ctxs, - cm->tile_rows * cm->tile_cols); -#endif // CONFIG_PVQ -#if CONFIG_ADAPT_SCAN - av1_adapt_scan_order(cm); -#endif // CONFIG_ADAPT_SCAN - - if (!frame_is_intra_only(cm)) { - av1_adapt_inter_frame_probs(cm); -#if !CONFIG_NEW_MULTISYMBOL - av1_adapt_mv_probs(cm, cm->allow_high_precision_mv); 
-#endif - av1_average_tile_inter_cdfs(&pbi->common, pbi->common.fc, tile_ctxs, - cdf_ptrs, cm->tile_rows * cm->tile_cols); - av1_average_tile_mv_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); - } - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - } else { - debug_check_frame_counts(cm); + *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); } } else { aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, @@ -5808,153 +4481,8 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, } #endif -// Non frame parallel update frame context here. -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!context_updated) cm->frame_contexts[cm->new_fb_idx] = *cm->fc; -#else - if (!cm->error_resilient_mode && !context_updated) - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif -} - -#if CONFIG_OBU - -static OBU_TYPE read_obu_header(struct aom_read_bit_buffer *rb, - uint32_t *header_size) { - OBU_TYPE obu_type; - int obu_extension_flag; - - *header_size = 1; - - obu_type = (OBU_TYPE)aom_rb_read_literal(rb, 5); - aom_rb_read_literal(rb, 2); // reserved - obu_extension_flag = aom_rb_read_bit(rb); - if (obu_extension_flag) { - *header_size += 1; - aom_rb_read_literal(rb, 3); // temporal_id - aom_rb_read_literal(rb, 2); - aom_rb_read_literal(rb, 2); - aom_rb_read_literal(rb, 1); // reserved - } - - return obu_type; -} - -static uint32_t read_temporal_delimiter_obu() { return 0; } - -static uint32_t read_sequence_header_obu(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb) { - AV1_COMMON *const cm = &pbi->common; - SequenceHeader *const seq_params = &cm->seq_params; - uint32_t saved_bit_offset = rb->bit_offset; - - cm->profile = av1_read_profile(rb); - aom_rb_read_literal(rb, 4); // level - - seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); - if (seq_params->frame_id_numbers_present_flag) { - seq_params->frame_id_length_minus7 = aom_rb_read_literal(rb, 4); - 
seq_params->delta_frame_id_length_minus2 = aom_rb_read_literal(rb, 4); - } - - read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth); - - return ((rb->bit_offset - saved_bit_offset + 7) >> 3); -} - -static uint32_t read_frame_header_obu(AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end) { - size_t header_size; - - header_size = - av1_decode_frame_headers_and_setup(pbi, data, data_end, p_data_end); - return (uint32_t)(pbi->uncomp_hdr_size + header_size); -} - -static uint32_t read_tile_group_header(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb, - int *startTile, int *endTile) { - AV1_COMMON *const cm = &pbi->common; - uint32_t saved_bit_offset = rb->bit_offset; - - *startTile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); - *endTile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); - - return ((rb->bit_offset - saved_bit_offset + 7) >> 3); -} - -static uint32_t read_one_tile_group_obu(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb, - int is_first_tg, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end, - int *is_last_tg) { - AV1_COMMON *const cm = &pbi->common; - int startTile, endTile; - uint32_t header_size, tg_payload_size; - - header_size = read_tile_group_header(pbi, rb, &startTile, &endTile); - data += header_size; - av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, startTile, - endTile, is_first_tg); - tg_payload_size = (uint32_t)(*p_data_end - data); - - // TODO(shan): For now, assume all tile groups received in order - *is_last_tg = endTile == cm->tile_rows * cm->tile_cols - 1; - - return header_size + tg_payload_size; -} - -void av1_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end) { - AV1_COMMON *const cm = &pbi->common; - int frame_decoding_finished = 0; - int is_first_tg_obu_received = 1; - int frame_header_received = 0; - int 
frame_header_size = 0; - - // decode frame as a series of OBUs - while (!frame_decoding_finished && !cm->error.error_code) { - struct aom_read_bit_buffer rb; - uint8_t clear_data[80]; - uint32_t obu_size, obu_header_size, obu_payload_size = 0; - OBU_TYPE obu_type; - - init_read_bit_buffer(pbi, &rb, data + 4, data_end, clear_data); - - // every obu is preceded by 4-byte size of obu (obu header + payload size) - // The obu size is only needed for tile group OBUs - obu_size = mem_get_le32(data); - obu_type = read_obu_header(&rb, &obu_header_size); - data += (4 + obu_header_size); - - switch (obu_type) { - case OBU_TD: obu_payload_size = read_temporal_delimiter_obu(); break; - case OBU_SEQUENCE_HEADER: - obu_payload_size = read_sequence_header_obu(pbi, &rb); - break; - case OBU_FRAME_HEADER: - // Only decode first frame header received - if (!frame_header_received) { - frame_header_size = obu_payload_size = - read_frame_header_obu(pbi, data, data_end, p_data_end); - frame_header_received = 1; - } else { - obu_payload_size = frame_header_size; - } - if (cm->show_existing_frame) frame_decoding_finished = 1; - break; - case OBU_TILE_GROUP: - obu_payload_size = read_one_tile_group_obu( - pbi, &rb, is_first_tg_obu_received, data, data + obu_size - 1, - p_data_end, &frame_decoding_finished); - is_first_tg_obu_received = 0; - break; - default: break; - } - data += obu_payload_size; + // Non frame parallel update frame context here. 
+ if (!cm->large_scale_tile) { + cm->frame_contexts[cm->new_fb_idx] = *cm->fc; } } -#endif diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h index 0e7eb6a1d..330cedcdc 100644 --- a/third_party/aom/av1/decoder/decodeframe.h +++ b/third_party/aom/av1/decoder/decodeframe.h @@ -19,35 +19,59 @@ extern "C" { struct AV1Decoder; struct aom_read_bit_buffer; -#if CONFIG_REFERENCE_BUFFER -/* Placeholder for now */ -void read_sequence_header(SequenceHeader *seq_params, - struct aom_read_bit_buffer *rb); -#endif +// Reads the middle part of the sequence header OBU (from +// frame_width_bits_minus_1 to enable_restoration) into cm->seq_params (a +// SequenceHeader). Reports errors by calling rb->error_handler() or +// aom_internal_error(). +void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb); -void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width, - int *height); +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height); BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); -// This function is now obsolete -void av1_decode_frame(struct AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, const uint8_t **p_data_end); -size_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, - const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end); +// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on +// failure. 
+int av1_check_trailing_bits(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb); + +int av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present); void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, int startTile, int endTile, int initialize_flag); -#if CONFIG_OBU -// replaces av1_decode_frame -void av1_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end); -#endif +// Implements the color_config() function in the spec. Reports errors by +// calling rb->error_handler() or aom_internal_error(). +void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + int allow_lowbitdepth); + +// Implements the timing_info() function in the spec. Reports errors by calling +// rb->error_handler(). +void av1_read_timing_info_header(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb); + +// Implements the decoder_model_info() function in the spec. Reports errors by +// calling rb->error_handler(). +void av1_read_decoder_model_info(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb); + +// Implements the operating_parameters_info() function in the spec. Reports +// errors by calling rb->error_handler() or aom_internal_error(). 
+void av1_read_op_parameters_info(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb, int op_num); + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end); + +void av1_free_mc_tmp_buf(void *td, int use_highbd); + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c index cac27e9a6..cc8f4d29e 100644 --- a/third_party/aom/av1/decoder/decodemv.c +++ b/third_party/aom/av1/decoder/decodemv.c @@ -11,6 +11,7 @@ #include +#include "av1/common/cfl.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" @@ -18,13 +19,9 @@ #include "av1/common/mvref_common.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" -#if CONFIG_EXT_INTRA #include "av1/common/reconintra.h" -#endif // CONFIG_EXT_INTRA #include "av1/common/seg_common.h" -#if CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_WARPED_MOTION #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" @@ -39,30 +36,51 @@ static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) { return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR); } -static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r, - MB_MODE_INFO *const mbmi, int mi_col, int mi_row) { - FRAME_COUNTS *counts = xd->counts; +static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd, + int mi_col, int mi_row) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + if (cm->coded_lossless) return; + if (cm->allow_intrabc) { + assert(cm->cdef_bits == 0); + return; + } + + if (!(mi_col & (cm->seq_params.mib_size - 1)) && + !(mi_row & (cm->seq_params.mib_size - 1))) { // Top left? 
+ xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] = + xd->cdef_preset[3] = -1; + } + // Read CDEF param at the first non-skip coding block + const int mask = (1 << (6 - MI_SIZE_LOG2)); + const int m = ~(mask - 1); + const int index = cm->seq_params.sb_size == BLOCK_128X128 + ? !!(mi_col & mask) + 2 * !!(mi_row & mask) + : 0; + cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)] + ->cdef_strength = xd->cdef_preset[index] = + xd->cdef_preset[index] == -1 && !mbmi->skip + ? aom_read_literal(r, cm->cdef_bits, ACCT_STR) + : xd->cdef_preset[index]; +} + +static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_reader *r, MB_MODE_INFO *const mbmi, + int mi_col, int mi_row) { int sign, abs, reduced_delta_qindex = 0; BLOCK_SIZE bsize = mbmi->sb_type; - const int b_col = mi_col & MAX_MIB_MASK; - const int b_row = mi_row & MAX_MIB_MASK; + const int b_col = mi_col & (cm->seq_params.mib_size - 1); + const int b_row = mi_row & (cm->seq_params.mib_size - 1); const int read_delta_q_flag = (b_col == 0 && b_row == 0); - int rem_bits, thr; - int i, smallval; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - if ((bsize != BLOCK_LARGEST || mbmi->skip == 0) && read_delta_q_flag) { + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_q_flag) { abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); - smallval = (abs < DELTA_Q_SMALL); - if (counts) { - for (i = 0; i < abs; ++i) counts->delta_q[i][1]++; - if (smallval) counts->delta_q[abs][0]++; - } + const int smallval = (abs < DELTA_Q_SMALL); if (!smallval) { - rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; - thr = (1 << rem_bits) + 1; + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; } @@ -76,56 +94,33 @@ static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r, } return reduced_delta_qindex; } -#if CONFIG_EXT_DELTA_Q -static int 
read_delta_lflevel(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r, -#if CONFIG_LOOPFILTER_LEVEL - int lf_id, -#endif +static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_reader *r, int lf_id, MB_MODE_INFO *const mbmi, int mi_col, int mi_row) { - FRAME_COUNTS *counts = xd->counts; int sign, abs, reduced_delta_lflevel = 0; BLOCK_SIZE bsize = mbmi->sb_type; - const int b_col = mi_col & MAX_MIB_MASK; - const int b_row = mi_row & MAX_MIB_MASK; + const int b_col = mi_col & (cm->seq_params.mib_size - 1); + const int b_row = mi_row & (cm->seq_params.mib_size - 1); const int read_delta_lf_flag = (b_col == 0 && b_row == 0); - int rem_bits, thr; - int i, smallval; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - if ((bsize != cm->sb_size || mbmi->skip == 0) && read_delta_lf_flag) { -#if CONFIG_LOOPFILTER_LEVEL + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_lf_flag) { if (cm->delta_lf_multi) { - assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT); + assert(lf_id >= 0 && + lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1, ACCT_STR); } else { abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1, ACCT_STR); } -#else - abs = - aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1, ACCT_STR); -#endif // CONFIG_LOOPFILTER_LEVEL - smallval = (abs < DELTA_LF_SMALL); - if (counts) { -#if CONFIG_LOOPFILTER_LEVEL - if (cm->delta_lf_multi) { - for (i = 0; i < abs; ++i) counts->delta_lf_multi[lf_id][i][1]++; - if (smallval) counts->delta_lf_multi[lf_id][abs][0]++; - } else { - for (i = 0; i < abs; ++i) counts->delta_lf[i][1]++; - if (smallval) counts->delta_lf[abs][0]++; - } -#else - for (i = 0; i < abs; ++i) counts->delta_lf[i][1]++; - if (smallval) counts->delta_lf[abs][0]++; -#endif // CONFIG_LOOPFILTER_LEVEL - } + const int smallval = (abs < DELTA_LF_SMALL); if (!smallval) { - rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; - thr = (1 << rem_bits) + 1; + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; } @@ -139,21 +134,17 @@ static int read_delta_lflevel(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r, } return reduced_delta_lflevel; } -#endif static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx, aom_reader *r, + CFL_ALLOWED_TYPE cfl_allowed, PREDICTION_MODE y_mode) { const UV_PREDICTION_MODE uv_mode = -#if CONFIG_CFL - aom_read_symbol(r, ec_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES, ACCT_STR); -#else - read_intra_mode(r, ec_ctx->uv_mode_cdf[y_mode]); -#endif // CONFIG_CFL + aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed, ACCT_STR); return uv_mode; } -#if CONFIG_CFL static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, int *signs_out) { const int joint_sign = @@ -172,400 +163,145 @@ static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, *signs_out = joint_sign; 
return idx; } -#endif -#if CONFIG_INTERINTRA -static INTERINTRA_MODE read_interintra_mode(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_reader *r, int size_group) { - (void)cm; +static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r, + int size_group) { const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol( r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES, ACCT_STR); - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->interintra_mode[size_group][ii_mode]; return ii_mode; } -#endif // CONFIG_INTERINTRA -static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd, - aom_reader *r, int16_t ctx) { - FRAME_COUNTS *counts = xd->counts; +static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r, + int16_t ctx) { int16_t mode_ctx = ctx & NEWMV_CTX_MASK; int is_newmv, is_zeromv, is_refmv; -#if CONFIG_NEW_MULTISYMBOL is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0; -#else - is_newmv = aom_read(r, ec_ctx->newmv_prob[mode_ctx], ACCT_STR) == 0; -#endif - - if (is_newmv) { - if (counts) ++counts->newmv_mode[mode_ctx][0]; - return NEWMV; - } - if (counts) ++counts->newmv_mode[mode_ctx][1]; - - if (ctx & (1 << ALL_ZERO_FLAG_OFFSET)) return ZEROMV; + if (is_newmv) return NEWMV; - mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - -#if CONFIG_NEW_MULTISYMBOL + mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; is_zeromv = aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0; -#else - is_zeromv = aom_read(r, ec_ctx->zeromv_prob[mode_ctx], ACCT_STR) == 0; -#endif - if (is_zeromv) { - if (counts) ++counts->zeromv_mode[mode_ctx][0]; - return ZEROMV; - } - if (counts) ++counts->zeromv_mode[mode_ctx][1]; + if (is_zeromv) return GLOBALMV; mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - - if (ctx & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; - if (ctx & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; - if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) 
mode_ctx = 8; - -#if CONFIG_NEW_MULTISYMBOL is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0; -#else - is_refmv = aom_read(r, ec_ctx->refmv_prob[mode_ctx], ACCT_STR) == 0; -#endif - - if (is_refmv) { - if (counts) ++counts->refmv_mode[mode_ctx][0]; - + if (is_refmv) return NEARESTMV; - } else { - if (counts) ++counts->refmv_mode[mode_ctx][1]; + else return NEARMV; - } - - // Invalid prediction mode. - assert(0); } static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, aom_reader *r) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); mbmi->ref_mv_idx = 0; - - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - || mbmi->mode == SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - int idx; - for (idx = 0; idx < 2; ++idx) { + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { if (xd->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); -#else - int drl_idx = aom_read(r, ec_ctx->drl_prob[drl_ctx], ACCT_STR); -#endif mbmi->ref_mv_idx = idx + drl_idx; - if (xd->counts) ++xd->counts->drl_mode[drl_ctx][drl_idx]; if (!drl_idx) return; } } } - if (have_nearmv_in_inter_mode(mbmi->mode)) { - int idx; // Offset the NEARESTMV mode. // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV // mode is factored in. 
- for (idx = 1; idx < 3; ++idx) { + for (int idx = 1; idx < 3; ++idx) { if (xd->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); -#else - int drl_idx = aom_read(r, ec_ctx->drl_prob[drl_ctx], ACCT_STR); -#endif mbmi->ref_mv_idx = idx + drl_idx - 1; - if (xd->counts) ++xd->counts->drl_mode[drl_ctx][drl_idx]; if (!drl_idx) return; } } } } -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd, - MODE_INFO *mi, aom_reader *r) { - MB_MODE_INFO *mbmi = &mi->mbmi; -#if !CONFIG_MOTION_VAR || !CONFIG_WARPED_MOTION || CONFIG_NEW_MULTISYMBOL || \ - CONFIG_NCOBMC_ADAPT_WEIGHT - (void)cm; -#endif + MB_MODE_INFO *mbmi, aom_reader *r) { + if (cm->switchable_motion_mode == 0) return SIMPLE_TRANSLATION; + if (mbmi->skip_mode) return SIMPLE_TRANSLATION; - const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); + const MOTION_MODE last_motion_mode_allowed = + motion_mode_allowed(xd->global_motion, xd, mbmi, cm->allow_warped_motion); int motion_mode; - FRAME_COUNTS *counts = xd->counts; if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION; -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) { - motion_mode = aom_read_symbol(r, xd->tile_ctx->ncobmc_cdf[mbmi->sb_type], - OBMC_FAMILY_MODES, ACCT_STR); - if (counts) ++counts->ncobmc[mbmi->sb_type][motion_mode]; - return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); - } else if (last_motion_mode_allowed == OBMC_CAUSAL) { - motion_mode = - aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR); - if (counts) ++counts->obmc[mbmi->sb_type][motion_mode]; - return 
(MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); - } else { -#else + if (last_motion_mode_allowed == OBMC_CAUSAL) { -#if CONFIG_NEW_MULTISYMBOL motion_mode = aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR); -#else - motion_mode = aom_read(r, cm->fc->obmc_prob[mbmi->sb_type], ACCT_STR); -#endif - if (counts) ++counts->obmc[mbmi->sb_type][motion_mode]; return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); } else { -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], MOTION_MODES, ACCT_STR); - if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode]; return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION } -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void read_ncobmc_mode(MACROBLOCKD *xd, MODE_INFO *mi, - NCOBMC_MODE ncobmc_mode[2], aom_reader *r) { - MB_MODE_INFO *mbmi = &mi->mbmi; - FRAME_COUNTS *counts = xd->counts; - ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type]; - if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return; - - ncobmc_mode[0] = aom_read_symbol(r, xd->tile_ctx->ncobmc_mode_cdf[ao_block], - MAX_NCOBMC_MODES, ACCT_STR); - if (counts) ++counts->ncobmc_mode[ao_block][ncobmc_mode[0]]; - - if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { - ncobmc_mode[1] = aom_read_symbol(r, xd->tile_ctx->ncobmc_mode_cdf[ao_block], - MAX_NCOBMC_MODES, ACCT_STR); - if (counts) ++counts->ncobmc_mode[ao_block][ncobmc_mode[1]]; - } -} -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - -static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_reader *r, int16_t ctx) { - (void)cm; +static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r, + int16_t ctx) { const int mode = aom_read_symbol(r, 
xd->tile_ctx->inter_compound_mode_cdf[ctx], INTER_COMPOUND_MODES, ACCT_STR); - FRAME_COUNTS *counts = xd->counts; - - if (counts) ++counts->inter_compound_mode[ctx][mode]; - assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode)); return NEAREST_NEARESTMV + mode; } -#if CONFIG_COMPOUND_SINGLEREF -static PREDICTION_MODE read_inter_singleref_comp_mode(MACROBLOCKD *xd, - aom_reader *r, - int16_t ctx) { - const int mode = - aom_read_symbol(r, xd->tile_ctx->inter_singleref_comp_mode_cdf[ctx], - INTER_SINGLEREF_COMP_MODES, ACCT_STR); - FRAME_COUNTS *counts = xd->counts; - - if (counts) ++counts->inter_singleref_comp_mode[ctx][mode]; - - assert(is_inter_singleref_comp_mode(SR_NEAREST_NEARMV + mode)); - return SR_NEAREST_NEARMV + mode; -} -#endif // CONFIG_COMPOUND_SINGLEREF - -static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) { - return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR); -} - -#if CONFIG_VAR_TX -static void read_tx_size_vartx(AV1_COMMON *cm, MACROBLOCKD *xd, - MB_MODE_INFO *mbmi, FRAME_COUNTS *counts, - TX_SIZE tx_size, int depth, int blk_row, - int blk_col, aom_reader *r) { -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; -#endif - int is_split = 0; - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; - const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); - const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, - mbmi->sb_type, tx_size); - TX_SIZE(*const inter_tx_size) - [MAX_MIB_SIZE] = - (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col]; - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - assert(tx_size > TX_4X4); - - if (depth == MAX_VARTX_DEPTH) { - int idx, idy; - inter_tx_size[0][0] = tx_size; - for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; 
++idx) - inter_tx_size[idy][idx] = tx_size; - mbmi->tx_size = tx_size; - mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size)); - txfm_partition_update(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, tx_size, tx_size); - return; - } - -#if CONFIG_NEW_MULTISYMBOL - is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); -#else - is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR); -#endif - - if (is_split) { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - - if (counts) ++counts->txfm_partition[ctx][1]; - - if (sub_txs == TX_4X4) { - int idx, idy; - inter_tx_size[0][0] = sub_txs; - for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = inter_tx_size[0][0]; - mbmi->tx_size = sub_txs; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); - txfm_partition_update(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, sub_txs, tx_size); - return; - } - - assert(bsl > 0); - for (i = 0; i < 4; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; - read_tx_size_vartx(cm, xd, mbmi, counts, sub_txs, depth + 1, offsetr, - offsetc, r); +int av1_neg_deinterleave(int diff, int ref, int max) { + if (!ref) return diff; + if (ref >= (max - 1)) return max - diff - 1; + if (2 * ref < max) { + if (diff <= 2 * ref) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); } + return diff; } else { - int idx, idy; - inter_tx_size[0][0] = tx_size; - for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = tx_size; - mbmi->tx_size = tx_size; - mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size)); - if (counts) ++counts->txfm_partition[ctx][0]; - 
txfm_partition_update(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, tx_size, tx_size); + if (diff <= 2 * (max - ref - 1)) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return max - (diff + 1); } } -#endif -static TX_SIZE read_selected_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, - int32_t tx_size_cat, aom_reader *r) { - FRAME_COUNTS *counts = xd->counts; - const int ctx = get_tx_size_context(xd); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - - const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], - tx_size_cat + 2, ACCT_STR); - const TX_SIZE tx_size = depth_to_tx_size(depth); -#if CONFIG_RECT_TX - assert(!is_rect_tx(tx_size)); -#endif // CONFIG_RECT_TX - if (counts) ++counts->tx_size[tx_size_cat][ctx][depth]; - return tx_size; -} +static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, + int mi_row, int mi_col, aom_reader *r, int skip) { + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num); + if (skip) return pred; -static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter, - int allow_select_inter, aom_reader *r) { - const TX_MODE tx_mode = cm->tx_mode; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4; - - if (block_signals_txsize(bsize)) { - if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { - const int32_t tx_size_cat = is_inter ? 
inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = - read_selected_tx_size(cm, xd, tx_size_cat, r); -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (coded_tx_size > max_txsize_lookup[bsize]) { - assert(coded_tx_size == max_txsize_lookup[bsize] + 1); -#if CONFIG_RECT_TX_EXT - if (is_quarter_tx_allowed(xd, &xd->mi[0]->mbmi, is_inter)) { - int quarter_tx; - - if (quarter_txsize_lookup[bsize] != max_txsize_lookup[bsize]) { -#if CONFIG_NEW_MULTISYMBOL - quarter_tx = - aom_read_symbol(r, cm->fc->quarter_tx_size_cdf, 2, ACCT_STR); -#else - quarter_tx = aom_read(r, cm->fc->quarter_tx_size_prob, ACCT_STR); - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->quarter_tx_size[quarter_tx]; -#endif - } else { - quarter_tx = 1; - } - return quarter_tx ? quarter_txsize_lookup[bsize] - : max_txsize_rect_lookup[bsize]; - } -#endif // CONFIG_RECT_TX_EXT - - return max_txsize_rect_lookup[bsize]; - } -#else - assert(coded_tx_size <= max_txsize_lookup[bsize]); -#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - return coded_tx_size; - } else { - return tx_size_from_tx_mode(bsize, tx_mode, is_inter); - } - } else { -#if CONFIG_EXT_TX && CONFIG_RECT_TX - assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); - return max_txsize_rect_lookup[bsize]; -#else - return TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR); + const int segment_id = + av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); + + if (segment_id < 0 || segment_id > seg->last_active_segid) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Corrupted segment_ids"); } + return segment_id; } static int dec_get_segment_id(const AV1_COMMON 
*cm, const uint8_t *segment_ids, int mi_offset, int x_mis, int y_mis) { - int x, y, segment_id = INT_MAX; + int segment_id = INT_MAX; - for (y = 0; y < y_mis; y++) - for (x = 0; x < x_mis; x++) + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) segment_id = AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); @@ -575,30 +311,28 @@ static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids, static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis, int segment_id) { - int x, y; - assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); - for (y = 0; y < y_mis; y++) - for (x = 0; x < x_mis; x++) + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; } -static int read_intra_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, - int mi_offset, int x_mis, int y_mis, - aom_reader *r) { +static int read_intra_segment_id(AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int mi_row, + int mi_col, int bsize, aom_reader *r, + int skip) { struct segmentation *const seg = &cm->seg; - FRAME_COUNTS *counts = xd->counts; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - struct segmentation_probs *const segp = &ec_ctx->seg; - int segment_id; - if (!seg->enabled) return 0; // Default for disabled segmentation assert(seg->update_map && !seg->temporal_update); - segment_id = read_segment_id(r, segp); - if (counts) ++counts->seg.tree_total[segment_id]; + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh); + const int segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, skip); set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); return segment_id; } @@ -607,24 +341,25 @@ static void copy_segment_id(const AV1_COMMON *cm, const uint8_t 
*last_segment_ids, uint8_t *current_segment_ids, int mi_offset, int x_mis, int y_mis) { - int x, y; - - for (y = 0; y < y_mis; y++) - for (x = 0; x < x_mis; x++) + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) current_segment_ids[mi_offset + y * cm->mi_cols + x] = last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0; } +static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset, + int x_mis, int y_mis) { + return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map, + mi_offset, x_mis, y_mis) + : 0; +} + static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, - int mi_row, int mi_col, aom_reader *r) { + int mi_row, int mi_col, int preskip, + aom_reader *r) { struct segmentation *const seg = &cm->seg; - FRAME_COUNTS *counts = xd->counts; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - struct segmentation_probs *const segp = &ec_ctx->seg; - - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int predicted_segment_id, segment_id; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = mi_size_wide[mbmi->sb_type]; const int bh = mi_size_high[mbmi->sb_type]; @@ -635,60 +370,82 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, if (!seg->enabled) return 0; // Default for disabled segmentation - predicted_segment_id = cm->last_frame_seg_map - ? 
dec_get_segment_id(cm, cm->last_frame_seg_map, - mi_offset, x_mis, y_mis) - : 0; - if (!seg->update_map) { copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, mi_offset, x_mis, y_mis); - return predicted_segment_id; + return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } + + int segment_id; + if (preskip) { + if (!seg->segid_preskip) return 0; + } else { + if (seg->segid_preskip) return mbmi->segment_id; + if (mbmi->skip) { + if (seg->temporal_update) { + mbmi->seg_id_predicted = 0; + } + segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 1); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; + } } if (seg->temporal_update) { const int ctx = av1_get_pred_context_seg_id(xd); -#if CONFIG_NEW_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation_probs *const segp = &ec_ctx->seg; aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx]; mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR); -#else - const aom_prob pred_prob = segp->pred_probs[ctx]; - mbmi->seg_id_predicted = aom_read(r, pred_prob, ACCT_STR); -#endif - if (counts) ++counts->seg.pred[ctx][mbmi->seg_id_predicted]; if (mbmi->seg_id_predicted) { - segment_id = predicted_segment_id; + segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); } else { - segment_id = read_segment_id(r, segp); - if (counts) ++counts->seg.tree_mispred[segment_id]; + segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0); } } else { - segment_id = read_segment_id(r, segp); - if (counts) ++counts->seg.tree_total[segment_id]; + segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0); } set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); return segment_id; } +static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (!cm->skip_mode_flag) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) 
return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + return 0; + } + + const int ctx = av1_get_skip_mode_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip_mode = + aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR); + return skip_mode; +} + static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, aom_reader *r) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int ctx = av1_get_skip_context(xd); -#if CONFIG_NEW_MULTISYMBOL FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR); -#else - const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR); -#endif - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->skip[ctx][skip]; return skip; } } -#if CONFIG_PALETTE_DELTA_ENCODING // Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1]) // and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into // one single sorted list(colors[...]). 
@@ -796,346 +553,114 @@ static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth, } } } -#endif // CONFIG_PALETTE_DELTA_ENCODING static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, - aom_reader *r) { - MODE_INFO *const mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; + int mi_row, int mi_col, aom_reader *r) { + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->allow_screen_content_tools, bsize)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - - assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST); - const int block_palette_idx = bsize - BLOCK_8X8; - int modev; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); if (mbmi->mode == DC_PRED) { - int palette_y_mode_ctx = 0; - if (above_mi) { - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - if (left_mi) { - palette_y_mode_ctx += - (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } -#if CONFIG_NEW_MULTISYMBOL - modev = aom_read_symbol( - r, - xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx], - 2, ACCT_STR); -#else - modev = aom_read( - r, - av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx], + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2, ACCT_STR); -#endif if (modev) { pmi->palette_size[0] = - aom_read_symbol(r, - xd->tile_ctx->palette_y_size_cdf[block_palette_idx], + aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; -#if CONFIG_PALETTE_DELTA_ENCODING read_palette_colors_y(xd, cm->bit_depth, pmi, r); -#else - for (int i = 0; i < pmi->palette_size[0]; ++i) - pmi->palette_colors[i] = aom_read_literal(r, 
cm->bit_depth, ACCT_STR); -#endif // CONFIG_PALETTE_DELTA_ENCODING } } - if (mbmi->uv_mode == UV_DC_PRED) { + if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); -#if CONFIG_NEW_MULTISYMBOL - modev = aom_read_symbol( + const int modev = aom_read_symbol( r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR); -#else - modev = aom_read(r, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx], - ACCT_STR); -#endif if (modev) { pmi->palette_size[1] = - aom_read_symbol(r, - xd->tile_ctx->palette_uv_size_cdf[block_palette_idx], + aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; -#if CONFIG_PALETTE_DELTA_ENCODING read_palette_colors_uv(xd, cm->bit_depth, pmi, r); -#else - for (int i = 0; i < pmi->palette_size[1]; ++i) { - pmi->palette_colors[PALETTE_MAX_SIZE + i] = - aom_read_literal(r, cm->bit_depth, ACCT_STR); - pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = - aom_read_literal(r, cm->bit_depth, ACCT_STR); - } -#endif // CONFIG_PALETTE_DELTA_ENCODING } } } -#if CONFIG_FILTER_INTRA -static void read_filter_intra_mode_info(AV1_COMMON *const cm, - MACROBLOCKD *const xd, int mi_row, - int mi_col, aom_reader *r) { - MODE_INFO *const mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - FRAME_COUNTS *counts = xd->counts; +static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) { + const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR); + return sym - MAX_ANGLE_DELTA; +} + +static void read_filter_intra_mode_info(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; FILTER_INTRA_MODE_INFO *filter_intra_mode_info = &mbmi->filter_intra_mode_info; - if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) { - 
filter_intra_mode_info->use_filter_intra_mode[0] = - aom_read(r, cm->fc->filter_intra_probs[0], ACCT_STR); - if (filter_intra_mode_info->use_filter_intra_mode[0]) { - filter_intra_mode_info->filter_intra_mode[0] = - av1_read_uniform(r, FILTER_INTRA_MODES); + if (av1_filter_intra_allowed(cm, mbmi)) { + filter_intra_mode_info->use_filter_intra = aom_read_symbol( + r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR); + if (filter_intra_mode_info->use_filter_intra) { + filter_intra_mode_info->filter_intra_mode = aom_read_symbol( + r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR); } - if (counts) { - ++counts - ->filter_intra[0][filter_intra_mode_info->use_filter_intra_mode[0]]; - } - } - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, - xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) - return; -#else - (void)mi_row; - (void)mi_col; -#endif // CONFIG_CB4X4 - - if (mbmi->uv_mode == UV_DC_PRED && - mbmi->palette_mode_info.palette_size[1] == 0) { - filter_intra_mode_info->use_filter_intra_mode[1] = - aom_read(r, cm->fc->filter_intra_probs[1], ACCT_STR); - if (filter_intra_mode_info->use_filter_intra_mode[1]) { - filter_intra_mode_info->filter_intra_mode[1] = - av1_read_uniform(r, FILTER_INTRA_MODES); - } - if (counts) { - ++counts - ->filter_intra[1][filter_intra_mode_info->use_filter_intra_mode[1]]; - } - } -} -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_EXT_INTRA -static void read_intra_angle_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, - aom_reader *r) { - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_INTRA_INTERP - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; - const int ctx = av1_get_pred_context_intra_interp(xd); - int p_angle; -#endif // CONFIG_INTRA_INTERP - - (void)cm; - - mbmi->angle_delta[0] = 0; - mbmi->angle_delta[1] = 0; -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP - - if 
(!av1_use_angle_delta(bsize)) return; - - if (av1_is_directional_mode(mbmi->mode, bsize)) { - mbmi->angle_delta[0] = - av1_read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA; -#if CONFIG_INTRA_INTERP - p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - FRAME_COUNTS *counts = xd->counts; - mbmi->intra_filter = aom_read_symbol(r, ec_ctx->intra_filter_cdf[ctx], - INTRA_FILTERS, ACCT_STR); - if (counts) ++counts->intra_filter[ctx][mbmi->intra_filter]; - } -#endif // CONFIG_INTRA_INTERP - } - - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) { - mbmi->angle_delta[1] = - av1_read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA; + } else { + filter_intra_mode_info->use_filter_intra = 0; } } -#endif // CONFIG_EXT_INTRA -void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif - aom_reader *r) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r) { + MB_MODE_INFO *mbmi = xd->mi[0]; const int inter_block = is_inter_block(mbmi); -#if !CONFIG_TXK_SEL -#if CONFIG_VAR_TX - const TX_SIZE tx_size = inter_block ? 
mbmi->min_tx_size : mbmi->tx_size; -#else - const TX_SIZE tx_size = mbmi->tx_size; -#endif -#endif // !CONFIG_TXK_SEL FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#if !CONFIG_TXK_SEL - TX_TYPE *tx_type = &mbmi->tx_type; -#else - // only y plane's tx_type is transmitted - if (plane > 0) return; - (void)block; - TX_TYPE *tx_type = &mbmi->txk_type[(blk_row << 4) + blk_col]; -#endif -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif - - if (!FIXED_TX_TYPE) { -#if CONFIG_EXT_TX - const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; - if (get_ext_tx_types(tx_size, mbmi->sb_type, inter_block, - cm->reduced_tx_set_used) > 1 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - const TxSetType tx_set_type = get_ext_tx_set_type( - tx_size, mbmi->sb_type, inter_block, cm->reduced_tx_set_used); - const int eset = get_ext_tx_set(tx_size, mbmi->sb_type, inter_block, - cm->reduced_tx_set_used); - // eset == 0 should correspond to a set with only DCT_DCT and - // there is no need to read the tx_type - assert(eset != 0); - -#if !CONFIG_LGT_FROM_PRED - if (inter_block) { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; - } else if (ALLOW_INTRA_EXT_TX) { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; - } -#else - // only signal tx_type when lgt is not allowed or not selected - if (inter_block) { - if (LGT_FROM_PRED_INTER) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) { - mbmi->use_lgt = - aom_read(r, ec_ctx->inter_lgt_prob[square_tx_size], ACCT_STR); -#if CONFIG_ENTROPY_STATS - if (counts) 
++counts->inter_lgt[square_tx_size][mbmi->use_lgt]; -#endif // CONFIG_ENTROPY_STATS - } - if (!mbmi->use_lgt) { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) ++counts->inter_ext_tx[eset][square_tx_size][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } else { - *tx_type = DCT_DCT; // assign a dummy tx_type - } - } else { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) ++counts->inter_ext_tx[eset][square_tx_size][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } else if (ALLOW_INTRA_EXT_TX) { - if (LGT_FROM_PRED_INTRA) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) { - mbmi->use_lgt = - aom_read(r, ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode], - ACCT_STR); -#if CONFIG_ENTROPY_STATS - if (counts) - ++counts->intra_lgt[square_tx_size][mbmi->mode][mbmi->use_lgt]; -#endif // CONFIG_ENTROPY_STATS - } - if (!mbmi->use_lgt) { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) - ++counts - ->intra_ext_tx[eset][square_tx_size][mbmi->mode][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } else { - *tx_type = DCT_DCT; // assign a dummy tx_type - } - } else { - *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) - ++counts->intra_ext_tx[eset][square_tx_size][mbmi->mode][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } -#endif // CONFIG_LGT_FROM_PRED + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + TX_TYPE 
*tx_type = &mbmi->txk_type[txk_type_idx]; + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, inter_block, cm->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and + // there is no need to read the tx_type + assert(eset != 0); + + if (inter_block) { + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } else { - *tx_type = DCT_DCT; - } -#else // CONFIG_EXT_TX - - if (tx_size < TX_32X32 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { -#if CONFIG_ENTROPY_STATS - FRAME_COUNTS *counts = xd->counts; -#endif // CONFIG_ENTROPY_STATS - if (inter_block) { - *tx_type = av1_ext_tx_inv[aom_read_symbol( - r, ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES, ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) ++counts->inter_ext_tx[tx_size][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } else { - const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode]; - *tx_type = av1_ext_tx_inv[aom_read_symbol( - r, ec_ctx->intra_ext_tx_cdf[tx_size][tx_type_nom], TX_TYPES, - ACCT_STR)]; -#if CONFIG_ENTROPY_STATS - if (counts) ++counts->intra_ext_tx[tx_size][tx_type_nom][*tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } else { - *tx_type = DCT_DCT; + PREDICTION_MODE intra_dir; + if 
(mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } -#endif // CONFIG_EXT_TX + } else { + *tx_type = DCT_DCT; } -#if FIXED_TX_TYPE - assert(mbmi->tx_type == DCT_DCT); -#endif } -#if CONFIG_INTRABC static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, - nmv_context *ctx, nmv_context_counts *counts, - MvSubpelPrecision precision); + nmv_context *ctx, MvSubpelPrecision precision); static INLINE int is_mv_valid(const MV *mv); @@ -1143,267 +668,195 @@ static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, const int_mv *ref_mv, int mi_row, int mi_col, BLOCK_SIZE bsize, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - FRAME_COUNTS *counts = xd->counts; - nmv_context_counts *const dv_counts = counts ? &counts->dv : NULL; - read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, dv_counts, - MV_SUBPEL_NONE); + read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE); + // DV should not have sub-pel. 
+ assert((mv->as_mv.col & 7) == 0); + assert((mv->as_mv.row & 7) == 0); + mv->as_mv.col = (mv->as_mv.col >> 3) * 8; + mv->as_mv.row = (mv->as_mv.row >> 3) * 8; int valid = is_mv_valid(&mv->as_mv) && - is_dv_valid(mv->as_mv, &xd->tile, mi_row, mi_col, bsize); + av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params.mib_size_log2); return valid; } -#endif // CONFIG_INTRABC + +static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); + if (mbmi->use_intrabc) { + BLOCK_SIZE bsize = mbmi->sb_type; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->motion_mode = SIMPLE_TRANSLATION; + + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; + int_mv global_mvs[REF_FRAMES]; + + av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count, + xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col, + inter_mode_ctx); + + int_mv nearestmv, nearmv; + + av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, mi_row, + mi_col); + // Ref DV should not have sub-pel. 
+ int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; + dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; + dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8; + valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row, + mi_col, bsize, r); + if (!valid_dv) { + // Intra bc motion vectors are not valid - signal corrupt frame + aom_merge_corrupted_flag(&xd->corrupted, 1); + } + } +} static void read_intra_frame_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, int mi_row, int mi_col, aom_reader *r) { - MODE_INFO *const mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; - int i; - const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; + struct segmentation *const seg = &cm->seg; - // TODO(slavarnway): move x_mis, y_mis into xd ????? 
- const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw); - const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r); + if (seg->segid_preskip) + mbmi->segment_id = + read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, 0); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + if (!seg->segid_preskip) + mbmi->segment_id = + read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, mbmi->skip); + + read_cdef(cm, r, xd, mi_col, mi_row); + if (cm->delta_q_present_flag) { - xd->current_qindex = - xd->prev_qindex + + xd->current_qindex += read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res; /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); - xd->prev_qindex = xd->current_qindex; -#if CONFIG_EXT_DELTA_Q if (cm->delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { - mbmi->curr_delta_lf[lf_id] = xd->curr_delta_lf[lf_id] = - xd->prev_delta_lf[lf_id] + + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) * cm->delta_lf_res; - xd->prev_delta_lf[lf_id] = xd->curr_delta_lf[lf_id]; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } } else { - mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base = - xd->prev_delta_lf_from_base + + const int tmp_lvl = + xd->delta_lf_from_base + read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) * cm->delta_lf_res; - xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } -#else - const int current_delta_lf_from_base = - xd->prev_delta_lf_from_base + - read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base = - clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER); - xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base; -#endif // CONFIG_LOOPFILTER_LEVEL } -#endif } + mbmi->current_qindex = xd->current_qindex; + mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; -#if CONFIG_INTRABC - if (av1_allow_intrabc(bsize, cm)) { - mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); - if (mbmi->use_intrabc) { - mbmi->tx_size = read_tx_size(cm, xd, 1, !mbmi->skip, r); - mbmi->mode = mbmi->uv_mode = UV_DC_PRED; - mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); - - int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; - int_mv ref_mvs[MAX_MV_REF_CANDIDATES]; - - av1_find_mv_refs(cm, xd, mi, INTRA_FRAME, &xd->ref_mv_count[INTRA_FRAME], - xd->ref_mv_stack[INTRA_FRAME], NULL, ref_mvs, mi_row, - 
mi_col, NULL, NULL, inter_mode_ctx); - - int_mv nearestmv, nearmv; - av1_find_best_ref_mvs(0, ref_mvs, &nearestmv, &nearmv); - - int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; - if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col); - - xd->corrupted |= - !assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row, mi_col, bsize, r); -#if CONFIG_VAR_TX - // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_high_log2[0]; - int idx, idy; - for (idy = 0; idy < height; ++idy) - for (idx = 0; idx < width; ++idx) - mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX -#if CONFIG_EXT_TX && !CONFIG_TXK_SEL - av1_read_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - r); -#endif // CONFIG_EXT_TX && !CONFIG_TXK_SEL - return; - } - } -#endif // CONFIG_INTRABC - - mbmi->tx_size = read_tx_size(cm, xd, 0, 1, r); - -#if CONFIG_CB4X4 - (void)i; - mbmi->mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0)); -#else - switch (bsize) { - case BLOCK_4X4: - for (i = 0; i < 4; ++i) - mi->bmi[i].as_mode = read_intra_mode( - r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, i)); - mbmi->mode = mi->bmi[3].as_mode; - break; - case BLOCK_4X8: - mi->bmi[0].as_mode = mi->bmi[2].as_mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0)); - mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 1)); - break; - case BLOCK_8X4: - mi->bmi[0].as_mode = mi->bmi[1].as_mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0)); - mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 2)); - break; - default: - mbmi->mode = - read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, 
above_mi, left_mi, 0)); + xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + if (av1_allow_intrabc(cm)) { + read_intrabc_info(cm, xd, mi_row, mi_col, r); + if (is_intrabc_block(mbmi)) return; } -#endif -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { -#if CONFIG_CFL - xd->cfl->is_chroma_reference = 1; -#endif // CONFIG_CFL -#endif // CONFIG_CB4X4 - mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode); + mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi)); -#if CONFIG_CFL + const int use_angle_delta = av1_use_angle_delta(bsize); + mbmi->angle_delta[PLANE_TYPE_Y] = + (use_angle_delta && av1_is_directional_mode(mbmi->mode)) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { + xd->cfl.is_chroma_reference = 1; + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); - xd->cfl->store_y = 1; - } else { - xd->cfl->store_y = 0; } -#endif // CONFIG_CFL - -#if CONFIG_CB4X4 + mbmi->angle_delta[PLANE_TYPE_UV] = + (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))) + ? 
read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; } else { // Avoid decoding angle_info if there is is no chroma prediction mbmi->uv_mode = UV_DC_PRED; -#if CONFIG_CFL - xd->cfl->is_chroma_reference = 0; - xd->cfl->store_y = 1; -#endif + xd->cfl.is_chroma_reference = 0; } -#endif + xd->cfl.store_y = store_cfl_required(cm, xd); -#if CONFIG_EXT_INTRA - read_intra_angle_info(cm, xd, r); -#endif // CONFIG_EXT_INTRA - mbmi->palette_mode_info.palette_size[0] = 0; - mbmi->palette_mode_info.palette_size[1] = 0; if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - read_palette_mode_info(cm, xd, r); -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; - if (bsize >= BLOCK_8X8 || CONFIG_CB4X4) - read_filter_intra_mode_info(cm, xd, mi_row, mi_col, r); -#endif // CONFIG_FILTER_INTRA - -#if !CONFIG_TXK_SEL - av1_read_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - r); -#endif // !CONFIG_TXK_SEL + read_palette_mode_info(cm, xd, mi_row, mi_col, r); + + read_filter_intra_mode_info(cm, xd, r); } static int read_mv_component(aom_reader *r, nmv_component *mvcomp, -#if CONFIG_INTRABC || CONFIG_AMVR - int use_subpel, -#endif // CONFIG_INTRABC || CONFIG_AMVR - int usehp) { + int use_subpel, int usehp) { int mag, d, fr, hp; -#if CONFIG_NEW_MULTISYMBOL - const int sign = aom_read_bit(r, ACCT_STR); -#else - const int sign = aom_read(r, mvcomp->sign, ACCT_STR); -#endif + const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR); const int mv_class = - aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR); + aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR); const int class0 = mv_class == MV_CLASS_0; // Integer part if (class0) { -#if CONFIG_NEW_MULTISYMBOL d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR); -#else - d = aom_read(r, mvcomp->class0[0], ACCT_STR); -#endif mag = 0; } else { - int i; const int n = 
mv_class + CLASS0_BITS - 1; // number of bits d = 0; -#if CONFIG_NEW_MULTISYMBOL - for (i = 0; i < n; ++i) - d |= aom_read_symbol(r, mvcomp->bits_cdf[(i + 1) / 2], 2, ACCT_STR) << i; -#else - for (i = 0; i < n; ++i) d |= aom_read(r, mvcomp->bits[i], ACCT_STR) << i; -#endif + for (int i = 0; i < n; ++i) + d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i; mag = CLASS0_SIZE << (mv_class + 2); } -#if CONFIG_INTRABC || CONFIG_AMVR if (use_subpel) { -#endif // CONFIG_INTRABC || CONFIG_AMVR - // Fractional part + // Fractional part fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, MV_FP_SIZE, ACCT_STR); -// High precision part (if hp is not used, the default value of the hp is 1) -#if CONFIG_NEW_MULTISYMBOL + // High precision part (if hp is not used, the default value of the hp is 1) hp = usehp ? aom_read_symbol( r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, ACCT_STR) : 1; -#else - hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp, ACCT_STR) - : 1; -#endif -#if CONFIG_INTRABC || CONFIG_AMVR } else { fr = 3; hp = 1; } -#endif // CONFIG_INTRABC || CONFIG_AMVR // Result mag += ((d << 3) | (fr << 1) | hp) + 1; @@ -1411,29 +864,19 @@ static int read_mv_component(aom_reader *r, nmv_component *mvcomp, } static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, - nmv_context *ctx, nmv_context_counts *counts, - MvSubpelPrecision precision) { - MV_JOINT_TYPE joint_type; - MV diff = { 0, 0 }; - joint_type = - (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR); + nmv_context *ctx, MvSubpelPrecision precision) { + MV diff = kZeroMv; + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR); if (mv_joint_vertical(joint_type)) - diff.row = read_mv_component(r, &ctx->comps[0], -#if CONFIG_INTRABC || CONFIG_AMVR - precision > MV_SUBPEL_NONE, -#endif // CONFIG_INTRABC || CONFIG_AMVR + diff.row = read_mv_component(r, &ctx->comps[0], precision > 
MV_SUBPEL_NONE, precision > MV_SUBPEL_LOW_PRECISION); if (mv_joint_horizontal(joint_type)) - diff.col = read_mv_component(r, &ctx->comps[1], -#if CONFIG_INTRABC || CONFIG_AMVR - precision > MV_SUBPEL_NONE, -#endif // CONFIG_INTRABC || CONFIG_AMVR + diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE, precision > MV_SUBPEL_LOW_PRECISION); - av1_inc_mv(&diff, counts, precision); - mv->row = ref->row + diff.row; mv->col = ref->col + diff.col; } @@ -1441,138 +884,68 @@ static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, aom_reader *r) { - if (!is_comp_ref_allowed(xd->mi[0]->mbmi.sb_type)) return SINGLE_REFERENCE; + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE; if (cm->reference_mode == REFERENCE_MODE_SELECT) { - const int ctx = av1_get_reference_mode_context(cm, xd); -#if CONFIG_NEW_MULTISYMBOL + const int ctx = av1_get_reference_mode_context(xd); const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol( r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR); -#else - const REFERENCE_MODE mode = - (REFERENCE_MODE)aom_read(r, cm->fc->comp_inter_prob[ctx], ACCT_STR); -#endif - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->comp_inter[ctx][mode]; return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE } else { + assert(cm->reference_mode == SINGLE_REFERENCE); return cm->reference_mode; } } -#if CONFIG_NEW_MULTISYMBOL #define READ_REF_BIT(pname) \ - aom_read_symbol(r, av1_get_pred_cdf_##pname(cm, xd), 2, ACCT_STR) -#define READ_REF_BIT2(pname) \ aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR) -#else -#define READ_REF_BIT(pname) \ - aom_read(r, av1_get_pred_prob_##pname(cm, xd), ACCT_STR) -#define READ_REF_BIT2(pname) \ - aom_read(r, av1_get_pred_prob_##pname(cm, xd), ACCT_STR) -#endif -#if CONFIG_EXT_COMP_REFS -static COMP_REFERENCE_TYPE read_comp_reference_type(AV1_COMMON *cm, - const MACROBLOCKD *xd, 
+static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd, aom_reader *r) { const int ctx = av1_get_comp_reference_type_context(xd); -#if USE_UNI_COMP_REFS - COMP_REFERENCE_TYPE comp_ref_type; -#if CONFIG_VAR_REFS - if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) { - if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm)) { -#endif // CONFIG_VAR_REFS -#if CONFIG_NEW_MULTISYMBOL - (void)cm; - comp_ref_type = (COMP_REFERENCE_TYPE)aom_read_symbol( + const COMP_REFERENCE_TYPE comp_ref_type = + (COMP_REFERENCE_TYPE)aom_read_symbol( r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR); -#else - comp_ref_type = (COMP_REFERENCE_TYPE)aom_read( - r, cm->fc->comp_ref_type_prob[ctx], ACCT_STR); -#endif -#if CONFIG_VAR_REFS - } else { - comp_ref_type = BIDIR_COMP_REFERENCE; - } - } else { - comp_ref_type = UNIDIR_COMP_REFERENCE; - } -#endif // CONFIG_VAR_REFS -#else // !USE_UNI_COMP_REFS - // TODO(zoeliu): Temporarily turn off uni-directional comp refs - const COMP_REFERENCE_TYPE comp_ref_type = BIDIR_COMP_REFERENCE; -#endif // USE_UNI_COMP_REFS - FRAME_COUNTS *counts = xd->counts; - if (counts) ++counts->comp_ref_type[ctx][comp_ref_type]; return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE } -#endif // CONFIG_EXT_COMP_REFS + +static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame[2]) { + ref_frame[0] = LAST_FRAME + cm->ref_frame_idx_0; + ref_frame[1] = LAST_FRAME + cm->ref_frame_idx_1; +} // Read the referncence frame static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { - FRAME_COUNTS *counts = xd->counts; + if (xd->mi[0]->skip_mode) { + set_ref_frames_for_skip_mode(cm, ref_frame); + return; + } if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE_FRAME; + } else if 
(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = NONE_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); - // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding - if (mode == COMPOUND_REFERENCE) { -#if CONFIG_EXT_COMP_REFS - const COMP_REFERENCE_TYPE comp_ref_type = - read_comp_reference_type(cm, xd, r); -#if !USE_UNI_COMP_REFS - // TODO(zoeliu): Temporarily turn off uni-directional comp refs - assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // !USE_UNI_COMP_REFS + if (mode == COMPOUND_REFERENCE) { + const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r); if (comp_ref_type == UNIDIR_COMP_REFERENCE) { - const int ctx = av1_get_pred_context_uni_comp_ref_p(xd); - int bit; -#if CONFIG_VAR_REFS - if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm)) -#endif // CONFIG_VAR_REFS - bit = READ_REF_BIT2(uni_comp_ref_p); -#if CONFIG_VAR_REFS - else - bit = BWD_AND_ALT(cm); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->uni_comp_ref[ctx][0][bit]; - + const int bit = READ_REF_BIT(uni_comp_ref_p); if (bit) { ref_frame[0] = BWDREF_FRAME; ref_frame[1] = ALTREF_FRAME; } else { - const int ctx1 = av1_get_pred_context_uni_comp_ref_p1(xd); - int bit1; -#if CONFIG_VAR_REFS - if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm))) -#endif // CONFIG_VAR_REFS - bit1 = READ_REF_BIT2(uni_comp_ref_p1); -#if CONFIG_VAR_REFS - else - bit1 = L_AND_L3(cm) || L_AND_G(cm); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->uni_comp_ref[ctx1][1][bit1]; - + const int bit1 = READ_REF_BIT(uni_comp_ref_p1); if (bit1) { - const int ctx2 = av1_get_pred_context_uni_comp_ref_p2(xd); - int bit2; -#if CONFIG_VAR_REFS - if (L_AND_L3(cm) && L_AND_G(cm)) -#endif // CONFIG_VAR_REFS - bit2 = READ_REF_BIT2(uni_comp_ref_p2); -#if CONFIG_VAR_REFS - else - bit2 = L_AND_G(cm); -#endif // CONFIG_VAR_REFS - if 
(counts) ++counts->uni_comp_ref[ctx2][2][bit2]; - + const int bit2 = READ_REF_BIT(uni_comp_ref_p2); if (bit2) { ref_frame[0] = LAST_FRAME; ref_frame[1] = GOLDEN_FRAME; @@ -1590,202 +963,46 @@ static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, } assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // CONFIG_EXT_COMP_REFS -// Normative in decoder (for low delay) -#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS const int idx = 1; -#else // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS) -#if CONFIG_EXT_REFS - const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]]; -#else // !CONFIG_EXT_REFS - const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; -#endif // CONFIG_EXT_REFS -#endif // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS) - - const int ctx = av1_get_pred_context_comp_ref_p(cm, xd); -#if CONFIG_VAR_REFS - int bit; - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) - bit = READ_REF_BIT(comp_ref_p); - else - bit = L3_OR_G(cm); -#else // !CONFIG_VAR_REFS const int bit = READ_REF_BIT(comp_ref_p); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->comp_ref[ctx][0][bit]; - -#if CONFIG_EXT_REFS // Decode forward references. if (!bit) { - const int ctx1 = av1_get_pred_context_comp_ref_p1(cm, xd); -#if CONFIG_VAR_REFS - int bit1; - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) - bit1 = READ_REF_BIT(comp_ref_p1); - else - bit1 = LAST_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit1 = READ_REF_BIT(comp_ref_p1); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->comp_ref[ctx1][1][bit1]; - ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1]; + ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 
1 : 0]; } else { - const int ctx2 = av1_get_pred_context_comp_ref_p2(cm, xd); -#if CONFIG_VAR_REFS - int bit2; - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) - bit2 = READ_REF_BIT(comp_ref_p2); - else - bit2 = GOLDEN_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit2 = READ_REF_BIT(comp_ref_p2); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->comp_ref[ctx2][2][bit2]; ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2]; } // Decode backward references. - const int ctx_bwd = av1_get_pred_context_comp_bwdref_p(cm, xd); -#if CONFIG_VAR_REFS - int bit_bwd; - // Test need to explicitly code (BWD/ALT2) vs (ALT) branch node in tree - const int bit_bwd_uncertain = BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm); - if (bit_bwd_uncertain) - bit_bwd = READ_REF_BIT(comp_bwdref_p); - else - bit_bwd = ALTREF_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit_bwd = READ_REF_BIT(comp_bwdref_p); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->comp_bwdref[ctx_bwd][0][bit_bwd]; if (!bit_bwd) { - const int ctx1_bwd = av1_get_pred_context_comp_bwdref_p1(cm, xd); -#if CONFIG_VAR_REFS - int bit1_bwd; - if (BWD_AND_ALT2(cm)) - bit1_bwd = READ_REF_BIT(comp_bwdref_p1); - else - bit1_bwd = ALTREF2_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->comp_bwdref[ctx1_bwd][1][bit1_bwd]; ref_frame[idx] = cm->comp_bwd_ref[bit1_bwd]; } else { ref_frame[idx] = cm->comp_bwd_ref[2]; } -#else // !CONFIG_EXT_REFS - ref_frame[!idx] = cm->comp_var_ref[bit]; - ref_frame[idx] = cm->comp_fixed_ref; -#endif // CONFIG_EXT_REFS } else if (mode == SINGLE_REFERENCE) { -#if CONFIG_EXT_REFS - const int ctx0 = av1_get_pred_context_single_ref_p1(xd); -#if CONFIG_VAR_REFS - int bit0; - // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node - // in tree - if ((L_OR_L2(cm) || L3_OR_G(cm)) && - (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm))) - bit0 = 
READ_REF_BIT(single_ref_p1); - else - bit0 = (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm)); -#else // !CONFIG_VAR_REFS const int bit0 = READ_REF_BIT(single_ref_p1); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx0][0][bit0]; - if (bit0) { - const int ctx1 = av1_get_pred_context_single_ref_p2(xd); -#if CONFIG_VAR_REFS - int bit1; - // Test need to explicitly code (BWD/ALT2) vs (ALT) branch node in tree - const int bit1_uncertain = BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm); - if (bit1_uncertain) - bit1 = READ_REF_BIT(single_ref_p2); - else - bit1 = ALTREF_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit1 = READ_REF_BIT(single_ref_p2); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx1][1][bit1]; if (!bit1) { - const int ctx5 = av1_get_pred_context_single_ref_p6(xd); -#if CONFIG_VAR_REFS - int bit5; - if (BWD_AND_ALT2(cm)) - bit5 = READ_REF_BIT(single_ref_p6); - else - bit5 = ALTREF2_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit5 = READ_REF_BIT(single_ref_p6); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx5][5][bit5]; ref_frame[0] = bit5 ? 
ALTREF2_FRAME : BWDREF_FRAME; } else { ref_frame[0] = ALTREF_FRAME; } } else { - const int ctx2 = av1_get_pred_context_single_ref_p3(xd); -#if CONFIG_VAR_REFS - int bit2; - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) - bit2 = READ_REF_BIT(single_ref_p3); - else - bit2 = L3_OR_G(cm); -#else // !CONFIG_VAR_REFS const int bit2 = READ_REF_BIT(single_ref_p3); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx2][2][bit2]; if (bit2) { - const int ctx4 = av1_get_pred_context_single_ref_p5(xd); -#if CONFIG_VAR_REFS - int bit4; - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) - bit4 = READ_REF_BIT(single_ref_p5); - else - bit4 = GOLDEN_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit4 = READ_REF_BIT(single_ref_p5); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx4][4][bit4]; ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME; } else { - const int ctx3 = av1_get_pred_context_single_ref_p4(xd); -#if CONFIG_VAR_REFS - int bit3; - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) - bit3 = READ_REF_BIT(single_ref_p4); - else - bit3 = LAST2_IS_VALID(cm); -#else // !CONFIG_VAR_REFS const int bit3 = READ_REF_BIT(single_ref_p4); -#endif // CONFIG_VAR_REFS - if (counts) ++counts->single_ref[ctx3][3][bit3]; ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME; } } -#else // !CONFIG_EXT_REFS - const int ctx0 = av1_get_pred_context_single_ref_p1(xd); - const int bit0 = READ_REF_BIT(single_ref_p1); - if (counts) ++counts->single_ref[ctx0][0][bit0]; - - if (bit0) { - const int ctx1 = av1_get_pred_context_single_ref_p2(xd); - const int bit1 = READ_REF_BIT(single_ref_p2); - if (counts) ++counts->single_ref[ctx1][1][bit1]; - ref_frame[0] = bit1 ? 
ALTREF_FRAME : GOLDEN_FRAME; - } else { - ref_frame[0] = LAST_FRAME; - } -#endif // CONFIG_EXT_REFS ref_frame[1] = NONE_FRAME; } else { @@ -1798,7 +1015,6 @@ static INLINE void read_mb_interp_filter(AV1_COMMON *const cm, MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi, aom_reader *r) { - FRAME_COUNTS *counts = xd->counts; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!av1_is_interp_needed(xd)) { @@ -1809,120 +1025,68 @@ static INLINE void read_mb_interp_filter(AV1_COMMON *const cm, if (cm->interp_filter != SWITCHABLE) { mbmi->interp_filters = av1_broadcast_interp_filter(cm->interp_filter); } else { -#if CONFIG_DUAL_FILTER InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }; for (int dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - ref0_filter[dir] = - (InterpFilter)aom_read_symbol(r, ec_ctx->switchable_interp_cdf[ctx], - SWITCHABLE_FILTERS, ACCT_STR); - if (counts) ++counts->switchable_interp[ctx][ref0_filter[dir]]; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + ref0_filter[dir] = (InterpFilter)aom_read_symbol( + r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); + if (cm->seq_params.enable_dual_filter == 0) { + ref0_filter[1] = ref0_filter[0]; + break; } } // The index system works as: (0, 1) -> (vertical, horizontal) filter types mbmi->interp_filters = av1_make_interp_filters(ref0_filter[0], ref0_filter[1]); -#else // CONFIG_DUAL_FILTER - const int ctx = av1_get_pred_context_switchable_interp(xd); - InterpFilter filter = (InterpFilter)aom_read_symbol( - r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); - mbmi->interp_filters = av1_broadcast_interp_filter(filter); - if (counts) ++counts->switchable_interp[ctx][filter]; -#endif // CONFIG_DUAL_FILTER } } static void read_intra_block_mode_info(AV1_COMMON 
*const cm, const int mi_row, const int mi_col, MACROBLOCKD *const xd, - MODE_INFO *mi, aom_reader *r) { - MB_MODE_INFO *const mbmi = &mi->mbmi; - const BLOCK_SIZE bsize = mi->mbmi.sb_type; - int i; + MB_MODE_INFO *const mbmi, + aom_reader *r) { + const BLOCK_SIZE bsize = mbmi->sb_type; + const int use_angle_delta = av1_use_angle_delta(bsize); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#if CONFIG_CB4X4 - (void)i; mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); -#else - switch (bsize) { - case BLOCK_4X4: - for (i = 0; i < 4; ++i) - mi->bmi[i].as_mode = read_intra_mode(r, ec_ctx->y_mode_cdf[0]); - mbmi->mode = mi->bmi[3].as_mode; - break; - case BLOCK_4X8: - mi->bmi[0].as_mode = mi->bmi[2].as_mode = - read_intra_mode(r, ec_ctx->y_mode_cdf[0]); - mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode(r, ec_ctx->y_mode_cdf[0]); - break; - case BLOCK_8X4: - mi->bmi[0].as_mode = mi->bmi[1].as_mode = - read_intra_mode(r, ec_ctx->y_mode_cdf[0]); - mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode(r, ec_ctx->y_mode_cdf[0]); - break; - default: - mbmi->mode = - read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); - } -#endif -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode); -#else - mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode); - (void)mi_row; - (void)mi_col; -#endif - -#if CONFIG_CFL + mbmi->angle_delta[PLANE_TYPE_Y] = + use_angle_delta && av1_is_directional_mode(mbmi->mode) + ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + const int has_chroma = + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); + xd->cfl.is_chroma_reference = has_chroma; + if (!cm->seq_params.monochrome && has_chroma) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { mbmi->cfl_alpha_idx = read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); - xd->cfl->store_y = 1; - } else { - xd->cfl->store_y = 0; } -#endif // CONFIG_CFL - -#if CONFIG_CB4X4 + mbmi->angle_delta[PLANE_TYPE_UV] = + use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)) + ? read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; } else { // Avoid decoding angle_info if there is is no chroma prediction mbmi->uv_mode = UV_DC_PRED; -#if CONFIG_CFL - xd->cfl->is_chroma_reference = 0; - xd->cfl->store_y = 1; -#endif } -#endif - - // Explicitly ignore cm here to avoid a compile warning if none of - // ext-intra, palette and filter-intra are enabled. 
- (void)cm; + xd->cfl.store_y = store_cfl_required(cm, xd); -#if CONFIG_EXT_INTRA - read_intra_angle_info(cm, xd, r); -#endif // CONFIG_EXT_INTRA mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - read_palette_mode_info(cm, xd, r); -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; - if (bsize >= BLOCK_8X8 || CONFIG_CB4X4) - read_filter_intra_mode_info(cm, xd, mi_row, mi_col, r); -#endif // CONFIG_FILTER_INTRA + read_palette_mode_info(cm, xd, mi_row, mi_col, r); + + read_filter_intra_mode_info(cm, xd, r); } static INLINE int is_mv_valid(const MV *mv) { @@ -1932,188 +1096,43 @@ static INLINE int is_mv_valid(const MV *mv) { static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, PREDICTION_MODE mode, - MV_REFERENCE_FRAME ref_frame[2], int block, - int_mv mv[2], int_mv ref_mv[2], - int_mv nearest_mv[2], int_mv near_mv[2], int mi_row, - int mi_col, int is_compound, int allow_hp, - aom_reader *r) { - int i; - int ret = 1; + MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], + int_mv ref_mv[2], int_mv nearest_mv[2], + int_mv near_mv[2], int mi_row, int mi_col, + int is_compound, int allow_hp, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; -#if CONFIG_CB4X4 - int_mv *pred_mv = mbmi->pred_mv; - (void)block; -#else - int_mv *pred_mv = - (bsize >= BLOCK_8X8) ? 
mbmi->pred_mv : xd->mi[0]->bmi[block].pred_mv; -#endif // CONFIG_CB4X4 - (void)ref_frame; - (void)cm; - (void)mi_row; - (void)mi_col; - (void)bsize; -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level) { + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE bsize = mbmi->sb_type; + if (cm->cur_frame_force_integer_mv) { allow_hp = MV_SUBPEL_NONE; } -#endif switch (mode) { case NEWMV: { - FRAME_COUNTS *counts = xd->counts; - for (i = 0; i < 1 + is_compound; ++i) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i, - mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[i].as_mv); - - pred_mv[i].as_int = ref_mv[i].as_int; - } + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); break; } case NEARESTMV: { mv[0].as_int = nearest_mv[0].as_int; - if (is_compound) mv[1].as_int = nearest_mv[1].as_int; - - pred_mv[0].as_int = nearest_mv[0].as_int; - if (is_compound) pred_mv[1].as_int = nearest_mv[1].as_int; break; } case NEARMV: { mv[0].as_int = near_mv[0].as_int; - if (is_compound) mv[1].as_int = near_mv[1].as_int; - - pred_mv[0].as_int = near_mv[0].as_int; - if (is_compound) pred_mv[1].as_int = near_mv[1].as_int; - break; - } - case ZEROMV: { -#if CONFIG_GLOBAL_MOTION - mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, block -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; - if (is_compound) - mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, block -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - mv[0].as_int = 0; - if 
(is_compound) mv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - - pred_mv[0].as_int = mv[0].as_int; - if (is_compound) pred_mv[1].as_int = mv[1].as_int; - break; - } -#if CONFIG_COMPOUND_SINGLEREF - case SR_NEAREST_NEARMV: { - assert(!is_compound); - mv[0].as_int = nearest_mv[0].as_int; - mv[1].as_int = near_mv[0].as_int; - break; - } - /* - case SR_NEAREST_NEWMV: { - assert(!is_compound); - mv[0].as_int = nearest_mv[0].as_int; - - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[1].as_mv); - break; - }*/ - case SR_NEAR_NEWMV: { - assert(!is_compound); - mv[0].as_int = near_mv[0].as_int; - - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? 
&counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[1].as_mv); - break; - } - case SR_ZERO_NEWMV: { - assert(!is_compound); -#if CONFIG_GLOBAL_MOTION - mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, block) - .as_int; -#else - mv[0].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[1].as_mv); break; } - case SR_NEW_NEWMV: { - assert(!is_compound); - - FRAME_COUNTS *counts = xd->counts; - for (i = 0; i < 2; ++i) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], 0, - mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? 
&counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[i].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[i].as_mv); - } + case GLOBALMV: { + mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + cm->allow_high_precision_mv, bsize, mi_col, + mi_row, cm->cur_frame_force_integer_mv) + .as_int; break; } -#endif // CONFIG_COMPOUND_SINGLEREF case NEW_NEWMV: { - FRAME_COUNTS *counts = xd->counts; assert(is_compound); - for (i = 0; i < 2; ++i) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i, - mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp); - ret = ret && is_mv_valid(&mv[i].as_mv); + for (int i = 0; i < 2; ++i) { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp); } break; } @@ -2130,984 +1149,440 @@ static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, break; } case NEW_NEARESTMV: { - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? 
&counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); assert(is_compound); - ret = ret && is_mv_valid(&mv[0].as_mv); mv[1].as_int = nearest_mv[1].as_int; break; } case NEAREST_NEWMV: { - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; + nmv_context *const nmvc = &ec_ctx->nmvc; mv[0].as_int = nearest_mv[0].as_int; - read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp); + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); assert(is_compound); - ret = ret && is_mv_valid(&mv[1].as_mv); break; } case NEAR_NEWMV: { - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? &counts->mv[nmv_ctx] : NULL; + nmv_context *const nmvc = &ec_ctx->nmvc; mv[0].as_int = near_mv[0].as_int; - read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp); + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); assert(is_compound); - - ret = ret && is_mv_valid(&mv[1].as_mv); break; } case NEW_NEARMV: { - FRAME_COUNTS *counts = xd->counts; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type], - xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx]; - nmv_context_counts *const mv_counts = - counts ? 
&counts->mv[nmv_ctx] : NULL; - read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp); + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); assert(is_compound); - ret = ret && is_mv_valid(&mv[0].as_mv); mv[1].as_int = near_mv[1].as_int; break; } - case ZERO_ZEROMV: { + case GLOBAL_GLOBALMV: { assert(is_compound); -#if CONFIG_GLOBAL_MOTION - mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, block -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; - mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, block -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - mv[0].as_int = 0; - mv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION + mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + cm->allow_high_precision_mv, bsize, mi_col, + mi_row, cm->cur_frame_force_integer_mv) + .as_int; + mv[1].as_int = + gm_get_motion_vector(&cm->global_motion[ref_frame[1]], + cm->allow_high_precision_mv, bsize, mi_col, + mi_row, cm->cur_frame_force_integer_mv) + .as_int; break; } default: { return 0; } } + + int ret = is_mv_valid(&mv[0].as_mv); + if (is_compound) { + ret = ret && is_mv_valid(&mv[1].as_mv); + } return ret; } static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, int segment_id, aom_reader *r) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; - } else { - const int ctx = av1_get_intra_inter_context(xd); -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const int is_inter = - aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); -#else - const int is_inter = aom_read(r, cm->fc->intra_inter_prob[ctx], ACCT_STR); -#endif - FRAME_COUNTS 
*counts = xd->counts; - if (counts) ++counts->intra_inter[ctx][is_inter]; - return is_inter; + const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (frame < LAST_FRAME) return 0; + return frame != INTRA_FRAME; } -} - -#if CONFIG_COMPOUND_SINGLEREF -static int read_is_inter_singleref_comp_mode(AV1_COMMON *const cm, - MACROBLOCKD *const xd, - int segment_id, aom_reader *r) { - if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) return 0; - - const int ctx = av1_get_inter_mode_context(xd); - const int is_singleref_comp_mode = - aom_read(r, cm->fc->comp_inter_mode_prob[ctx], ACCT_STR); - FRAME_COUNTS *counts = xd->counts; - - if (counts) ++counts->comp_inter_mode[ctx][is_singleref_comp_mode]; - return is_singleref_comp_mode; -} -#endif // CONFIG_COMPOUND_SINGLEREF - -static void fpm_sync(void *const data, int mi_row) { - AV1Decoder *const pbi = (AV1Decoder *)data; - av1_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << pbi->common.mib_size_log2); + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + return 1; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int is_inter = + aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); + return is_inter; } #if DEC_MISMATCH_DEBUG -static void dec_dump_logs(AV1_COMMON *cm, MODE_INFO *const mi, int mi_row, - int mi_col, - int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES], - int16_t mode_ctx) { +static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, int16_t mode_ctx) { int_mv mv[2] = { { 0 } }; - int ref; - MB_MODE_INFO *const mbmi = &mi->mbmi; - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) mv[ref].as_mv = mbmi->mv[ref].as_mv; const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; int16_t zeromv_ctx = -1; int16_t refmv_ctx = -1; if (mbmi->mode != NEWMV) { - if (mode_ctx & (1 << 
ALL_ZERO_FLAG_OFFSET)) assert(mbmi->mode == ZEROMV); - zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - if (mbmi->mode != ZEROMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; - if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; - if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; - } } - int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#define FRAME_TO_CHECK 1 +#define FRAME_TO_CHECK 11 if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) { printf( "=== DECODER ===: " - "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " - "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, " - "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n", - cm->current_video_frame, mi_row, mi_col, mbmi->mode, mbmi->sb_type, - cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, - mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], - mbmi->motion_mode, inter_mode_ctx[ref_frame_type], mode_ctx, newmv_ctx, - zeromv_ctx, refmv_ctx); + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode, + mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, + refmv_ctx, mbmi->tx_size); } } #endif // DEC_MISMATCH_DEBUG static void read_inter_block_mode_info(AV1Decoder *const pbi, MACROBLOCKD *const xd, - MODE_INFO *const mi, -#if CONFIG_SUPERTX - int mi_row, int mi_col, aom_reader *r, - int supertx_enabled) { -#else - int mi_row, int mi_col, aom_reader *r) { 
-#endif // CONFIG_MOTION_VAR && CONFIG_SUPERTX + MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, aom_reader *r) { AV1_COMMON *const cm = &pbi->common; - MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - const int unify_bsize = CONFIG_CB4X4; int_mv nearestmv[2], nearmv[2]; - int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int ref, is_compound; -#if CONFIG_COMPOUND_SINGLEREF - int is_singleref_comp_mode = 0; -#endif // CONFIG_COMPOUND_SINGLEREF + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; - int16_t compound_inter_mode_ctx[MODE_CTX_REF_FRAMES]; - int16_t mode_ctx = 0; -#if CONFIG_WARPED_MOTION int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#if WARPED_MOTION_SORT_SAMPLES - int pts_mv[SAMPLES_ARRAY_SIZE]; -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - assert(NELEMENTS(mode_2_counter) == MB_MODE_COUNT); - mbmi->uv_mode = UV_DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; - memset(ref_mvs, 0, sizeof(ref_mvs)); + av1_collect_neighbors_ref_counts(xd); read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); - is_compound = has_second_ref(mbmi); - -#if CONFIG_EXT_COMP_REFS -#if !USE_UNI_COMP_REFS - // NOTE: uni-directional comp refs disabled - if (is_compound) - assert(mbmi->ref_frame[0] < BWDREF_FRAME && - mbmi->ref_frame[1] >= BWDREF_FRAME); -#endif // !USE_UNI_COMP_REFS -#endif // CONFIG_EXT_COMP_REFS - -#if CONFIG_COMPOUND_SINGLEREF - if (!is_compound) - is_singleref_comp_mode = - read_is_inter_singleref_comp_mode(cm, xd, mbmi->segment_id, r); -#endif // CONFIG_COMPOUND_SINGLEREF - - for (ref = 0; ref < 1 + is_compound; ++ref) { - MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - - av1_find_mv_refs(cm, xd, mi, frame, &xd->ref_mv_count[frame], - 
xd->ref_mv_stack[frame], compound_inter_mode_ctx, - ref_mvs[frame], mi_row, mi_col, fpm_sync, (void *)pbi, - inter_mode_ctx); - } + const int is_compound = has_second_ref(mbmi); - if (is_compound) { - MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); - av1_find_mv_refs(cm, xd, mi, ref_frame, &xd->ref_mv_count[ref_frame], - xd->ref_mv_stack[ref_frame], compound_inter_mode_ctx, - ref_mvs[ref_frame], mi_row, mi_col, fpm_sync, (void *)pbi, - inter_mode_ctx); + MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); + int_mv global_mvs[REF_FRAMES]; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack, + ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx); - if (xd->ref_mv_count[ref_frame] < 2) { - MV_REFERENCE_FRAME rf[2]; - int_mv zeromv[2]; - av1_set_ref_frame(rf, ref_frame); -#if CONFIG_GLOBAL_MOTION - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; - zeromv[1].as_int = - (rf[1] != NONE_FRAME) - ? 
gm_get_motion_vector(&cm->global_motion[rf[1]], - cm->allow_high_precision_mv, bsize, mi_col, - mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int - : 0; -#else - zeromv[0].as_int = zeromv[1].as_int = 0; -#endif - for (ref = 0; ref < 2; ++ref) { - if (rf[ref] == NONE_FRAME) continue; -#if CONFIG_AMVR - lower_mv_precision(&ref_mvs[rf[ref]][0].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); - lower_mv_precision(&ref_mvs[rf[ref]][1].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&ref_mvs[rf[ref]][0].as_mv, allow_hp); - lower_mv_precision(&ref_mvs[rf[ref]][1].as_mv, allow_hp); -#endif - if (ref_mvs[rf[ref]][0].as_int != zeromv[ref].as_int || - ref_mvs[rf[ref]][1].as_int != zeromv[ref].as_int) - inter_mode_ctx[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET); - } - } - } - -#if CONFIG_COMPOUND_SINGLEREF - if (is_compound || is_singleref_comp_mode) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_compound) -#endif // CONFIG_COMPOUND_SINGLEREF - mode_ctx = compound_inter_mode_ctx[mbmi->ref_frame[0]]; - else - mode_ctx = - av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, bsize, -1); + int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); mbmi->ref_mv_idx = 0; -#if CONFIG_SEGMENT_ZEROMV - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || - segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_ZEROMV)) { -#else - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { -#endif - mbmi->mode = ZEROMV; - if (bsize < BLOCK_8X8 && !unify_bsize) { - aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segment feature on small blocks"); - return; - } + if (mbmi->skip_mode) { + assert(is_compound); + mbmi->mode = NEAREST_NEARESTMV; } else { - if (bsize >= BLOCK_8X8 || unify_bsize) { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) { + 
mbmi->mode = GLOBALMV; + } else { if (is_compound) - mbmi->mode = read_inter_compound_mode(cm, xd, r, mode_ctx); -#if CONFIG_COMPOUND_SINGLEREF - else if (is_singleref_comp_mode) - mbmi->mode = read_inter_singleref_comp_mode(xd, r, mode_ctx); -#endif // CONFIG_COMPOUND_SINGLEREF + mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx); else - mbmi->mode = read_inter_mode(ec_ctx, xd, r, mode_ctx); + mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx); if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - mbmi->mode == SR_NEW_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF have_nearmv_in_inter_mode(mbmi->mode)) read_drl_idx(ec_ctx, xd, mbmi, r); } } - if ((bsize < BLOCK_8X8 && !unify_bsize) || - (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV)) { - for (ref = 0; ref < 1 + is_compound; ++ref) { -#if CONFIG_AMVR - av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]], - &nearestmv[ref], &nearmv[ref], - cm->cur_frame_mv_precision_level); -#else - av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]], - &nearestmv[ref], &nearmv[ref]); -#endif - } + if (is_compound != is_inter_compound_mode(mbmi->mode)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Prediction mode %d invalid with ref frame %d %d", + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); } -#if CONFIG_COMPOUND_SINGLEREF - if ((is_compound || is_singleref_comp_mode) && - (bsize >= BLOCK_8X8 || unify_bsize) && mbmi->mode != ZERO_ZEROMV) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_compound && (bsize >= BLOCK_8X8 || unify_bsize) && - mbmi->mode != ZERO_ZEROMV) -#endif // CONFIG_COMPOUND_SINGLEREF - { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - - if (xd->ref_mv_count[ref_frame_type] > 0) { - if (mbmi->mode == NEAREST_NEARESTMV) { - nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv; - nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv; -#if CONFIG_AMVR - lower_mv_precision(&nearestmv[0].as_mv, allow_hp, - 
cm->cur_frame_mv_precision_level); - lower_mv_precision(&nearestmv[1].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&nearestmv[0].as_mv, allow_hp); - lower_mv_precision(&nearestmv[1].as_mv, allow_hp); -#endif - } else if (mbmi->mode == NEAREST_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - || mbmi->mode == SR_NEAREST_NEARMV -// || mbmi->mode == SR_NEAREST_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv; - -#if CONFIG_AMVR - lower_mv_precision(&nearestmv[0].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&nearestmv[0].as_mv, allow_hp); -#endif - } else if (mbmi->mode == NEW_NEARESTMV) { - nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv; -#if CONFIG_AMVR - lower_mv_precision(&nearestmv[1].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&nearestmv[1].as_mv, allow_hp); -#endif - } - } - - if (xd->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = 1 + mbmi->ref_mv_idx; -#if CONFIG_COMPOUND_SINGLEREF - if (is_compound) { -#endif // CONFIG_COMPOUND_SINGLEREF - if (compound_ref0_mode(mbmi->mode) == NEARMV) { - nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; -#if CONFIG_AMVR - lower_mv_precision(&nearmv[0].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&nearmv[0].as_mv, allow_hp); -#endif - } + if (!is_compound && mbmi->mode != GLOBALMV) { + av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0], + &nearmv[0], cm->cur_frame_force_integer_mv); + } - if (compound_ref1_mode(mbmi->mode) == NEARMV) { - nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; -#if CONFIG_AMVR - lower_mv_precision(&nearmv[1].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&nearmv[1].as_mv, allow_hp); -#endif - } -#if CONFIG_COMPOUND_SINGLEREF - } else { - assert(is_singleref_comp_mode); - if 
(compound_ref0_mode(mbmi->mode) == NEARMV || - compound_ref1_mode(mbmi->mode) == NEARMV) { - nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - lower_mv_precision(&nearmv[0].as_mv, allow_hp); - } - } -#endif // CONFIG_COMPOUND_SINGLEREF - } + if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) { + int ref_mv_idx = mbmi->ref_mv_idx + 1; + nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv; + nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv; + nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + lower_mv_precision(&nearestmv[0].as_mv, allow_hp, + cm->cur_frame_force_integer_mv); + lower_mv_precision(&nearestmv[1].as_mv, allow_hp, + cm->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[0].as_mv, allow_hp, + cm->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[1].as_mv, allow_hp, + cm->cur_frame_force_integer_mv); } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) { int_mv cur_mv = xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv; nearmv[0] = cur_mv; } -#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION - read_mb_interp_filter(cm, xd, mbmi, r); -#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION - - if (bsize < BLOCK_8X8 && !unify_bsize) { - const int num_4x4_w = 1 << xd->bmode_blocks_wl; - const int num_4x4_h = 1 << xd->bmode_blocks_hl; - int idx, idy; - PREDICTION_MODE b_mode; - int_mv nearest_sub8x8[2], near_sub8x8[2]; - int_mv ref_mv[2][2]; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - int_mv block[2]; - const int j = idy * 2 + idx; - int_mv ref_mv_s8[2]; - if (!is_compound) - mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, - bsize, j); - if (is_compound) - b_mode = read_inter_compound_mode(cm, xd, r, mode_ctx); - else - b_mode = read_inter_mode(ec_ctx, xd, r, mode_ctx); - - if (b_mode != ZEROMV && b_mode != ZERO_ZEROMV) { - CANDIDATE_MV 
ref_mv_stack[2][MAX_REF_MV_STACK_SIZE]; - uint8_t ref_mv_count[2]; - for (ref = 0; ref < 1 + is_compound; ++ref) { - int_mv mv_ref_list[MAX_MV_REF_CANDIDATES]; - av1_update_mv_context(cm, xd, mi, mbmi->ref_frame[ref], mv_ref_list, - j, mi_row, mi_col, NULL); - av1_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col, - ref_mv_stack[ref], &ref_mv_count[ref], - mv_ref_list, &nearest_sub8x8[ref], - &near_sub8x8[ref]); - if (have_newmv_in_inter_mode(b_mode)) { - mv_ref_list[0].as_int = nearest_sub8x8[ref].as_int; - mv_ref_list[1].as_int = near_sub8x8[ref].as_int; -#if CONFIG_AMVR - av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref], - &ref_mv[1][ref], - cm->cur_frame_mv_precision_level); -#else - av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref], - &ref_mv[1][ref]); -#endif - } - } - } - - for (ref = 0; ref < 1 + is_compound && b_mode != ZEROMV; ++ref) { - ref_mv_s8[ref] = nearest_sub8x8[ref]; -#if CONFIG_AMVR - lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp); -#endif - } - (void)ref_mv_s8; - - if (!assign_mv(cm, xd, b_mode, mbmi->ref_frame, j, block, ref_mv[0], - nearest_sub8x8, near_sub8x8, mi_row, mi_col, is_compound, - allow_hp, r)) { - aom_merge_corrupted_flag(&xd->corrupted, 1); - break; - }; - - mi->bmi[j].as_mv[0].as_int = block[0].as_int; - mi->bmi[j].as_mode = b_mode; - if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int; - - if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; - if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j]; - } - } + int_mv ref_mv[2]; + ref_mv[0] = nearestmv[0]; + ref_mv[1] = nearestmv[1]; - mbmi->pred_mv[0].as_int = mi->bmi[3].pred_mv[0].as_int; - mbmi->pred_mv[1].as_int = mi->bmi[3].pred_mv[1].as_int; - mi->mbmi.mode = b_mode; - - mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + if (is_compound) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: 
NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here? + if (compound_ref0_mode(mbmi->mode) == NEWMV) + ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + + if (compound_ref1_mode(mbmi->mode) == NEWMV) + ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; } else { - int_mv ref_mv[2]; - ref_mv[0] = nearestmv[0]; - ref_mv[1] = nearestmv[1]; - - if (is_compound) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: NEAR_NEWMV and NEW_NEARMV modes use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) - ref_mv_idx = 1 + mbmi->ref_mv_idx; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - if (xd->ref_mv_count[ref_frame_type] > 1) { - ref_mv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&ref_mv[0].as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - } - nearestmv[0] = ref_mv[0]; - } - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - if (xd->ref_mv_count[ref_frame_type] > 1) { - ref_mv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - clamp_mv_ref(&ref_mv[1].as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - } - nearestmv[1] = ref_mv[1]; - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_singleref_comp_mode) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: SR_NEAR_NEWMV use 1 + mbmi->ref_mv_idx (like NEARMV) - // instead of mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; - - if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == 
NEWMV) { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - if (xd->ref_mv_count[ref_frame_type] > 1) { - ref_mv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&ref_mv[0].as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - } - // TODO(zoeliu): To further investigate why this would not cause a - // mismatch for the mode of SR_NEAREST_NEWMV. - nearestmv[0] = ref_mv[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF - } else { - if (mbmi->mode == NEWMV) { - for (ref = 0; ref < 1 + is_compound; ++ref) { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - if (xd->ref_mv_count[ref_frame_type] > 1) { - ref_mv[ref] = - (ref == 0) - ? xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv - : xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] - .comp_mv; - clamp_mv_ref(&ref_mv[ref].as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - } - nearestmv[ref] = ref_mv[ref]; - } - } + if (mbmi->mode == NEWMV) { + if (xd->ref_mv_count[ref_frame] > 1) + ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv; } + } + if (mbmi->skip_mode) { + assert(mbmi->mode == NEAREST_NEARESTMV); + mbmi->mv[0].as_int = nearestmv[0].as_int; + mbmi->mv[1].as_int = nearestmv[1].as_int; + } else { int mv_corrupted_flag = - !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, 0, mbmi->mv, ref_mv, + !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r); aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag); } -#if CONFIG_INTERINTRA mbmi->use_wedge_interintra = 0; - if (cm->reference_mode != COMPOUND_REFERENCE && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif - cm->allow_interintra_compound && is_interintra_allowed(mbmi)) { + if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode && + is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; -#if CONFIG_NEW_MULTISYMBOL const int interintra = 
aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR); -#else - const int interintra = - aom_read(r, cm->fc->interintra_prob[bsize_group], ACCT_STR); -#endif - if (xd->counts) xd->counts->interintra[bsize_group][interintra]++; assert(mbmi->ref_frame[1] == NONE_FRAME); if (interintra) { const INTERINTRA_MODE interintra_mode = - read_interintra_mode(cm, xd, r, bsize_group); + read_interintra_mode(xd, r, bsize_group); mbmi->ref_frame[1] = INTRA_FRAME; mbmi->interintra_mode = interintra_mode; -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; - mbmi->angle_delta[1] = 0; -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; if (is_interintra_wedge_used(bsize)) { -#if CONFIG_NEW_MULTISYMBOL mbmi->use_wedge_interintra = aom_read_symbol( r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR); -#else - mbmi->use_wedge_interintra = - aom_read(r, cm->fc->wedge_interintra_prob[bsize], ACCT_STR); -#endif - if (xd->counts) - xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; if (mbmi->use_wedge_interintra) { mbmi->interintra_wedge_index = - aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR); + aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR); mbmi->interintra_wedge_sign = 0; } } } } -#endif // CONFIG_INTERINTRA -#if CONFIG_WARPED_MOTION - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; xd->block_refs[ref] = ref_buf; } -#endif -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION 
mbmi->motion_mode = SIMPLE_TRANSLATION; -#if CONFIG_WARPED_MOTION - if (mbmi->sb_type >= BLOCK_8X8 && !has_second_ref(mbmi)) -#if WARPED_MOTION_SORT_SAMPLES - mbmi->num_proj_ref[0] = - findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv); -#else + if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode && + !has_second_ref(mbmi)) mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); -#endif -#if CONFIG_SUPERTX - if (!supertx_enabled) { -#endif // CONFIG_SUPERTX - if (mbmi->ref_frame[1] != INTRA_FRAME) - mbmi->motion_mode = read_motion_mode(cm, xd, mi, r); + if (mbmi->ref_frame[1] != INTRA_FRAME) + mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - read_ncobmc_mode(xd, mi, mbmi->ncobmc_mode, r); -#endif + // init + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; -#if CONFIG_COMPOUND_SINGLEREF - if (is_singleref_comp_mode) assert(mbmi->motion_mode == SIMPLE_TRANSLATION); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_WARPED_MOTION - if (mbmi->motion_mode == WARPED_CAUSAL) { - mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; - -#if WARPED_MOTION_SORT_SAMPLES - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts, - pts_inref, mbmi->num_proj_ref[0]); -#endif // WARPED_MOTION_SORT_SAMPLES - - if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, - mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Invalid Warped Model"); - } + if (has_second_ref(mbmi) && !mbmi->skip_mode) { + // Read idx to indicate current compound inter prediction mode group + const int masked_compound_used = is_any_masked_compound_used(bsize) && + 
cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + mbmi->comp_group_idx = aom_read_symbol( + r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR); } -#endif // CONFIG_WARPED_MOTION -#if CONFIG_SUPERTX - } -#endif // CONFIG_SUPERTX -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - - mbmi->interinter_compound_type = COMPOUND_AVERAGE; - if ( -#if CONFIG_COMPOUND_SINGLEREF - is_inter_anyref_comp_mode(mbmi->mode) -#else // !CONFIG_COMPOUND_SINGLEREF - cm->reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - && mbmi->motion_mode == SIMPLE_TRANSLATION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - ) { - if (is_any_masked_compound_used(bsize)) { -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - if (cm->allow_masked_compound) { -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - mbmi->interinter_compound_type = - aom_read_bit(r, ACCT_STR) ? 
COMPOUND_AVERAGE : COMPOUND_SEG; - else -#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type = aom_read_symbol( - r, ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES, ACCT_STR); -#if CONFIG_WEDGE - if (mbmi->interinter_compound_type == COMPOUND_WEDGE) { - assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); - mbmi->wedge_index = - aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR); - mbmi->wedge_sign = aom_read_bit(r, ACCT_STR); - } -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - if (mbmi->interinter_compound_type == COMPOUND_SEG) { - mbmi->mask_type = aom_read_literal(r, MAX_SEG_MASK_BITS, ACCT_STR); - } -#endif // CONFIG_COMPOUND_SEGMENT + + if (mbmi->comp_group_idx == 0) { + if (cm->seq_params.enable_jnt_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + mbmi->compound_idx = aom_read_symbol( + r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); + } else { + // Distance-weighted compound is disabled, so always use average + mbmi->compound_idx = 1; } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE } else { - mbmi->interinter_compound_type = COMPOUND_AVERAGE; + assert(cm->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + + // compound_diffwtd, wedge + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + mbmi->interinter_comp.type = + 1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize], + COMPOUND_TYPES - 1, ACCT_STR); + else + mbmi->interinter_comp.type = COMPOUND_DIFFWTD; + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + mbmi->interinter_comp.wedge_index = + aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR); + mbmi->interinter_comp.wedge_sign = aom_read_bit(r, ACCT_STR); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + mbmi->interinter_comp.mask_type = + 
aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR); + } } - if (xd->counts) - xd->counts->compound_interinter[bsize][mbmi->interinter_compound_type]++; } -#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION read_mb_interp_filter(cm, xd, mbmi, r); -#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION + + if (mbmi->motion_mode == WARPED_CAUSAL) { + mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; + mbmi->wm_params[0].invalid = 0; + + if (mbmi->num_proj_ref[0] > 1) + mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref[0], bsize); + + if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params[0], mi_row, mi_col)) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected warped model from aomenc\n"); +#endif + mbmi->wm_params[0].invalid = 1; + } + } + + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.store_y = store_cfl_required(cm, xd); #if DEC_MISMATCH_DEBUG - dec_dump_logs(cm, mi, mi_row, mi_col, inter_mode_ctx, mode_ctx); + dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx); #endif // DEC_MISMATCH_DEBUG } static void read_inter_frame_mode_info(AV1Decoder *const pbi, - MACROBLOCKD *const xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif // CONFIG_SUPERTX - int mi_row, int mi_col, aom_reader *r) { + MACROBLOCKD *const xd, int mi_row, + int mi_col, aom_reader *r) { AV1_COMMON *const cm = &pbi->common; - MODE_INFO *const mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int inter_block = 1; -#if CONFIG_VAR_TX - BLOCK_SIZE bsize = mbmi->sb_type; -#endif // CONFIG_VAR_TX mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; - mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX + mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, 
mi_col, 1, r); + + mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r); + + if (mbmi->skip_mode) + mbmi->skip = 1; + else mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r); + + read_cdef(cm, r, xd, mi_col, mi_row); + if (cm->delta_q_present_flag) { - xd->current_qindex = - xd->prev_qindex + + xd->current_qindex += read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res; /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); - xd->prev_qindex = xd->current_qindex; -#if CONFIG_EXT_DELTA_Q if (cm->delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { - mbmi->curr_delta_lf[lf_id] = xd->curr_delta_lf[lf_id] = - xd->prev_delta_lf[lf_id] + + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) * cm->delta_lf_res; - xd->prev_delta_lf[lf_id] = xd->curr_delta_lf[lf_id]; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } } else { - mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base = - xd->prev_delta_lf_from_base + + const int tmp_lvl = + xd->delta_lf_from_base + read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) * cm->delta_lf_res; - xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } -#else - const int current_delta_lf_from_base = - xd->prev_delta_lf_from_base + - read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base = - clamp(current_delta_lf_from_base, 
0, MAX_LOOP_FILTER); - xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base; -#endif // CONFIG_LOOPFILTER_LEVEL } -#endif } -#if CONFIG_SUPERTX - if (!supertx_enabled) { -#endif // CONFIG_SUPERTX + if (!mbmi->skip_mode) inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); - - if (cm->tx_mode == TX_MODE_SELECT && -#if CONFIG_CB4X4 - bsize > BLOCK_4X4 && -#else - bsize >= BLOCK_8X8 && -#endif - !mbmi->skip && inter_block && !xd->lossless[mbmi->segment_id]) { - const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_wide_log2[0]; - int idx, idy; - int init_depth = - (height != width) ? 
RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; - - mbmi->min_tx_size = TX_SIZES_ALL; - for (idy = 0; idy < height; idy += bh) - for (idx = 0; idx < width; idx += bw) - read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size, init_depth, - idy, idx, r); -#if CONFIG_RECT_TX_EXT - if (is_quarter_tx_allowed(xd, mbmi, inter_block) && - mbmi->tx_size == max_tx_size) { - int quarter_tx; - - if (quarter_txsize_lookup[bsize] != max_tx_size) { -#if CONFIG_NEW_MULTISYMBOL - quarter_tx = - aom_read_symbol(r, cm->fc->quarter_tx_size_cdf, 2, ACCT_STR); -#else - quarter_tx = aom_read(r, cm->fc->quarter_tx_size_prob, ACCT_STR); - if (xd->counts) ++xd->counts->quarter_tx_size[quarter_tx]; -#endif - } else { - quarter_tx = 1; - } - if (quarter_tx) { - mbmi->tx_size = quarter_txsize_lookup[bsize]; - for (idy = 0; idy < tx_size_high_unit[max_tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[max_tx_size] / 2; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); - } - } -#endif - } else { - mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r); - - if (inter_block) { - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_high_log2[0]; - int idx, idy; - for (idy = 0; idy < height; ++idy) - for (idx = 0; idx < width; ++idx) - mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size; - } - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, mbmi->skip, xd); - } -#else - mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r); -#endif // CONFIG_VAR_TX -#if CONFIG_SUPERTX - } -#if CONFIG_VAR_TX - else if (inter_block) { - const int width = num_4x4_blocks_wide_lookup[bsize]; - const int height = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - xd->mi[0]->mbmi.tx_size = xd->supertx_size; - for (idy = 0; idy < height; ++idy) - for (idx = 0; idx < width; ++idx) - 
xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] = xd->supertx_size; - } -#endif // CONFIG_VAR_TX -#endif // CONFIG_SUPERTX + mbmi->current_qindex = xd->current_qindex; + + xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); if (inter_block) - read_inter_block_mode_info(pbi, xd, -#if CONFIG_SUPERTX - mi, mi_row, mi_col, r, supertx_enabled); -#else - mi, mi_row, mi_col, r); -#endif // CONFIG_MOTION_VAR && CONFIG_SUPERTX + read_inter_block_mode_info(pbi, xd, mbmi, mi_row, mi_col, r); else - read_intra_block_mode_info(cm, mi_row, mi_col, xd, mi, r); - -#if !CONFIG_TXK_SEL - av1_read_tx_type(cm, xd, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - r); -#endif // !CONFIG_TXK_SEL + read_intra_block_mode_info(cm, mi_row, mi_col, xd, mbmi, r); } -static void av1_intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, - int mi_col, int x_mis, int y_mis) { -#if CONFIG_TMV +static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col, + int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); - MV_REF *frame_mvs = cm->cur_frame->mvs + - ((mi_row & 0xfffe) >> 1) * frame_mvs_stride + - ((mi_col & 0xfffe) >> 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); y_mis = ROUND_POWER_OF_TWO(y_mis, 1); -#else - const int frame_mvs_stride = cm->mi_cols; - MV_REF *frame_mvs = cm->cur_frame->mvs + - (mi_row & 0xfffe) * frame_mvs_stride + (mi_col & 0xfffe); - x_mis = AOMMAX(x_mis, 2); - y_mis = AOMMAX(y_mis, 2); -#endif // CONFIG_TMV - int w, h; - - for (h = 0; h < y_mis; h++) { - MV_REF *const frame_mv = frame_mvs + h * frame_mvs_stride; - for (w = 0; w < x_mis; w++) { - MV_REF *const mv = frame_mv + w; - mv->ref_frame[0] = NONE_FRAME; - mv->ref_frame[1] = NONE_FRAME; + + for (int h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (int w = 0; 
w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv++; } + frame_mvs += frame_mvs_stride; } } -void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif // CONFIG_SUPERTX - int mi_row, int mi_col, aom_reader *r, int x_mis, - int y_mis) { +void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row, + int mi_col, aom_reader *r, int x_mis, int y_mis) { AV1_COMMON *const cm = &pbi->common; - MODE_INFO *const mi = xd->mi[0]; -#if CONFIG_INTRABC - mi->mbmi.use_intrabc = 0; -#endif // CONFIG_INTRABC + MB_MODE_INFO *const mi = xd->mi[0]; + mi->use_intrabc = 0; if (frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r); - av1_intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis); + intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis); } else { - read_inter_frame_mode_info(pbi, xd, -#if CONFIG_SUPERTX - supertx_enabled, -#endif // CONFIG_SUPERTX - mi_row, mi_col, r); + read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r); av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } } diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h index 162cf3254..6243bb168 100644 --- a/third_party/aom/av1/decoder/decodemv.h +++ b/third_party/aom/av1/decoder/decodemv.h @@ -21,9 +21,6 @@ extern "C" { #endif void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif int mi_row, int mi_col, aom_reader *r, int x_mis, int y_mis); @@ -32,14 +29,7 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, } // extern "C" #endif -void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif - aom_reader *r); +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r); 
#endif // AV1_DECODER_DECODEMV_H_ diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c index cd82d5b53..2e91d27d3 100644 --- a/third_party/aom/av1/decoder/decoder.c +++ b/third_party/aom/av1/decoder/decoder.c @@ -13,9 +13,9 @@ #include #include -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_mem/aom_mem.h" #include "aom_ports/system_state.h" @@ -33,12 +33,8 @@ #include "av1/decoder/decodeframe.h" #include "av1/decoder/decoder.h" -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#include "av1/common/ncobmc_kernels.h" -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#if !CONFIG_PVQ #include "av1/decoder/detokenize.h" -#endif +#include "av1/decoder/obu.h" static void initialize_dec(void) { static volatile int init_done = 0; @@ -53,23 +49,24 @@ static void initialize_dec(void) { } } -static void av1_dec_setup_mi(AV1_COMMON *cm) { - cm->mi = cm->mip + cm->mi_stride + 1; - cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; +static void dec_setup_mi(AV1_COMMON *cm) { + cm->mi = cm->mip; + cm->mi_grid_visible = cm->mi_grid_base; memset(cm->mi_grid_base, 0, - cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); + cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base)); } static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) { cm->mip = aom_calloc(mi_size, sizeof(*cm->mip)); if (!cm->mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *)); if (!cm->mi_grid_base) return 1; return 0; } -static void av1_dec_free_mi(AV1_COMMON *cm) { +static void dec_free_mi(AV1_COMMON *cm) { aom_free(cm->mip); cm->mip = NULL; aom_free(cm->mi_grid_base); @@ -108,28 +105,20 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { memset(&cm->next_ref_frame_map, -1, 
sizeof(cm->next_ref_frame_map)); cm->current_video_frame = 0; - pbi->ready_for_new_data = 1; + pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; cm->bit_depth = AOM_BITS_8; cm->dequant_bit_depth = AOM_BITS_8; cm->alloc_mi = av1_dec_alloc_mi; - cm->free_mi = av1_dec_free_mi; - cm->setup_mi = av1_dec_setup_mi; + cm->free_mi = dec_free_mi; + cm->setup_mi = dec_setup_mi; av1_loop_filter_init(cm); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - get_default_ncobmc_kernels(cm); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - -#if CONFIG_AOM_QM - aom_qm_init(cm); -#endif -#if CONFIG_LOOP_RESTORATION + av1_qm_init(cm); av1_loop_restoration_precal(); -#endif // CONFIG_LOOP_RESTORATION #if CONFIG_ACCOUNTING pbi->acct_enabled = 1; aom_accounting_init(&pbi->accounting); @@ -142,33 +131,83 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { return pbi; } +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { + if (tile_mt_info != NULL) { +#if CONFIG_MULTITHREAD + if (tile_mt_info->job_mutex != NULL) { + pthread_mutex_destroy(tile_mt_info->job_mutex); + aom_free(tile_mt_info->job_mutex); + } +#endif + aom_free(tile_mt_info->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tile_mt_info); + } +} + void av1_decoder_remove(AV1Decoder *pbi) { int i; if (!pbi) return; + // Free the tile list output buffer. + if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output); + pbi->tile_list_output = NULL; + aom_get_worker_interface()->end(&pbi->lf_worker); aom_free(pbi->lf_worker.data1); - aom_free(pbi->tile_data); - for (i = 0; i < pbi->num_tile_workers; ++i) { + + if (pbi->thread_data) { + for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + const int use_highbd = pbi->common.use_highbitdepth ? 
1 : 0; + av1_free_mc_tmp_buf(thread_data->td, use_highbd); + aom_free(thread_data->td); + } + aom_free(pbi->thread_data); + } + + for (i = 0; i < pbi->num_workers; ++i) { AVxWorker *const worker = &pbi->tile_workers[i]; aom_get_worker_interface()->end(worker); } - aom_free(pbi->tile_worker_data); - aom_free(pbi->tile_worker_info); + aom_free(pbi->tile_data); aom_free(pbi->tile_workers); - if (pbi->num_tile_workers > 0) { + if (pbi->num_workers > 0) { av1_loop_filter_dealloc(&pbi->lf_row_sync); + av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers); + av1_dealloc_dec_jobs(&pbi->tile_mt_info); } #if CONFIG_ACCOUNTING aom_accounting_clear(&pbi->accounting); #endif + const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0; + av1_free_mc_tmp_buf(&pbi->td, use_highbd); aom_free(pbi); } +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row, + int mi_col, aom_reader *r, BLOCK_SIZE bsize, + palette_visitor_fn_t visit) { + if (!is_inter_block(xd->mi[0])) { + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common)); + ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) { + if (xd->mi[0]->palette_mode_info.palette_size[plane]) + visit(xd, plane, r); + } else { + assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); + } + } + } +} + static int equal_dimensions(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { return a->y_height == b->y_height && a->y_width == b->y_width && @@ -178,6 +217,7 @@ static int equal_dimensions(const YV12_BUFFER_CONFIG *a, aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); if (cfg == NULL) { @@ -188,13 +228,25 @@ aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, 
aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else - aom_yv12_copy_frame(cfg, sd); + aom_yv12_copy_frame(cfg, sd, num_planes); return cm->error.error_code; } +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *ref_buf = NULL; // Get the destination reference buffer. @@ -205,60 +257,132 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, return AOM_CODEC_ERROR; } - if (!equal_dimensions(ref_buf, sd)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, - "Incorrect buffer dimensions"); + if (!use_external_ref) { + if (!equal_dimensions(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer. + aom_yv12_copy_frame(sd, ref_buf, num_planes); + } } else { - // Overwrite the reference frame buffer. - aom_yv12_copy_frame(sd, ref_buf); + if (!equal_dimensions_and_border(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer pointers. + // Once we no longer need the external reference buffer, these pointers + // are restored. 
+ ref_buf->store_buf_adr[0] = ref_buf->y_buffer; + ref_buf->store_buf_adr[1] = ref_buf->u_buffer; + ref_buf->store_buf_adr[2] = ref_buf->v_buffer; + ref_buf->y_buffer = sd->y_buffer; + ref_buf->u_buffer = sd->u_buffer; + ref_buf->v_buffer = sd->v_buffer; + ref_buf->use_external_refernce_buffers = 1; + } } return cm->error.error_code; } -/* If any buffer updating is signaled it should be done here. */ -static void swap_frame_buffers(AV1Decoder *pbi) { +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + +/* If any buffer updating is signaled it should be done here. + Consumes a reference to cm->new_fb_idx. +*/ +static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int ref_index = 0, mask; AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - // Current thread releases the holding of reference frame. - decrease_ref_count(old_idx, frame_bufs, pool); - - // Release the reference frame holding in the reference map for the decoding - // of the next frame. - if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool); - cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; - ++ref_index; - } + if (frame_decoded) { + lock_buffer_pool(pool); - // Current thread releases the holding of reference frame. 
- for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); - cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; - } + // In ext-tile decoding, the camera frame header is only decoded once. So, + // we don't release the references here. + if (!pbi->camera_frame_header_ready) { + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); - unlock_buffer_pool(pool); - pbi->hold_ref_buf = 0; - cm->frame_to_show = get_frame_new_buffer(cm); + // Release the reference frame holding in the reference map for the + // decoding of the next frame. + if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool); + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + ++ref_index; + } - // TODO(zoeliu): To fix the ref frame buffer update for the scenario of - // cm->frame_parellel_decode == 1 - if (!cm->frame_parallel_decode || !cm->show_frame) { + // Current thread releases the holding of reference frame. 
+ const int check_on_show_existing_frame = + !cm->show_existing_frame || cm->reset_decoder_state; + for (; ref_index < REF_FRAMES && check_on_show_existing_frame; + ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + } + } + + YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm); + + if (cm->show_existing_frame || cm->show_frame) { + if (pbi->output_all_layers) { + // Append this frame to the output queue + if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { + // We can't store the new frame anywhere, so drop it and return an + // error + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + } else { + pbi->output_frames[pbi->num_output_frames] = cur_frame; + pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx; + pbi->num_output_frames++; + } + } else { + // Replace any existing output frame + assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); + if (pbi->num_output_frames > 0) { + decrease_ref_count((int)pbi->output_frame_index[0], frame_bufs, pool); + } + pbi->output_frames[0] = cur_frame; + pbi->output_frame_index[0] = cm->new_fb_idx; + pbi->num_output_frames = 1; + } + } else { + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + } + + unlock_buffer_pool(pool); + } else { + // Nothing was decoded, so just drop this frame buffer lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); unlock_buffer_pool(pool); } - // Invalidate these references until the next frame starts. - for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { - cm->frame_refs[ref_index].idx = INVALID_IDX; - cm->frame_refs[ref_index].buf = NULL; + if (!pbi->camera_frame_header_ready) { + pbi->hold_ref_buf = 0; + + // Invalidate these references until the next frame starts. 
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { + cm->frame_refs[ref_index].idx = INVALID_IDX; + cm->frame_refs[ref_index].buf = NULL; + } } } @@ -268,7 +392,6 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, BufferPool *volatile const pool = cm->buffer_pool; RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; const uint8_t *source = *psource; - int retcode = 0; cm->error.error_code = AOM_CODEC_OK; if (size == 0) { @@ -286,18 +409,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } } - pbi->ready_for_new_data = 0; - // Find a free buffer for the new frame, releasing the reference previously // held. - // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. - if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) - pool->release_fb_cb(pool->cb_priv, - &frame_bufs[cm->new_fb_idx].raw_frame_buffer); - // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR; @@ -305,31 +419,20 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, // Assign a MV array to the frame buffer. cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - pbi->hold_ref_buf = 0; - if (cm->frame_parallel_decode) { - AVxWorker *const worker = pbi->frame_worker_owner; - av1_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. 
- pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - av1_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0; + + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; cm->error.setjmp = 0; - pbi->ready_for_new_data = 1; // Synchronize all threads immediately as a subsequent decode call may // cause a resize invalidating some allocations. winterface->sync(&pbi->lf_worker); - for (i = 0; i < pbi->num_tile_workers; ++i) { + for (i = 0; i < pbi->num_workers; ++i) { winterface->sync(&pbi->tile_workers[i]); } @@ -349,7 +452,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } // Current thread releases the holding of reference frame. - for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int check_on_show_existing_frame = + !cm->show_existing_frame || cm->reset_decoder_state; + for (; ref_index < REF_FRAMES && check_on_show_existing_frame; + ++ref_index) { const int old_idx = cm->ref_frame_map[ref_index]; decrease_ref_count(old_idx, frame_bufs, pool); } @@ -365,160 +471,72 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, cm->error.setjmp = 1; -#if !CONFIG_OBU - av1_decode_frame_headers_and_setup(pbi, source, source + size, psource); - if (!cm->show_existing_frame) { - av1_decode_tg_tiles_and_wrapup(pbi, source, source + size, psource, 0, - cm->tile_rows * cm->tile_cols - 1, 1); - } -#else - av1_decode_frame_from_obus(pbi, source, source + size, psource); + int frame_decoded = + aom_decode_frame_from_obus(pbi, source, source + size, psource); + + if (cm->error.error_code != AOM_CODEC_OK) return 1; + +#if TXCOEFF_TIMER + cm->cum_txcoeff_timer += cm->txcoeff_timer; + fprintf(stderr, + "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n", + 
cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer); + cm->txcoeff_timer = 0; + cm->txb_count = 0; #endif - swap_frame_buffers(pbi); + // Note: At this point, this function holds a reference to cm->new_fb_idx + // in the buffer pool. This reference is consumed by swap_frame_buffers(). + swap_frame_buffers(pbi, frame_decoded); + + if (frame_decoded) { + pbi->decoding_first_frame = 0; + } -#if CONFIG_EXT_TILE - // For now, we only extend the frame borders when the whole frame is decoded. - // Later, if needed, extend the border for the decoded tile on the frame - // border. - if (pbi->dec_tile_row == -1 && pbi->dec_tile_col == -1) -#endif // CONFIG_EXT_TILE - // TODO(debargha): Fix encoder side mv range, so that we can use the - // inner border extension. As of now use the larger extension. - // aom_extend_frame_inner_borders(cm->frame_to_show); - aom_extend_frame_borders(cm->frame_to_show); + if (cm->error.error_code != AOM_CODEC_OK) return 1; aom_clear_system_state(); if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; -#if CONFIG_EXT_REFS - // NOTE: It is not supposed to ref to any frame not used as reference - if (cm->is_reference_frame) -#endif // CONFIG_EXT_REFS - cm->prev_frame = cm->cur_frame; - - if (cm->seg.enabled && !cm->frame_parallel_decode) - av1_swap_current_and_last_seg_map(cm); - } - - // Update progress in frame parallel decode. - if (cm->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. 
- AVxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - av1_frameworker_lock_stats(worker); - - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - av1_frameworker_signal_stats(worker); - av1_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - cm->last_tile_cols = cm->tile_cols; - cm->last_tile_rows = cm->tile_rows; - if (cm->show_frame) { - cm->current_video_frame++; + if (cm->seg.enabled) { + if (cm->prev_frame && (cm->mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } } } + // Update progress in frame parallel decode. + cm->last_width = cm->width; + cm->last_height = cm->height; + cm->last_tile_cols = cm->tile_cols; + cm->last_tile_rows = cm->tile_rows; cm->error.setjmp = 0; - return retcode; -} - -int av1_get_raw_frame(AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd) { - AV1_COMMON *const cm = &pbi->common; - int ret = -1; - if (pbi->ready_for_new_data == 1) return ret; - pbi->ready_for_new_data = 1; + return 0; +} - /* no raw frame to show!!! */ - if (!cm->show_frame) return ret; +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params) { + RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs; - *sd = *cm->frame_to_show; - ret = 0; + if (index >= pbi->num_output_frames) return -1; + *sd = pbi->output_frames[index]; + *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params; aom_clear_system_state(); - return ret; + return 0; } +// Get the highest-spatial-layer output +// TODO(david.barker): What should this do? 
int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { - AV1_COMMON *const cm = &pbi->common; - - if (!cm->show_frame || !cm->frame_to_show) return -1; + if (pbi->num_output_frames == 0) return -1; - *frame = *cm->frame_to_show; + *frame = *pbi->output_frames[pbi->num_output_frames - 1]; return 0; } - -aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count, - int *index_size, - aom_decrypt_cb decrypt_cb, - void *decrypt_state) { - // A chunk ending with a byte matching 0xc0 is an invalid chunk unless - // it is a super frame index. If the last byte of real video compression - // data is 0xc0 the encoder must add a 0 byte. If we have the marker but - // not the associated matching marker byte at the front of the index we have - // an invalid bitstream and need to return an error. - - uint8_t marker; - size_t frame_sz_sum = 0; - - assert(data_sz); - marker = read_marker(decrypt_cb, decrypt_state, data); - *count = 0; - - if ((marker & 0xe0) == 0xc0) { - const uint32_t frames = (marker & 0x7) + 1; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * (frames - 1); - *index_size = (int)index_sz; - - // This chunk is marked as having a superframe index but doesn't have - // enough data for it, thus it's an invalid superframe index. - if (data_sz < index_sz) return AOM_CODEC_CORRUPT_FRAME; - - { - const uint8_t marker2 = - read_marker(decrypt_cb, decrypt_state, data + index_sz - 1); - - // This chunk is marked as having a superframe index but doesn't have - // the matching marker byte at the front of the index therefore it's an - // invalid chunk. - if (marker != marker2) return AOM_CODEC_CORRUPT_FRAME; - } - - { - // Found a valid superframe index. - uint32_t i, j; - const uint8_t *x = &data[1]; - - // Frames has a maximum of 8 and mag has a maximum of 4. 
- uint8_t clear_buffer[28]; - assert(sizeof(clear_buffer) >= (frames - 1) * mag); - if (decrypt_cb) { - decrypt_cb(decrypt_state, x, clear_buffer, (frames - 1) * mag); - x = clear_buffer; - } - - for (i = 0; i < frames - 1; ++i) { - uint32_t this_sz = 0; - - for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); - this_sz += 1; - sizes[i] = this_sz; - frame_sz_sum += this_sz; - } - sizes[i] = (uint32_t)(data_sz - index_sz - frame_sz_sum); - *count = frames; - } - } - return AOM_CODEC_OK; -} diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h index 20129b669..42fcc1256 100644 --- a/third_party/aom/av1/decoder/decoder.h +++ b/third_party/aom/av1/decoder/decoder.h @@ -12,7 +12,7 @@ #ifndef AV1_DECODER_DECODER_H_ #define AV1_DECODER_DECODER_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom/aom_codec.h" #include "aom_dsp/bitreader.h" @@ -29,73 +29,61 @@ #include "av1/decoder/inspection.h" #endif -#if CONFIG_PVQ -#include "aom_dsp/entdec.h" -#include "av1/decoder/decint.h" -#include "av1/encoder/encodemb.h" -#endif - #ifdef __cplusplus extern "C" { #endif -// TODO(hkuang): combine this with TileWorkerData. -typedef struct TileData { - AV1_COMMON *cm; - aom_reader bit_reader; - DECLARE_ALIGNED(16, MACROBLOCKD, xd); +typedef struct ThreadData { + aom_reader *bit_reader; + DECLARE_ALIGNED(32, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]); -#if CONFIG_PVQ - /* forward transformed predicted image, a reference for PVQ */ - DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); -#endif -#if CONFIG_CFL - CFL_CTX cfl; -#endif - DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); - DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); -#if CONFIG_MRC_TX - DECLARE_ALIGNED(16, uint8_t, mrc_mask[MAX_SB_SQUARE]); -#endif // CONFIG_MRC_TX -} TileData; - -typedef struct TileWorkerData { - struct AV1Decoder *pbi; + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]); + CB_BUFFER cb_buffer_base; + uint8_t *mc_buf[2]; + int32_t mc_buf_size; +} ThreadData; + +typedef struct TileDataDec { + TileInfo tile_info; aom_reader bit_reader; - FRAME_COUNTS counts; - DECLARE_ALIGNED(16, MACROBLOCKD, xd); - /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]); -#if CONFIG_PVQ - /* forward transformed predicted image, a reference for PVQ */ - DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); -#endif -#if CONFIG_CFL - CFL_CTX cfl; -#endif - FRAME_CONTEXT tctx; - DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); -#if CONFIG_MRC_TX - DECLARE_ALIGNED(16, uint8_t, mrc_mask[MAX_SB_SQUARE]); -#endif // CONFIG_MRC_TX - struct aom_internal_error_info error_info; -} TileWorkerData; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); +} TileDataDec; typedef struct TileBufferDec { const uint8_t *data; size_t size; - const uint8_t *raw_data_end; // The end of the raw tile buffer in the - // bit stream. 
- int col; // only used with multi-threaded decoding } TileBufferDec; -typedef struct AV1Decoder { - DECLARE_ALIGNED(16, MACROBLOCKD, mb); +typedef struct DataBuffer { + const uint8_t *data; + size_t size; +} DataBuffer; + +typedef struct EXTERNAL_REFERENCES { + YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES]; + int num; +} EXTERNAL_REFERENCES; + +typedef struct TileJobsDec { + TileBufferDec *tile_buffer; + TileDataDec *tile_data; +} TileJobsDec; - DECLARE_ALIGNED(16, AV1_COMMON, common); +typedef struct AV1DecTileMTData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + TileJobsDec *job_queue; + int jobs_enqueued; + int jobs_dequeued; + int alloc_tile_rows; + int alloc_tile_cols; +} AV1DecTileMT; + +typedef struct AV1Decoder { + DECLARE_ALIGNED(32, MACROBLOCKD, mb); - int ready_for_new_data; + DECLARE_ALIGNED(32, AV1_COMMON, common); int refresh_frame_flags; @@ -105,20 +93,38 @@ typedef struct AV1Decoder { AVxWorker *frame_worker_owner; // frame_worker that owns this pbi. AVxWorker lf_worker; + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; AVxWorker *tile_workers; - TileWorkerData *tile_worker_data; - TileInfo *tile_worker_info; - int num_tile_workers; - - TileData *tile_data; + int num_workers; + DecWorkerData *thread_data; + ThreadData td; + TileDataDec *tile_data; int allocated_tiles; TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; - - AV1LfSync lf_row_sync; - - aom_decrypt_cb decrypt_cb; - void *decrypt_state; + AV1DecTileMT tile_mt_info; + + // Each time the decoder is called, we expect to receive a full temporal unit. + // This can contain up to one shown frame per spatial layer in the current + // operating point (note that some layers may be entirely omitted). + // If the 'output_all_layers' option is true, we save all of these shown + // frames so that they can be returned to the application. If the + // 'output_all_layers' option is false, then we only output one image per + // temporal unit. 
+ // + // Note: The saved buffers are released at the start of the next time the + // application calls aom_codec_decode(). + int output_all_layers; + YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS]; + size_t output_frame_index[MAX_NUM_SPATIAL_LAYERS]; // Buffer pool indices + size_t num_output_frames; // How many frames are queued up so far? + + // In order to properly support random-access decoding, we need + // to behave slightly differently for the very first frame we decode. + // So we track whether this is the first frame or not. + int decoding_first_frame; int allow_lowbitdepth; int max_threads; @@ -127,29 +133,47 @@ typedef struct AV1Decoder { int hold_ref_buf; // hold the reference buffer. int tile_size_bytes; -#if CONFIG_EXT_TILE int tile_col_size_bytes; int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding -#endif // CONFIG_EXT_TILE #if CONFIG_ACCOUNTING int acct_enabled; Accounting accounting; #endif - size_t uncomp_hdr_size; // Size of the uncompressed header - size_t first_partition_size; // Size of the compressed header - int tg_size; // Number of tiles in the current tilegroup - int tg_start; // First tile in the current tilegroup + size_t uncomp_hdr_size; // Size of the uncompressed header + int tg_size; // Number of tiles in the current tilegroup + int tg_start; // First tile in the current tilegroup int tg_size_bit_offset; + int sequence_header_ready; #if CONFIG_INSPECTION aom_inspect_cb inspect_cb; void *inspect_ctx; #endif + int operating_point; + int current_operating_point; + int seen_frame_header; + + // State if the camera frame header is already decoded while + // large_scale_tile = 1. 
+ int camera_frame_header_ready; + size_t frame_header_size; + DataBuffer obu_size_hdr; + int output_frame_width_in_tiles_minus_1; + int output_frame_height_in_tiles_minus_1; + int tile_count_minus_1; + uint32_t coded_tile_data_size; + unsigned int ext_tile_debug; // for ext-tile software debug & testing + EXTERNAL_REFERENCES ext_refs; + size_t tile_list_size; + uint8_t *tile_list_output; + size_t buffer_sz; } AV1Decoder; int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, const uint8_t **dest); -int av1_get_raw_frame(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd); +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params); int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame); @@ -157,29 +181,16 @@ aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx, YV12_BUFFER_CONFIG *sd); aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, YV12_BUFFER_CONFIG *sd); - -static INLINE uint8_t read_marker(aom_decrypt_cb decrypt_cb, - void *decrypt_state, const uint8_t *data) { - if (decrypt_cb) { - uint8_t marker; - decrypt_cb(decrypt_state, data, &marker, 1); - return marker; - } - return *data; -} - -// This function is exposed for use in tests, as well as the inlined function -// "read_marker". 
-aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count, - int *index_size, - aom_decrypt_cb decrypt_cb, - void *decrypt_state); +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); struct AV1Decoder *av1_decoder_create(BufferPool *const pool); void av1_decoder_remove(struct AV1Decoder *pbi); +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync); static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { @@ -196,7 +207,6 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } } -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi, RefCntBuffer *frame_buf) { AV1_COMMON *const cm = &pbi->common; @@ -208,7 +218,6 @@ static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi, } return (i < INTER_REFS_PER_FRAME); } -#endif // CONFIG_EXT_REFS #define ACCT_STR __func__ static INLINE int av1_read_uniform(aom_reader *r, int n) { @@ -222,6 +231,13 @@ static INLINE int av1_read_uniform(aom_reader *r, int n) { return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR); } +typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, + aom_reader *r); + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row, + int mi_col, aom_reader *r, BLOCK_SIZE bsize, + palette_visitor_fn_t visit); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c index 13f944b35..f9a3e8578 100644 --- a/third_party/aom/av1/decoder/decodetxb.c +++ b/third_party/aom/av1/decoder/decodetxb.c @@ -9,28 +9,25 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/scan.h" +#include "av1/decoder/decodetxb.h" + +#include "aom_ports/mem.h" #include "av1/common/idct.h" +#include "av1/common/scan.h" #include "av1/common/txb_common.h" #include "av1/decoder/decodemv.h" -#include "av1/decoder/decodetxb.h" -#include "av1/decoder/dsubexp.h" -#include "av1/decoder/symbolrate.h" #define ACCT_STR __func__ -static int read_golomb(MACROBLOCKD *xd, aom_reader *r, FRAME_COUNTS *counts) { -#if !CONFIG_SYMBOLRATE - (void)counts; -#endif +static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { int x = 1; int length = 0; int i = 0; while (!i) { - i = av1_read_record_bit(counts, r, ACCT_STR); + i = aom_read_bit(r, ACCT_STR); ++length; - if (length >= 32) { + if (length > 20) { aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid length in read_golomb"); break; @@ -39,570 +36,306 @@ static int read_golomb(MACROBLOCKD *xd, aom_reader *r, FRAME_COUNTS *counts) { for (i = 0; i < length - 1; ++i) { x <<= 1; - x += av1_read_record_bit(counts, r, ACCT_STR); + x += aom_read_bit(r, ACCT_STR); } return x - 1; } -static INLINE int read_nz_map(aom_reader *r, tran_low_t *tcoeffs, int plane, - const int16_t *scan, TX_SIZE tx_size, - TX_TYPE tx_type, FRAME_CONTEXT *fc, - FRAME_COUNTS *counts) { - TX_SIZE txs_ctx = get_txsize_context(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; -#if CONFIG_CTX1D - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int seg_eob = - (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif - const PLANE_TYPE plane_type = get_plane_type(plane); - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - (counts) ? 
&counts->nz_map[txs_ctx][plane_type] : NULL; -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; - aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type]; -#endif - int c; - for (c = 0; c < seg_eob; ++c) { - int is_nz; - int coeff_ctx = get_nz_map_ctx(tcoeffs, c, scan, bwl, height, tx_type); - int eob_ctx = get_eob_ctx(tcoeffs, scan[c], txs_ctx, tx_type); - - if (c < seg_eob - 1) { -#if LV_MAP_PROB - is_nz = av1_read_record_bin( - counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2, - ACCT_STR); -#else - is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR); -#endif - } else { - is_nz = 1; - } - - // set non-zero coefficient map. - tcoeffs[scan[c]] = is_nz; - - if (c == seg_eob - 1) { - ++c; - break; - } - - if (counts) ++(*nz_map_count)[coeff_ctx][is_nz]; - - if (is_nz) { -#if LV_MAP_PROB - int is_eob = av1_read_record_bin( - counts, r, fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2, - ACCT_STR); -#else - int is_eob = aom_read(r, eob_flag[eob_ctx], ACCT_STR); -#endif - if (counts) ++counts->eob_flag[txs_ctx][plane_type][eob_ctx][is_eob]; - if (is_eob) break; - } +static INLINE int rec_eob_pos(const int eob_token, const int extra) { + int eob = k_eob_group_start[eob_token]; + if (eob > 2) { + eob += extra; } - return AOMMIN(seg_eob, c + 1); + return eob; } -#if CONFIG_CTX1D -static INLINE int read_nz_map_vert(aom_reader *r, tran_low_t *tcoeffs, - int plane, const int16_t *scan, - const int16_t *iscan, TX_SIZE tx_size, - TX_TYPE tx_type, FRAME_CONTEXT *fc, - FRAME_COUNTS *counts) { - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - int eob = 0; -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int col = 0; 
col < width; ++col) { - int el_ctx = get_empty_line_ctx(col, eob_ls); -#if LV_MAP_PROB - int empty_line = av1_read_record_bin( - counts, r, fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2, - ACCT_STR); -#else - int empty_line = aom_read( - r, fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx], ACCT_STR); -#endif - if (counts) - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][empty_line]; - if (!empty_line) { - int row; - for (row = 0; row < height; ++row) { - if (row + 1 != height) { - int coeff_idx = row * width + col; - int scan_idx = iscan[coeff_idx]; - int coeff_ctx = - get_nz_map_ctx(tcoeffs, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - int is_nz = av1_read_record_bin( - counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2, - ACCT_STR); -#else - int is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR); -#endif - if (counts) ++counts->nz_map[txs_ctx][plane_type][coeff_ctx][is_nz]; - tcoeffs[coeff_idx] = is_nz; - if (is_nz) { - eob = AOMMAX(eob, iscan[coeff_idx] + 1); - if (row + 1 != height) { - int eob_ctx = get_hv_eob_ctx(col, row, eob_ls); -#if LV_MAP_PROB - int is_eob = av1_read_record_bin( - counts, r, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2, - ACCT_STR); -#else - int is_eob = aom_read( - r, fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx], - ACCT_STR); -#endif - if (counts) - ++counts - ->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx][is_eob]; - if (is_eob) break; - } - } - } else { - int coeff_idx = row * width + col; - tcoeffs[coeff_idx] = 1; - eob = AOMMAX(eob, iscan[coeff_idx] + 1); - } +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, + int start_si, int end_si, + const int16_t *scan, int bwl, + uint8_t 
*levels, + base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx_2d(levels, pos, bwl); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; } - eob_ls[col] = AOMMIN(height, row + 1); - } else { - eob_ls[col] = 0; } + levels[get_padded_idx(pos, bwl)] = level; } - return eob; } -static INLINE int read_nz_map_horiz(aom_reader *r, tran_low_t *tcoeffs, - int plane, const int16_t *scan, - const int16_t *iscan, TX_SIZE tx_size, - TX_TYPE tx_type, FRAME_CONTEXT *fc, - FRAME_COUNTS *counts) { - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - int eob = 0; -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int row = 0; row < height; ++row) { - int el_ctx = get_empty_line_ctx(row, eob_ls); -#if LV_MAP_PROB - int empty_line = av1_read_record_bin( - counts, r, fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2, - ACCT_STR); -#else - int empty_line = aom_read( - r, fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx], ACCT_STR); -#endif - if (counts) - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][empty_line]; - if (!empty_line) { - int col; - for (col = 0; col < width; ++col) { - if (col + 1 != width) { - int coeff_idx = row * width + col; - int scan_idx = 
iscan[coeff_idx]; - int coeff_ctx = - get_nz_map_ctx(tcoeffs, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - int is_nz = av1_read_record_bin( - counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2, - ACCT_STR); -#else - int is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR); -#endif - if (counts) ++counts->nz_map[txs_ctx][plane_type][coeff_ctx][is_nz]; - tcoeffs[coeff_idx] = is_nz; - if (is_nz) { - eob = AOMMAX(eob, iscan[coeff_idx] + 1); - int eob_ctx = get_hv_eob_ctx(row, col, eob_ls); -#if LV_MAP_PROB - int is_eob = av1_read_record_bin( - counts, r, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2, - ACCT_STR); -#else - int is_eob = - aom_read(r, fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx], - ACCT_STR); -#endif - if (counts) - ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx][is_eob]; - if (is_eob) break; - } - } else { - int coeff_idx = row * width + col; - tcoeffs[coeff_idx] = 1; - eob = AOMMAX(eob, iscan[coeff_idx] + 1); - } +static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, + TX_CLASS tx_class, int start_si, + int end_si, const int16_t *scan, int bwl, + uint8_t *levels, base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = + get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; } - eob_ls[row] = AOMMIN(width, col + 1); - } else { - eob_ls[row] = 0; } + levels[get_padded_idx(pos, bwl)] = level; } - return eob; } -#endif -uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - 
aom_reader *r, int blk_row, int blk_col, int block, - int plane, tran_low_t *tcoeffs, TXB_CTX *txb_ctx, - TX_SIZE tx_size, int16_t *max_scan_line, int *eob) { - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - FRAME_COUNTS *counts = xd->counts; - TX_SIZE txs_ctx = get_txsize_context(tx_size); - PLANE_TYPE plane_type = get_plane_type(plane); -#if !LV_MAP_PROB - aom_prob *nz_map = ec_ctx->nz_map[txs_ctx][plane_type]; - aom_prob *eob_flag = ec_ctx->eob_flag[txs_ctx][plane_type]; -#endif - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int seg_eob = tx_size_2d[tx_size]; - int c = 0; - int update_eob = -1; - const int16_t *const dequant = xd->plane[plane].seg_dequant[mbmi->segment_id]; +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int32_t max_value = (1 << (7 + xd->bd)) - 1; + const int32_t min_value = -(1 << (7 + xd->bd)); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id]; + tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane]; const int shift = av1_get_tx_scale(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); int cul_level = 0; - memset(tcoeffs, 0, sizeof(*tcoeffs) * seg_eob); - -#if LV_MAP_PROB - int all_zero = av1_read_record_bin( - counts, r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, - ACCT_STR); -#else - int all_zero = - aom_read(r, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx], ACCT_STR); 
-#endif - if (xd->counts) - ++xd->counts->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx][all_zero]; - + int dc_val = 0; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const int all_zero = aom_read_symbol( + r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t *const eob = &(eob_data->eob); + uint16_t *const max_scan_line = &(eob_data->max_scan_line); + *max_scan_line = 0; *eob = 0; if (all_zero) { *max_scan_line = 0; -#if CONFIG_TXK_SEL - if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = DCT_DCT; -#endif + if (plane == 0) { + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + mbmi->txk_type[txk_type_idx] = DCT_DCT; + } return 0; } - (void)blk_row; - (void)blk_col; -#if CONFIG_TXK_SEL - av1_read_tx_type(cm, xd, blk_row, blk_col, block, plane, - get_min_tx_size(tx_size), r); -#endif - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; - -#if CONFIG_CTX1D - const int16_t *iscan = scan_order->iscan; - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - *eob = - read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx, counts); - } else { -#if LV_MAP_PROB - const int eob_mode = av1_read_record_bin( - counts, r, ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2, - ACCT_STR); -#else - const int eob_mode = - aom_read(r, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class], ACCT_STR); -#endif - if (counts) ++counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode]; - if (eob_mode == 0) { - *eob = read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx, - counts); - } else { - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - *eob = read_nz_map_vert(r, tcoeffs, 
plane, scan, iscan, tx_size, - tx_type, ec_ctx, counts); - else - *eob = read_nz_map_horiz(r, tcoeffs, plane, scan, iscan, tx_size, - tx_type, ec_ctx, counts); - } + memset(levels_buf, 0, + sizeof(*levels_buf) * + ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); + if (plane == AOM_PLANE_Y) { + // only y plane's tx_type is transmitted + av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); + } + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + const qm_val_t *iqmatrix = + IS_2D_TRANSFORM(tx_type) + ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int eob_extra = 0; + int eob_pt = 1; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], + 5, ACCT_STR) + + 1; + break; + case 1: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], + 6, ACCT_STR) + + 1; + break; + case 2: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], + 7, ACCT_STR) + + 1; + break; + case 3: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], + 8, ACCT_STR) + + 1; + break; + case 4: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], + 9, ACCT_STR) + + 1; + break; + case 5: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], + 10, ACCT_STR) + + 1; + break; + case 6: + default: + eob_pt = aom_read_symbol( + r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11, + ACCT_STR) + + 1; + break; } -#else - *eob = read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx, counts); -#endif - *max_scan_line = *eob; - - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { -#if !LV_MAP_PROB - aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i]; -#endif - update_eob = 0; - for (c = *eob - 1; c >= 0; --c) { - tran_low_t *v = &tcoeffs[scan[c]]; - int sign; - int ctx; - - if (*v <= i) continue; - - ctx = get_base_ctx(tcoeffs, scan[c], bwl, height, i + 1); - -#if LV_MAP_PROB - if (av1_read_record_bin( - counts, r, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], 2, - ACCT_STR)) -#else - if (aom_read(r, coeff_base[ctx], ACCT_STR)) -#endif - { - *v = i + 1; - cul_level += i + 1; - if (counts) ++counts->coeff_base[txs_ctx][plane_type][i][ctx][1]; + if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + int bit = aom_read_symbol( + r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); + if (bit) { + eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1)); + } - if (c == 0) { - int dc_sign_ctx = txb_ctx->dc_sign_ctx; 
-#if LV_MAP_PROB - sign = av1_read_record_bin( - counts, r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2, - ACCT_STR); -#else - sign = - aom_read(r, ec_ctx->dc_sign[plane_type][dc_sign_ctx], ACCT_STR); -#endif - if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign]; - } else { - sign = av1_read_record_bit(counts, r, ACCT_STR); - } - if (sign) *v = -(*v); - continue; + for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { + bit = aom_read_bit(r, ACCT_STR); + if (bit) { + eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i)); } - *v = i + 2; - if (counts) ++counts->coeff_base[txs_ctx][plane_type][i][ctx][0]; - - // update the eob flag for coefficients with magnitude above 1. - update_eob = AOMMAX(update_eob, c); } } - - for (c = update_eob; c >= 0; --c) { - tran_low_t *v = &tcoeffs[scan[c]]; - int sign; - int idx; - int ctx; - - if (*v <= NUM_BASE_LEVELS) continue; - - if (c == 0) { - int dc_sign_ctx = txb_ctx->dc_sign_ctx; -#if LV_MAP_PROB - sign = av1_read_record_bin( - counts, r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2, ACCT_STR); -#else - sign = aom_read(r, ec_ctx->dc_sign[plane_type][dc_sign_ctx], ACCT_STR); -#endif - if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign]; + *eob = rec_eob_pos(eob_pt, eob_extra); + + { + // Read the non-zero coefficient with scan index eob-1 + // TODO(angiebird): Put this into a function + const int c = *eob - 1; + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c); + const int nsymbs = 3; + aom_cdf_prob *cdf = + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; + int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol( + r, + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx], + BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) 
break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } + if (*eob > 1) { + base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type]; + br_cdf_arr br_cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type]; + if (tx_class == TX_CLASS_2D) { + read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels, + base_cdf, br_cdf); + read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels, + base_cdf, br_cdf); } else { - sign = av1_read_record_bit(counts, r, ACCT_STR); + read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl, + levels, base_cdf, br_cdf); } + } - ctx = get_br_ctx(tcoeffs, scan[c], bwl, height); - -#if BR_NODE - for (idx = 0; idx < BASE_RANGE_SETS; ++idx) { -#if LV_MAP_PROB - if (av1_read_record_bin( - counts, r, ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2, - ACCT_STR)) -#else // LV_MAP_PROB - if (aom_read(r, ec_ctx->coeff_br[txs_ctx][plane_type][idx][ctx], - ACCT_STR)) -#endif // LV_MAP_PROB - { - int extra_bits = (1 << br_extra_bits[idx]) - 1; - // int br_offset = aom_read_literal(r, extra_bits, ACCT_STR); - int br_offset = 0; - int tok; - if (counts) ++counts->coeff_br[txs_ctx][plane_type][idx][ctx][1]; - for (tok = 0; tok < extra_bits; ++tok) { -#if LV_MAP_PROB - if (av1_read_record_bin( - counts, r, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2, - ACCT_STR)) -#else - if (aom_read(r, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx], - ACCT_STR)) -#endif - { - br_offset = tok; - if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][1]; - break; - } - if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][0]; - } - if (tok == extra_bits) br_offset = extra_bits; - - int br_base = br_index_to_coeff[idx]; - - *v = NUM_BASE_LEVELS + 1 + br_base + br_offset; - cul_level += *v; - if (sign) *v = -(*v); - break; + int16_t num_zero_coeffs = 0; + for (int c = 0; c < *eob; ++c) { + const int pos = scan[c]; + num_zero_coeffs = AOMMAX(num_zero_coeffs, pos); + } + memset(tcoeffs, 0, 
(num_zero_coeffs + 1) * sizeof(tcoeffs[0])); + + for (int c = 0; c < *eob; ++c) { + const int pos = scan[c]; + uint8_t sign; + tran_low_t level = levels[get_padded_idx(pos, bwl)]; + if (level) { + *max_scan_line = AOMMAX(*max_scan_line, pos); + if (c == 0) { + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2, ACCT_STR); + } else { + sign = aom_read_bit(r, ACCT_STR); + } + if (level >= MAX_BASE_BR_RANGE) { + level += read_golomb(xd, r); } - if (counts) ++counts->coeff_br[txs_ctx][plane_type][idx][ctx][0]; - } - - if (idx < BASE_RANGE_SETS) continue; -#else - for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { -#if LV_MAP_PROB - if (av1_read_record_bin(counts, r, - ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], - 2, ACCT_STR)) -#else - if (aom_read(r, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx], ACCT_STR)) -#endif - { - *v = (idx + 1 + NUM_BASE_LEVELS); - if (sign) *v = -(*v); - cul_level += abs(*v); - if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][1]; - break; + if (c == 0) dc_val = sign ? -level : level; + + // Bitmasking to clamp level to valid range: + // The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit + level &= 0xfffff; + cul_level += level; + tran_low_t dq_coeff; + // Bitmasking to clamp dq_coeff to valid range: + // The valid range for 8/10/12 bit video is at most 17/19/21 bit + dq_coeff = (tran_low_t)( + (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); + dq_coeff = dq_coeff >> shift; + if (sign) { + dq_coeff = -dq_coeff; } - if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][0]; + tcoeffs[pos] = clamp(dq_coeff, min_value, max_value); } - if (idx < COEFF_BASE_RANGE) continue; -#endif - - // decode 0-th order Golomb code - *v = read_golomb(xd, r, counts) + COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS; - if (sign) *v = -(*v); - cul_level += abs(*v); - } - - for (c = 0; c < *eob; ++c) { - int16_t dqv = (c == 0) ? 
dequant[0] : dequant[1]; - tran_low_t *v = &tcoeffs[scan[c]]; -#if CONFIG_SYMBOLRATE - av1_record_coeff(counts, abs(*v)); -#endif - int sign = (*v) < 0; - *v = (abs(*v) * dqv) >> shift; - if (sign) *v = -(*v); } - cul_level = AOMMIN(63, cul_level); + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); // DC value - set_dc_sign(&cul_level, tcoeffs[0]); + set_dc_sign(&cul_level, dc_val); return cul_level; } -uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_reader *r, int row, int col, int block, - int plane, tran_low_t *tcoeffs, - TX_SIZE tx_size, int16_t *max_scan_line, - int *eob) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct macroblockd_plane *pd = &xd->plane[plane]; +uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int row, const int col, + const int plane, const TX_SIZE tx_size) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else // CONFIG_CB4X4 const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif // CONFIG_CB4X4 + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col, pd->left_context + row, &txb_ctx); - uint8_t cul_level = - av1_read_coeffs_txb(cm, xd, r, row, col, block, plane, tcoeffs, &txb_ctx, - tx_size, max_scan_line, eob); -#if CONFIG_ADAPT_SCAN - PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, block, tx_size); - if (xd->counts && *eob > 0) - av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff, - *eob); -#endif - av1_set_contexts(xd, pd, plane, tx_size, 
cul_level, col, row); + const uint8_t cul_level = + av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row); return cul_level; } - -#if !LV_MAP_PROB -static void read_txb_probs(FRAME_CONTEXT *fc, const TX_SIZE tx_size, - aom_reader *r, FRAME_COUNTS *counts) { -#if !CONFIG_SYMBOLRATE - (void)counts; -#endif - int plane, ctx, level; - - if (av1_read_record_bit(counts, r, ACCT_STR) == 0) return; - - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->txb_skip[tx_size][ctx], ACCT_STR); - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->nz_map[tx_size][plane][ctx], ACCT_STR); - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->eob_flag[tx_size][plane][ctx], ACCT_STR); - - for (level = 0; level < NUM_BASE_LEVELS; ++level) - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->coeff_base[tx_size][plane][level][ctx], - ACCT_STR); - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->coeff_lps[tx_size][plane][ctx], ACCT_STR); -} - -void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r, - FRAME_COUNTS *counts) { - const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - TX_SIZE tx_size; - int ctx, plane; - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - av1_diff_update_prob(r, &fc->dc_sign[plane][ctx], ACCT_STR); - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - read_txb_probs(fc, tx_size, r, counts); -} -#endif // !LV_MAP_PROB diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h index 1c6512e97..d0b3d8c7a 100644 --- 
a/third_party/aom/av1/decoder/decodetxb.h +++ b/third_party/aom/av1/decoder/decodetxb.h @@ -12,24 +12,21 @@ #ifndef DECODETXB_H_ #define DECODETXB_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" #include "aom_dsp/bitreader.h" -uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_reader *r, int blk_row, int blk_col, int block, - int plane, tran_low_t *tcoeffs, TXB_CTX *txb_ctx, - TX_SIZE tx_size, int16_t *max_scan_line, int *eob); +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size); -uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_reader *r, int row, int col, int block, - int plane, tran_low_t *tcoeffs, - TX_SIZE tx_size, int16_t *max_scan_line, - int *eob); -#if !LV_MAP_PROB -void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r, - FRAME_COUNTS *counts); -#endif // !LV_MAP_PROB +uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int row, const int col, + const int plane, const TX_SIZE tx_size); #endif // DECODETXB_H_ diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c index a59a7bac1..9d54bd13d 100644 --- a/third_party/aom/av1/decoder/detokenize.c +++ b/third_party/aom/av1/decoder/detokenize.c @@ -9,245 +9,18 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./aom_config.h" -#if !CONFIG_PVQ +#include "config/aom_config.h" + #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" -#endif // !CONFIG_PVQ - #include "av1/common/blockd.h" #include "av1/decoder/detokenize.h" #define ACCT_STR __func__ -#if !CONFIG_PVQ || CONFIG_VAR_TX #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/idct.h" -#endif - -#include "av1/decoder/symbolrate.h" - -#if !CONFIG_PVQ || CONFIG_VAR_TX -#define EOB_CONTEXT_NODE 0 -#define ZERO_CONTEXT_NODE 1 -#define ONE_CONTEXT_NODE 2 -#define LOW_VAL_CONTEXT_NODE 0 -#define TWO_CONTEXT_NODE 1 -#define THREE_CONTEXT_NODE 2 -#define HIGH_LOW_CONTEXT_NODE 3 -#define CAT_ONE_CONTEXT_NODE 4 -#define CAT_THREEFOUR_CONTEXT_NODE 5 -#define CAT_THREE_CONTEXT_NODE 6 -#define CAT_FIVE_CONTEXT_NODE 7 - -#define INCREMENT_COUNT(token) \ - do { \ - if (counts) ++coef_counts[band][ctx][token]; \ - } while (0) - -#if CONFIG_NEW_MULTISYMBOL -#define READ_COEFF(counts, prob_name, cdf_name, num, r) \ - read_coeff(counts, cdf_name, num, r); -static INLINE int read_coeff(FRAME_COUNTS *counts, - const aom_cdf_prob *const *cdf, int n, - aom_reader *r) { -#if !CONFIG_SYMBOLRATE - (void)counts; -#endif - int val = 0; - int i = 0; - int count = 0; - while (count < n) { - const int size = AOMMIN(n - count, 4); - val |= av1_read_record_cdf(counts, r, cdf[i++], 1 << size, ACCT_STR) - << count; - count += size; - } - return val; -} -#else -#define READ_COEFF(counts, prob_name, cdf_name, num, r) \ - read_coeff(counts, prob_name, num, r); -static INLINE int read_coeff(FRAME_COUNTS *counts, const aom_prob *probs, int n, - aom_reader *r) { -#if !CONFIG_SYMBOLRATE - (void)counts; -#endif - int i, val = 0; - for (i = 0; i < n; ++i) - val = (val << 1) | av1_read_record(counts, r, probs[i], ACCT_STR); - return val; -} - -#endif - -static int token_to_value(FRAME_COUNTS *counts, aom_reader *const r, int token, - TX_SIZE tx_size, int bit_depth) { -#if !CONFIG_HIGHBITDEPTH - assert(bit_depth == 
8); -#endif // !CONFIG_HIGHBITDEPTH - - switch (token) { - case ZERO_TOKEN: - case ONE_TOKEN: - case TWO_TOKEN: - case THREE_TOKEN: - case FOUR_TOKEN: return token; - case CATEGORY1_TOKEN: - return CAT1_MIN_VAL + - READ_COEFF(counts, av1_cat1_prob, av1_cat1_cdf, 1, r); - case CATEGORY2_TOKEN: - return CAT2_MIN_VAL + - READ_COEFF(counts, av1_cat2_prob, av1_cat2_cdf, 2, r); - case CATEGORY3_TOKEN: - return CAT3_MIN_VAL + - READ_COEFF(counts, av1_cat3_prob, av1_cat3_cdf, 3, r); - case CATEGORY4_TOKEN: - return CAT4_MIN_VAL + - READ_COEFF(counts, av1_cat4_prob, av1_cat4_cdf, 4, r); - case CATEGORY5_TOKEN: - return CAT5_MIN_VAL + - READ_COEFF(counts, av1_cat5_prob, av1_cat5_cdf, 5, r); - case CATEGORY6_TOKEN: { - const int skip_bits = (int)sizeof(av1_cat6_prob) - - av1_get_cat6_extrabits_size(tx_size, bit_depth); - return CAT6_MIN_VAL + READ_COEFF(counts, av1_cat6_prob + skip_bits, - av1_cat6_cdf, 18 - skip_bits, r); - } - default: - assert(0); // Invalid token. - return -1; - } -} - -static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff, - TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq, -#if CONFIG_NEW_QUANT - dequant_val_type_nuq *dq_val, -#else -#if CONFIG_AOM_QM - qm_val_t *iqm[2][TX_SIZES_ALL], -#endif // CONFIG_AOM_QM -#endif // CONFIG_NEW_QUANT - int ctx, const int16_t *scan, const int16_t *nb, - int16_t *max_scan_line, aom_reader *r) { - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const int max_eob = tx_size_2d[tx_size]; - const int ref = is_inter_block(&xd->mi[0]->mbmi); -#if CONFIG_AOM_QM && !CONFIG_NEW_QUANT - const qm_val_t *iqmatrix = iqm[!ref][tx_size]; -#endif // CONFIG_AOM_QM - (void)tx_type; - int band, c = 0; - const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size]; - aom_cdf_prob(*coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_head_cdfs[tx_size_ctx][type][ref]; - aom_cdf_prob(*coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_tail_cdfs[tx_size_ctx][type][ref]; - int val = 0; - 
- uint8_t token_cache[MAX_TX_SQUARE]; - const uint8_t *band_translate = get_band_translate(tx_size); - int dq_shift; - int v, token; - int32_t dqv = dq[0]; -#if CONFIG_NEW_QUANT - const tran_low_t *dqv_val = &dq_val[0][0]; -#endif // CONFIG_NEW_QUANT - - dq_shift = av1_get_tx_scale(tx_size); - - band = *band_translate++; - - int more_data = 1; - while (more_data) { - int comb_token; - int last_pos = (c + 1 == max_eob); - int first_pos = (c == 0); - -#if CONFIG_NEW_QUANT - dqv_val = &dq_val[band][0]; -#endif // CONFIG_NEW_QUANT - - comb_token = last_pos ? 2 * av1_read_record_bit(xd->counts, r, ACCT_STR) + 2 - : av1_read_record_symbol( - xd->counts, r, coef_head_cdfs[band][ctx], - HEAD_TOKENS + first_pos, ACCT_STR) + - !first_pos; - if (first_pos) { - if (comb_token == 0) return 0; - } - token = comb_token >> 1; - - while (!token) { - *max_scan_line = AOMMAX(*max_scan_line, scan[c]); - token_cache[scan[c]] = 0; -#if CONFIG_SYMBOLRATE - av1_record_coeff(xd->counts, 0); -#endif - ++c; - dqv = dq[1]; - ctx = get_coef_context(nb, token_cache, c); - band = *band_translate++; - - last_pos = (c + 1 == max_eob); - - comb_token = - last_pos - ? 2 * av1_read_record_bit(xd->counts, r, ACCT_STR) + 2 - : av1_read_record_symbol(xd->counts, r, coef_head_cdfs[band][ctx], - HEAD_TOKENS, ACCT_STR) + - 1; - token = comb_token >> 1; - } - - more_data = comb_token & 1; - - if (token > ONE_TOKEN) - token += av1_read_record_symbol(xd->counts, r, coef_tail_cdfs[band][ctx], - TAIL_TOKENS, ACCT_STR); -#if CONFIG_NEW_QUANT - dqv_val = &dq_val[band][0]; -#endif // CONFIG_NEW_QUANT - - *max_scan_line = AOMMAX(*max_scan_line, scan[c]); - token_cache[scan[c]] = av1_pt_energy_class[token]; - - val = token_to_value(xd->counts, r, token, tx_size, xd->bd); -#if CONFIG_SYMBOLRATE - av1_record_coeff(xd->counts, val); -#endif - -#if CONFIG_NEW_QUANT - v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val); - v = dq_shift ? 
ROUND_POWER_OF_TWO(v, dq_shift) : v; -#else -#if CONFIG_AOM_QM - // Apply quant matrix only for 2D transforms - if (IS_2D_TRANSFORM(tx_type) && iqmatrix != NULL) - dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; -#endif - v = (val * dqv) >> dq_shift; -#endif - - v = (int)check_range(av1_read_record_bit(xd->counts, r, ACCT_STR) ? -v : v, - xd->bd); - - dqcoeff[scan[c]] = v; - - ++c; - more_data &= (c < max_eob); - if (!more_data) break; - dqv = dq[1]; - ctx = get_coef_context(nb, token_cache, c); - band = *band_translate++; - } - - return c; -} -#endif // !CONFIG_PVQ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { uint8_t color_order[PALETTE_MAX_SIZE]; @@ -263,7 +36,6 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { color_map[0] = av1_read_uniform(r, n); assert(color_map[0] < n); -#if CONFIG_PALETTE_THROUGHPUT // Run wavefront on the palette map index decoding. for (int i = 1; i < rows + cols - 1; ++i) { for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) { @@ -283,21 +55,6 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { (plane_block_width - cols)); } } -#else - for (int i = 0; i < rows; ++i) { - for (int j = (i == 0 ? 1 : 0); j < cols; ++j) { - const int color_ctx = av1_get_palette_color_index_context( - color_map, plane_block_width, i, j, n, color_order, NULL); - const int color_idx = aom_read_symbol( - r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR); - assert(color_idx >= 0 && color_idx < n); - color_map[i * plane_block_width + j] = color_order[color_idx]; - } - memset(color_map + i * plane_block_width + cols, - color_map[i * plane_block_width + cols - 1], - (plane_block_width - cols)); // Copy last column to extra columns. - } -#endif // CONFIG_PALETTE_THROUGHPUT // Copy last row to extra rows. 
for (int i = rows; i < plane_block_height; ++i) { memcpy(color_map + i * plane_block_width, @@ -305,97 +62,17 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { } } -static void get_palette_params(const MACROBLOCKD *const xd, int plane, - BLOCK_SIZE bsize, Av1ColorMapParam *params) { - assert(plane == 0 || plane == 1); - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - params->color_map = xd->plane[plane].color_index_map; - params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf - : xd->tile_ctx->palette_y_color_index_cdf; - params->n_colors = pmi->palette_size[plane]; - av1_get_block_dimensions(bsize, plane, xd, ¶ms->plane_width, - ¶ms->plane_height, ¶ms->rows, ¶ms->cols); -} - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK -static void get_mrc_params(const MACROBLOCKD *const xd, TX_SIZE tx_size, - Av1ColorMapParam *params) { - memset(params, 0, sizeof(*params)); - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); - params->color_map = xd->mrc_mask; - params->map_cdf = is_inter ? 
xd->tile_ctx->mrc_mask_inter_cdf - : xd->tile_ctx->mrc_mask_intra_cdf; - params->n_colors = 2; - params->plane_width = tx_size_wide[tx_size]; - params->rows = tx_size_high[tx_size]; - params->cols = tx_size_wide[tx_size]; -} -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; assert(plane == 0 || plane == 1); - assert(mbmi->sb_type >= BLOCK_8X8); - Av1ColorMapParam color_map_params; - memset(&color_map_params, 0, sizeof(color_map_params)); - get_palette_params(xd, plane, mbmi->sb_type, &color_map_params); - decode_color_map_tokens(&color_map_params, r); -} - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK -static void decode_mrc_tokens(MACROBLOCKD *const xd, TX_TYPE tx_size, - aom_reader *r) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); - if ((is_inter && !SIGNAL_MRC_MASK_INTER) || - (!is_inter && !SIGNAL_MRC_MASK_INTRA)) - return; - Av1ColorMapParam color_map_params; - get_mrc_params(xd, tx_size, &color_map_params); - decode_color_map_tokens(&color_map_params, r); -} -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - -#if !CONFIG_PVQ || CONFIG_VAR_TX -int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane, - const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size, - TX_TYPE tx_type, int16_t *max_scan_line, - aom_reader *r, int seg_id) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int16_t *const dequant = pd->seg_dequant[seg_id]; - const int ctx = - get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y); -#if CONFIG_NEW_QUANT - const int ref = is_inter_block(&xd->mi[0]->mbmi); - int dq = - get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type); -#endif // CONFIG_NEW_QUANT - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - if (tx_type == MRC_DCT) decode_mrc_tokens(xd, tx_size, r); -#endif // CONFIG_MRC_TX && 
SIGNAL_ANY_MRC_MASK - - const int eob = - decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant, -#if CONFIG_NEW_QUANT - pd->seg_dequant_nuq[seg_id][dq], -#else -#if CONFIG_AOM_QM - pd->seg_iqmatrix[seg_id], -#endif // CONFIG_AOM_QM -#endif // CONFIG_NEW_QUANT - ctx, sc->scan, sc->neighbors, max_scan_line, r); - av1_set_contexts(xd, pd, plane, tx_size, eob > 0, x, y); -#if CONFIG_ADAPT_SCAN - if (xd->counts) - av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff, - eob); -#else - (void)cm; -#endif - return eob; + Av1ColorMapParam params; + params.color_map = + xd->plane[plane].color_index_map + xd->color_index_map_offset[plane]; + params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + params.n_colors = mbmi->palette_mode_info.palette_size[plane]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, ¶ms.plane_width, + ¶ms.plane_height, ¶ms.rows, ¶ms.cols); + decode_color_map_tokens(¶ms, r); } -#endif // !CONFIG_PVQ diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h index eb31d58c6..ec85bf7ea 100644 --- a/third_party/aom/av1/decoder/detokenize.h +++ b/third_party/aom/av1/decoder/detokenize.h @@ -12,10 +12,9 @@ #ifndef AV1_DECODER_DETOKENIZE_H_ #define AV1_DECODER_DETOKENIZE_H_ -#include "./aom_config.h" -#if !CONFIG_PVQ || CONFIG_VAR_TX +#include "config/aom_config.h" + #include "av1/common/scan.h" -#endif // !CONFIG_PVQ || CONFIG_VAR_TX #include "av1/decoder/decoder.h" #ifdef __cplusplus @@ -24,12 +23,6 @@ extern "C" { void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); -#if !CONFIG_PVQ || CONFIG_VAR_TX -int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane, - const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size, - TX_TYPE tx_type, int16_t *max_scan_line, - aom_reader *r, int seg_id); -#endif // !CONFIG_PVQ #ifdef __cplusplus } 
// extern "C" #endif diff --git a/third_party/aom/av1/decoder/dsubexp.c b/third_party/aom/av1/decoder/dsubexp.c deleted file mode 100644 index 5171f1144..000000000 --- a/third_party/aom/av1/decoder/dsubexp.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "av1/common/entropy.h" - -#include "av1/decoder/dsubexp.h" - -static int inv_recenter_nonneg(int v, int m) { - if (v > 2 * m) return v; - - return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1); -} - -#define decode_uniform(r, ACCT_STR_NAME) \ - decode_uniform_(r ACCT_STR_ARG(ACCT_STR_NAME)) -#define decode_term_subexp(r, ACCT_STR_NAME) \ - decode_term_subexp_(r ACCT_STR_ARG(ACCT_STR_NAME)) - -static int decode_uniform_(aom_reader *r ACCT_STR_PARAM) { - const int l = 8; - const int m = (1 << l) - 190; - const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); - return v < m ? 
v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); -} - -static int inv_remap_prob(int v, int m) { - /* clang-format off */ - static uint8_t inv_map_table[MAX_PROB - 1] = { - 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, - 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, - 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, - 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, - 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, - 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, - 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, - 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, - 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, - 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, - 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, - 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, - 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, - 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253 - }; /* clang-format on */ - assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]))); - v = inv_map_table[v]; - m--; - if ((m << 1) <= MAX_PROB) { - return 1 + inv_recenter_nonneg(v, m); - } else { - return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m); - } -} - -static int decode_term_subexp_(aom_reader *r ACCT_STR_PARAM) { - if (!aom_read_bit(r, ACCT_STR_NAME)) - return aom_read_literal(r, 4, ACCT_STR_NAME); - if (!aom_read_bit(r, ACCT_STR_NAME)) - return aom_read_literal(r, 4, ACCT_STR_NAME) + 16; - if (!aom_read_bit(r, ACCT_STR_NAME)) - return aom_read_literal(r, 5, ACCT_STR_NAME) + 32; - return 
decode_uniform(r, ACCT_STR_NAME) + 64; -} - -void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM) { - if (aom_read(r, DIFF_UPDATE_PROB, ACCT_STR_NAME)) { - const int delp = decode_term_subexp(r, ACCT_STR_NAME); - *p = (aom_prob)inv_remap_prob(delp, *p); - } -} diff --git a/third_party/aom/av1/decoder/dsubexp.h b/third_party/aom/av1/decoder/dsubexp.h deleted file mode 100644 index 4bc38578c..000000000 --- a/third_party/aom/av1/decoder/dsubexp.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_DECODER_DSUBEXP_H_ -#define AV1_DECODER_DSUBEXP_H_ - -#include "aom_dsp/bitreader.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if CONFIG_ACCOUNTING -#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p, str) -#else -#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p) -#endif - -void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM); - -#ifdef __cplusplus -} // extern "C" -#endif -#endif // AV1_DECODER_DSUBEXP_H_ diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c index 7f16b233c..ff03502e6 100644 --- a/third_party/aom/av1/decoder/dthread.c +++ b/third_party/aom/av1/decoder/dthread.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_mem/aom_mem.h" #include "av1/common/reconinter.h" #include "av1/decoder/dthread.h" @@ -157,12 +158,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker, av1_frameworker_unlock_stats(src_worker); dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_HIGHBITDEPTH dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif -#if CONFIG_EXT_REFS -// TODO(zoeliu): To handle parallel decoding -#endif // CONFIG_EXT_REFS + // TODO(zoeliu): To handle parallel decoding dst_cm->prev_frame = src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; dst_cm->last_width = @@ -180,14 +177,10 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker, memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; -#if CONFIG_LOOPFILTER_LEVEL + dst_cm->lf.sharpness_level = src_cm->lf.sharpness_level; dst_cm->lf.filter_level[0] = src_cm->lf.filter_level[0]; dst_cm->lf.filter_level[1] = src_cm->lf.filter_level[1]; -#else - dst_cm->lf.filter_level = src_cm->lf.filter_level; -#endif - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, TOTAL_REFS_PER_FRAME); + memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, REF_FRAMES); memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); dst_cm->seg = src_cm->seg; memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h index c17053d9c..33d89006e 100644 --- a/third_party/aom/av1/decoder/dthread.h +++ b/third_party/aom/av1/decoder/dthread.h @@ -12,7 +12,8 @@ #ifndef AV1_DECODER_DTHREAD_H_ #define AV1_DECODER_DTHREAD_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_util/aom_thread.h" #include "aom/internal/aom_codec_internal.h" @@ -22,6 +23,13 @@ extern "C" { struct AV1Common; struct AV1Decoder; 
+struct ThreadData; + +typedef struct DecWorkerData { + struct ThreadData *td; + const uint8_t *data_end; + struct aom_internal_error_info error_info; +} DecWorkerData; // WorkerData for the FrameWorker thread. It contains all the information of // the worker and decode structures for decoding a frame. diff --git a/third_party/aom/av1/decoder/generic_decoder.c b/third_party/aom/av1/decoder/generic_decoder.c deleted file mode 100644 index 0c7d71b9f..000000000 --- a/third_party/aom/av1/decoder/generic_decoder.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include - -#include "aom_dsp/bitreader.h" -#include "av1/common/generic_code.h" -#include "av1/common/odintrin.h" -#include "pvq_decoder.h" - -/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts - * the cdf accordingly. 
- * - * @param [in,out] r multi-symbol entropy decoder - * @param [in,out] cdf CDF of the variable (Q15) - * @param [in] n number of values possible - * @param [in,out] count number of symbols encoded with that cdf so far - * @param [in] rate adaptation rate shift (smaller is faster) - * @return decoded variable - */ -int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n, - int *count, int rate ACCT_STR_PARAM) { - int val; - int i; - if (*count == 0) { - int ft; - ft = cdf[n - 1]; - for (i = 0; i < n; i++) { - cdf[i] = AOM_ICDF(cdf[i]*32768/ft); - } - } - val = aom_read_cdf(r, cdf, n, ACCT_STR_NAME); - aom_cdf_adapt_q15(val, cdf, n, count, rate); - return val; -} - -/** Encodes a random variable using a "generic" model, assuming that the - * distribution is one-sided (zero and up), has a single mode, and decays - * exponentially past the model. - * - * @param [in,out] r multi-symbol entropy decoder - * @param [in,out] model generic probability model - * @param [in] x variable being encoded - * @param [in,out] ExQ16 expectation of x (adapted) - * @param [in] integration integration period of ExQ16 (leaky average over - * 1<> 1); - /* Choose the cdf to use: we have two per "octave" of ExQ16. */ - id = OD_MINI(GENERIC_TABLES - 1, lg_q1); - cdf = model->cdf[id]; - xs = aom_read_symbol_pvq(r, cdf, 16, ACCT_STR_NAME); - if (xs == 15) { - int e; - unsigned decay; - /* Estimate decay based on the assumption that the distribution is close - to Laplacian for large values. We should probably have an adaptive - estimate instead. Note: The 2* is a kludge that's not fully understood - yet. 
*/ - OD_ASSERT(*ex_q16 < INT_MAX >> 1); - e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift; - decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256))); - xs += aom_laplace_decode_special(r, decay, ACCT_STR_NAME); - } - if (shift != 0) { - int special; - /* Because of the rounding, there's only half the number of possibilities - for xs=0 */ - special = xs == 0; - if (shift - special > 0) { - lsb = aom_read_literal(r, shift - special, ACCT_STR_NAME); - } - lsb -= !special << (shift - 1); - } - x = (xs << shift) + lsb; - generic_model_update(ex_q16, x, integration); - OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG, - "dec: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, dec->rng)); - return x; -} diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c index 98c51d4ba..e6c89298a 100644 --- a/third_party/aom/av1/decoder/inspection.c +++ b/third_party/aom/av1/decoder/inspection.c @@ -11,12 +11,7 @@ #include "av1/decoder/decoder.h" #include "av1/decoder/inspection.h" #include "av1/common/enums.h" -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif -#if CONFIG_CFL -#include "av1/common/cfl.h" -#endif static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) { fd->mi_cols = mi_cols; @@ -48,25 +43,29 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) { fd->show_frame = cm->show_frame; fd->frame_type = cm->frame_type; fd->base_qindex = cm->base_qindex; - fd->tile_mi_cols = cm->tile_width; - fd->tile_mi_rows = cm->tile_height; + // Set width and height of the first tile until generic support can be added + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, 0); + av1_tile_set_col(&tile_info, cm, 0); + fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start; + fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start; + fd->delta_q_present_flag = cm->delta_q_present_flag; + fd->delta_q_res = cm->delta_q_res; #if CONFIG_ACCOUNTING fd->accounting = &pbi->accounting; #endif -#if CONFIG_CDEF -// TODO(negge): 
copy per frame CDEF data -#endif + // TODO(negge): copy per frame CDEF data int i, j; for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < 2; j++) { - fd->y_dequant[i][j] = cm->y_dequant[i][j]; - fd->uv_dequant[i][j] = cm->uv_dequant[i][j]; + fd->y_dequant[i][j] = cm->y_dequant_QTX[i][j]; + fd->u_dequant[i][j] = cm->u_dequant_QTX[i][j]; + fd->v_dequant[i][j] = cm->v_dequant_QTX[i][j]; } } for (j = 0; j < cm->mi_rows; j++) { for (i = 0; i < cm->mi_cols; i++) { - const MB_MODE_INFO *mbmi = - &cm->mi_grid_visible[j * cm->mi_stride + i]->mbmi; + const MB_MODE_INFO *mbmi = cm->mi_grid_visible[j * cm->mi_stride + i]; insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i]; // Segment mi->segment_id = mbmi->segment_id; @@ -90,24 +89,19 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) { mi->sb_type = mbmi->sb_type; // Skip Flag mi->skip = mbmi->skip; -#if CONFIG_DUAL_FILTER mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0); mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1); -#else - mi->filter = av1_extract_interp_filter(mbmi->interp_filters, 0); -#endif + mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1]; // Transform - mi->tx_type = mbmi->tx_type; + // TODO(anyone): extract tx type info from mbmi->txk_type[]. 
+ mi->tx_type = DCT_DCT; mi->tx_size = mbmi->tx_size; -#if CONFIG_CDEF mi->cdef_level = cm->cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS; mi->cdef_strength = cm->cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS; mi->cdef_strength += mi->cdef_strength == 3; -#endif -#if CONFIG_CFL if (mbmi->uv_mode == UV_CFL_PRED) { mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; mi->cfl_alpha_sign = mbmi->cfl_alpha_signs; @@ -115,7 +109,8 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) { mi->cfl_alpha_idx = 0; mi->cfl_alpha_sign = 0; } -#endif + // delta_q + mi->current_qindex = mbmi->current_qindex; } } return 1; diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h index 06a94b737..bb604f684 100644 --- a/third_party/aom/av1/decoder/inspection.h +++ b/third_party/aom/av1/decoder/inspection.h @@ -20,7 +20,9 @@ extern "C" { #include "av1/decoder/accounting.h" #endif +#ifndef AOM_AOMDX_H_ typedef void (*aom_inspect_cb)(void *decoder, void *data); +#endif typedef struct insp_mv insp_mv; @@ -33,27 +35,21 @@ typedef struct insp_mi_data insp_mi_data; struct insp_mi_data { insp_mv mv[2]; - int8_t ref_frame[2]; - int8_t mode; - int8_t uv_mode; - int8_t sb_type; - int8_t skip; - int8_t segment_id; -#if CONFIG_DUAL_FILTER - int8_t filter[2]; -#else - int8_t filter; -#endif - int8_t tx_type; - int8_t tx_size; -#if CONFIG_CDEF - int8_t cdef_level; - int8_t cdef_strength; -#endif -#if CONFIG_CFL - int8_t cfl_alpha_idx; - int8_t cfl_alpha_sign; -#endif + int16_t ref_frame[2]; + int16_t mode; + int16_t uv_mode; + int16_t sb_type; + int16_t skip; + int16_t segment_id; + int16_t dual_filter_type; + int16_t filter[2]; + int16_t tx_type; + int16_t tx_size; + int16_t cdef_level; + int16_t cdef_strength; + int16_t cfl_alpha_idx; + int16_t cfl_alpha_sign; + int16_t current_qindex; }; typedef struct insp_frame_data insp_frame_data; @@ -71,10 +67,11 @@ struct insp_frame_data { int tile_mi_rows; int tile_mi_cols; int16_t 
y_dequant[MAX_SEGMENTS][2]; - int16_t uv_dequant[MAX_SEGMENTS][2]; -#if CONFIG_CDEF -// TODO(negge): add per frame CDEF data -#endif + int16_t u_dequant[MAX_SEGMENTS][2]; + int16_t v_dequant[MAX_SEGMENTS][2]; + // TODO(negge): add per frame CDEF data + int delta_q_present_flag; + int delta_q_res; }; void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); diff --git a/third_party/aom/av1/decoder/laplace_decoder.c b/third_party/aom/av1/decoder/laplace_decoder.c deleted file mode 100644 index 5cc080ea7..000000000 --- a/third_party/aom/av1/decoder/laplace_decoder.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include - -#include "aom_dsp/bitreader.h" -#include "av1/common/pvq.h" -#include "pvq_decoder.h" - -#define aom_decode_pvq_split(r, adapt, sum, ctx, ACCT_STR_NAME) \ - aom_decode_pvq_split_(r, adapt, sum, ctx ACCT_STR_ARG(ACCT_STR_NAME)) - -static int aom_decode_pvq_split_(aom_reader *r, od_pvq_codeword_ctx *adapt, - int sum, int ctx ACCT_STR_PARAM) { - int shift; - int count; - int msbs; - int fctx; - count = 0; - if (sum == 0) return 0; - shift = OD_MAXI(0, OD_ILOG(sum) - 3); - fctx = 7*ctx + (sum >> shift) - 1; - msbs = aom_read_symbol_pvq(r, adapt->pvq_split_cdf[fctx], (sum >> shift) + 1, - ACCT_STR_NAME); - if (shift) count = aom_read_literal(r, shift, ACCT_STR_NAME); - count += msbs << shift; - if (count > sum) { - count = sum; -#if !CONFIG_ANS - r->ec.error = 1; -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - } - return count; -} - -void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt, - od_coeff *y, int n, int k, int level) { - int mid; - int count_right; - if (n == 1) { - y[0] = k; - } - else if (k == 0) { - OD_CLEAR(y, n); - } - else if (k == 1 && n <= 16) { - int cdf_id; - int pos; - cdf_id = od_pvq_k1_ctx(n, level == 0); - OD_CLEAR(y, n); - pos = aom_read_symbol_pvq(r, adapt->pvq_k1_cdf[cdf_id], n, "pvq:k1"); - y[pos] = 1; - } - else { - mid = n >> 1; - count_right = aom_decode_pvq_split(r, adapt, k, od_pvq_size_ctx(n), - "pvq:split"); - aom_decode_band_pvq_splits(r, adapt, y, mid, k - count_right, level + 1); - aom_decode_band_pvq_splits(r, adapt, y + mid, n - mid, count_right, - level + 1); - } -} - -/** Decodes the tail of a Laplace-distributed variable, i.e. it doesn't - * do anything special for the zero case. - * - * @param [dec] range decoder - * @param [decay] decay factor of the distribution, i.e. 
pdf ~= decay^x - * - * @retval decoded variable x - */ -int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM) { - int pos; - int shift; - int xs; - int sym; - const uint16_t *cdf; - shift = 0; - /* We don't want a large decay value because that would require too many - symbols. */ - while (decay > 235) { - decay = (decay*decay + 128) >> 8; - shift++; - } - decay = OD_MINI(decay, 254); - decay = OD_MAXI(decay, 2); - cdf = EXP_CDF_TABLE[(decay + 1) >> 1]; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d\n", decay)); - xs = 0; - do { - sym = OD_MINI(xs, 15); - { - int i; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d", xs, shift, sym)); - for (i = 0; i < 16; i++) { - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i])); - } - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n")); - } - sym = aom_read_cdf(r, cdf, 16, ACCT_STR_NAME); - xs += sym; - } while (sym >= 15); - if (shift) pos = (xs << shift) + aom_read_literal(r, shift, ACCT_STR_NAME); - else pos = xs; - return pos; -} diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c new file mode 100644 index 000000000..482b6415e --- /dev/null +++ b/third_party/aom/av1/decoder/obu.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_ports/mem_ops.h" + +#include "av1/common/common.h" +#include "av1/common/timing.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/obu.h" + +// Picture prediction structures (0-12 are predefined) in scalability metadata. +typedef enum { + SCALABILITY_L1T2 = 0, + SCALABILITY_L1T3 = 1, + SCALABILITY_L2T1 = 2, + SCALABILITY_L2T2 = 3, + SCALABILITY_L2T3 = 4, + SCALABILITY_S2T1 = 5, + SCALABILITY_S2T2 = 6, + SCALABILITY_S2T3 = 7, + SCALABILITY_L2T1h = 8, + SCALABILITY_L2T2h = 9, + SCALABILITY_L2T3h = 10, + SCALABILITY_S2T1h = 11, + SCALABILITY_S2T2h = 12, + SCALABILITY_S2T3h = 13, + SCALABILITY_SS = 14 +} SCALABILITY_STRUCTURES; + +// Returns 1 when OBU type is valid, and 0 otherwise. +static int valid_obu_type(int obu_type) { + int valid_type = 0; + switch (obu_type) { + case OBU_SEQUENCE_HEADER: + case OBU_TEMPORAL_DELIMITER: + case OBU_FRAME_HEADER: + case OBU_TILE_GROUP: + case OBU_METADATA: + case OBU_FRAME: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_TILE_LIST: + case OBU_PADDING: valid_type = 1; break; + default: break; + } + return valid_type; +} + +// Parses OBU header and stores values in 'header'. +static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, + int is_annexb, ObuHeader *header) { + if (!rb || !header) return AOM_CODEC_INVALID_PARAM; + + const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; + if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size = 1; + + if (aom_rb_read_bit(rb) != 0) { + // Forbidden bit. Must not be set. 
+ return AOM_CODEC_CORRUPT_FRAME; + } + + header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); + + if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME; + + header->has_extension = aom_rb_read_bit(rb); + header->has_size_field = aom_rb_read_bit(rb); + + if (!header->has_size_field && !is_annexb) { + // section 5 obu streams must have obu_size field set. + return AOM_CODEC_UNSUP_BITSTREAM; + } + + if (aom_rb_read_bit(rb) != 0) { + // obu_reserved_1bit must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + + if (header->has_extension) { + if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size += 1; + header->temporal_layer_id = aom_rb_read_literal(rb, 3); + header->spatial_layer_id = aom_rb_read_literal(rb, 2); + if (aom_rb_read_literal(rb, 3) != 0) { + // extension_header_reserved_3bits must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb) { + if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; + + // TODO(tomfinegan): Set the error handler here and throughout this file, and + // confirm parsing work done via aom_read_bit_buffer is successful. 
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers) { + // derive number of spatial/temporal layers from operating_point_idc + + if (!number_spatial_layers || !number_temporal_layers) + return AOM_CODEC_INVALID_PARAM; + + if (operating_point_idc == 0) { + *number_temporal_layers = 1; + *number_spatial_layers = 1; + } else { + *number_spatial_layers = 0; + *number_temporal_layers = 0; + for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) { + *number_spatial_layers += + (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1; + } + for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) { + *number_temporal_layers += (operating_point_idc >> j) & 0x1; + } + } + + return AOM_CODEC_OK; +} + +static int is_obu_in_current_operating_point(AV1Decoder *pbi, + ObuHeader obu_header) { + if (!pbi->current_operating_point) { + return 1; + } + + if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 && + (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) & + 0x1) { + return 1; + } + return 0; +} + +static uint32_t read_temporal_delimiter_obu() { return 0; } + +// Returns a boolean that indicates success. +static int read_bitstream_level(BitstreamLevel *bl, + struct aom_read_bit_buffer *rb) { + const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); + if (!is_valid_seq_level_idx(seq_level_idx)) return 0; + bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN; + bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1); + return 1; +} + +// On success, sets pbi->sequence_header_ready to 1 and returns the number of +// bytes read from 'rb'. 
+// On failure, sets pbi->common.error.error_code and returns 0. +static uint32_t read_sequence_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const uint32_t saved_bit_offset = rb->bit_offset; + + // Verify rb has been configured to report errors. + assert(rb->error_handler); + + cm->profile = av1_read_profile(rb); + if (cm->profile > PROFILE_2) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + SequenceHeader *const seq_params = &cm->seq_params; + + // Still picture or not + seq_params->still_picture = aom_rb_read_bit(rb); + seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); + // Video must have reduced_still_picture_hdr = 0 + if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + if (seq_params->reduced_still_picture_hdr) { + cm->timing_info_present = 0; + seq_params->decoder_model_info_present_flag = 0; + seq_params->display_model_info_present_flag = 0; + seq_params->operating_points_cnt_minus_1 = 0; + seq_params->operating_point_idc[0] = 0; + if (!read_bitstream_level(&seq_params->level[0], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + seq_params->tier[0] = 0; + cm->op_params[0].decoder_model_param_present_flag = 0; + cm->op_params[0].display_model_param_present_flag = 0; + } else { + cm->timing_info_present = aom_rb_read_bit(rb); // timing_info_present_flag + if (cm->timing_info_present) { + av1_read_timing_info_header(cm, rb); + + seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (seq_params->decoder_model_info_present_flag) + av1_read_decoder_model_info(cm, rb); + } else { + seq_params->decoder_model_info_present_flag = 0; + } + seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); + seq_params->operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < 
seq_params->operating_points_cnt_minus_1 + 1; i++) { + seq_params->operating_point_idc[i] = + aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (!read_bitstream_level(&seq_params->level[i], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7 + // is equivalent to level 3.3. + if (seq_params->level[i].major > 3) + seq_params->tier[i] = aom_rb_read_bit(rb); + else + seq_params->tier[i] = 0; + if (seq_params->decoder_model_info_present_flag) { + cm->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb); + if (cm->op_params[i].decoder_model_param_present_flag) + av1_read_op_parameters_info(cm, rb, i); + } else { + cm->op_params[i].decoder_model_param_present_flag = 0; + } + if (cm->timing_info_present && + (cm->timing_info.equal_picture_interval || + cm->op_params[i].decoder_model_param_present_flag)) { + cm->op_params[i].bitrate = max_level_bitrate( + cm->profile, major_minor_to_seq_level_idx(seq_params->level[i]), + seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass + // the check + if (cm->op_params[i].bitrate == 0) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of " + "profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + cm->op_params[i].buffer_size = cm->op_params[i].bitrate; + } + if (cm->timing_info_present && cm->timing_info.equal_picture_interval && + !cm->op_params[i].decoder_model_param_present_flag) { + // When the decoder_model_parameters are not sent for this op, set + // the default ones that can be used with the resource availability mode + cm->op_params[i].decoder_buffer_delay = 70000; + cm->op_params[i].encoder_buffer_delay = 20000; + cm->op_params[i].low_delay_mode_flag = 0; + } + + if (seq_params->display_model_info_present_flag) { + cm->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb); + if 
(cm->op_params[i].display_model_param_present_flag) { + cm->op_params[i].initial_display_delay = + aom_rb_read_literal(rb, 4) + 1; + if (cm->op_params[i].initial_display_delay > 10) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support more than 10 decoded frames delay"); + } else { + cm->op_params[i].initial_display_delay = 10; + } + } else { + cm->op_params[i].display_model_param_present_flag = 0; + cm->op_params[i].initial_display_delay = 10; + } + } + } + // This decoder supports all levels. Choose operating point provided by + // external means + int operating_point = pbi->operating_point; + if (operating_point < 0 || + operating_point > seq_params->operating_points_cnt_minus_1) + operating_point = 0; + pbi->current_operating_point = + seq_params->operating_point_idc[operating_point]; + if (aom_get_num_layers_from_operating_point_idc( + pbi->current_operating_point, &cm->number_spatial_layers, + &cm->number_temporal_layers) != AOM_CODEC_OK) { + cm->error.error_code = AOM_CODEC_ERROR; + return 0; + } + + read_sequence_header(cm, rb); + + av1_read_color_config(cm, rb, pbi->allow_lowbitdepth); + + cm->film_grain_params_present = aom_rb_read_bit(rb); + + if (av1_check_trailing_bits(pbi, rb) != 0) { + // cm->error.error_code is already set. 
+ return 0; + } + + pbi->sequence_header_ready = 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +static uint32_t read_frame_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end, + trailing_bits_present); + return (uint32_t)(pbi->uncomp_hdr_size); +} + +static int32_t read_tile_group_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int *start_tile, int *end_tile, + int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + uint32_t saved_bit_offset = rb->bit_offset; + int tile_start_and_end_present_flag = 0; + const int num_tiles = pbi->common.tile_rows * pbi->common.tile_cols; + + if (!pbi->common.large_scale_tile && num_tiles > 1) { + tile_start_and_end_present_flag = aom_rb_read_bit(rb); + } + if (pbi->common.large_scale_tile || num_tiles == 1 || + !tile_start_and_end_present_flag) { + *start_tile = 0; + *end_tile = num_tiles - 1; + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); + } + if (tile_start_implicit && tile_start_and_end_present_flag) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + *start_tile = + aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); + *end_tile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +static uint32_t read_one_tile_group_obu( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg, + const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, + int *is_last_tg, int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + int start_tile, end_tile; + int32_t header_size, tg_payload_size; + + header_size = read_tile_group_header(pbi, rb, &start_tile, 
&end_tile, + tile_start_implicit); + if (header_size == -1) return 0; + if (start_tile > end_tile) return header_size; + data += header_size; + av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, + end_tile, is_first_tg); + + tg_payload_size = (uint32_t)(*p_data_end - data); + + // TODO(shan): For now, assume all tile groups received in order + *is_last_tg = end_tile == cm->tile_rows * cm->tile_cols - 1; + return header_size + tg_payload_size; +} + +// Only called while large_scale_tile = 1. +static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, + int *frame_decoding_finished) { + AV1_COMMON *const cm = &pbi->common; + uint32_t tile_list_payload_size = 0; + const int num_tiles = cm->tile_cols * cm->tile_rows; + const int start_tile = 0; + const int end_tile = num_tiles - 1; + int i = 0; + + // Process the tile list info. + pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + if (pbi->tile_count_minus_1 > 511) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + // Allocate output frame buffer for the tile list. + // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the + // output buffer. This needs to be modified according to the application + // requirement. + const int tile_width_in_pixels = cm->tile_width * MI_SIZE; + const int tile_height_in_pixels = cm->tile_height * MI_SIZE; + const int ssy = cm->subsampling_y; + const int ssx = cm->subsampling_x; + const int num_planes = av1_num_planes(cm); + const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels; + const size_t uvplane_tile_size = + (num_planes > 1) + ? 
(tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx) + : 0; + const size_t tile_size = (cm->use_highbitdepth ? 2 : 1) * + (yplane_tile_size + 2 * uvplane_tile_size); + pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1); + + if (pbi->tile_list_size > pbi->buffer_sz) { + if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output); + pbi->tile_list_output = NULL; + + pbi->tile_list_output = (uint8_t *)aom_memalign(32, pbi->tile_list_size); + if (pbi->tile_list_output == NULL) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate the tile list output buffer"); + pbi->buffer_sz = pbi->tile_list_size; + } + + uint32_t tile_list_info_bytes = 4; + tile_list_payload_size += tile_list_info_bytes; + data += tile_list_info_bytes; + uint8_t *output = pbi->tile_list_output; + + for (i = 0; i <= pbi->tile_count_minus_1; i++) { + // Process 1 tile. + // Reset the bit reader. + rb->bit_offset = 0; + rb->bit_buffer = data; + + // Read out the tile info. + uint32_t tile_info_bytes = 5; + // Set reference for each tile. 
+ int ref_idx = aom_rb_read_literal(rb, 8); + if (ref_idx >= MAX_EXTERNAL_REFERENCES) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + av1_set_reference_dec(cm, 0, 1, &pbi->ext_refs.refs[ref_idx]); + + pbi->dec_tile_row = aom_rb_read_literal(rb, 8); + pbi->dec_tile_col = aom_rb_read_literal(rb, 8); + if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || + pbi->dec_tile_row >= cm->tile_rows || + pbi->dec_tile_col >= cm->tile_cols) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; + data += tile_info_bytes; + if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size, + p_data_end, start_tile, end_tile, 0); + uint32_t tile_payload_size = (uint32_t)(*p_data_end - data); + + tile_list_payload_size += tile_info_bytes + tile_payload_size; + + // Update data ptr for next tile decoding. + data = *p_data_end; + assert(data <= data_end); + + // Copy decoded tile to the tile list output buffer. + YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm); + const int mi_row = pbi->dec_tile_row * cm->tile_height; + const int mi_col = pbi->dec_tile_col * cm->tile_width; + const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; + uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL }; + int strides[MAX_MB_PLANE] = { 0, 0, 0 }; + int plane; + + for (plane = 0; plane < num_planes; ++plane) { + int shift_x = plane > 0 ? ssx : 0; + int shift_y = plane > 0 ? ssy : 0; + + bufs[plane] = cur_frame->buffers[plane]; + strides[plane] = + (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0]; + if (is_hbd) { + bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(cur_frame->buffers[plane]); + strides[plane] = + (plane > 0) ? 
2 * cur_frame->strides[1] : 2 * cur_frame->strides[0]; + } + + bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] + + mi_col * (MI_SIZE >> shift_x); + + int w, h; + w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x) + : tile_width_in_pixels; + w *= (1 + is_hbd); + h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y) + : tile_height_in_pixels; + int j; + + for (j = 0; j < h; ++j) { + memcpy(output, bufs[plane], w); + bufs[plane] += strides[plane]; + output += w; + } + } + } + + *frame_decoding_finished = 1; + return tile_list_payload_size; +} + +static void read_metadata_itut_t35(const uint8_t *data, size_t sz) { + struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL }; + for (size_t i = 0; i < sz; i++) { + aom_rb_read_literal(&rb, 8); + } +} + +static void read_metadata_hdr_cll(const uint8_t *data, size_t sz) { + struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL }; + aom_rb_read_literal(&rb, 16); // max_cll + aom_rb_read_literal(&rb, 16); // max_fall +} + +static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) { + struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL }; + for (int i = 0; i < 3; i++) { + aom_rb_read_literal(&rb, 16); // primary_i_chromaticity_x + aom_rb_read_literal(&rb, 16); // primary_i_chromaticity_y + } + + aom_rb_read_literal(&rb, 16); // white_point_chromaticity_x + aom_rb_read_literal(&rb, 16); // white_point_chromaticity_y + + aom_rb_read_unsigned_literal(&rb, 32); // luminance_max + aom_rb_read_unsigned_literal(&rb, 32); // luminance_min +} + +static void scalability_structure(struct aom_read_bit_buffer *rb) { + int spatial_layers_cnt = aom_rb_read_literal(rb, 2); + int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1); + int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1); + int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1); + aom_rb_read_literal(rb, 3); // reserved + + if 
(spatial_layer_dimensions_present_flag) { + int i; + for (i = 0; i < spatial_layers_cnt + 1; i++) { + aom_rb_read_literal(rb, 16); + aom_rb_read_literal(rb, 16); + } + } + if (spatial_layer_description_present_flag) { + int i; + for (i = 0; i < spatial_layers_cnt + 1; i++) { + aom_rb_read_literal(rb, 8); + } + } + if (temporal_group_description_present_flag) { + int i, j, temporal_group_size; + temporal_group_size = aom_rb_read_literal(rb, 8); + for (i = 0; i < temporal_group_size; i++) { + aom_rb_read_literal(rb, 3); + aom_rb_read_literal(rb, 1); + aom_rb_read_literal(rb, 1); + int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); + for (j = 0; j < temporal_group_ref_cnt; j++) { + aom_rb_read_literal(rb, 8); + } + } + } +} + +static void read_metadata_scalability(const uint8_t *data, size_t sz) { + struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL }; + int scalability_mode_idc = aom_rb_read_literal(&rb, 8); + if (scalability_mode_idc == SCALABILITY_SS) { + scalability_structure(&rb); + } +} + +static void read_metadata_timecode(const uint8_t *data, size_t sz) { + struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL }; + aom_rb_read_literal(&rb, 5); // counting_type f(5) + int full_timestamp_flag = aom_rb_read_bit(&rb); // full_timestamp_flag f(1) + aom_rb_read_bit(&rb); // discontinuity_flag (f1) + aom_rb_read_bit(&rb); // cnt_dropped_flag f(1) + aom_rb_read_literal(&rb, 9); // n_frames f(9) + if (full_timestamp_flag) { + aom_rb_read_literal(&rb, 6); // seconds_value f(6) + aom_rb_read_literal(&rb, 6); // minutes_value f(6) + aom_rb_read_literal(&rb, 5); // hours_value f(5) + } else { + int seconds_flag = aom_rb_read_bit(&rb); // seconds_flag f(1) + if (seconds_flag) { + aom_rb_read_literal(&rb, 6); // seconds_value f(6) + int minutes_flag = aom_rb_read_bit(&rb); // minutes_flag f(1) + if (minutes_flag) { + aom_rb_read_literal(&rb, 6); // minutes_value f(6) + int hours_flag = aom_rb_read_bit(&rb); // hours_flag f(1) + if (hours_flag) 
{ + aom_rb_read_literal(&rb, 5); // hours_value f(5) + } + } + } + } + // time_offset_length f(5) + int time_offset_length = aom_rb_read_literal(&rb, 5); + if (time_offset_length) { + aom_rb_read_literal(&rb, time_offset_length); // f(time_offset_length) + } +} + +static size_t read_metadata(const uint8_t *data, size_t sz) { + size_t type_length; + uint64_t type_value; + OBU_METADATA_TYPE metadata_type; + if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { + return sz; + } + metadata_type = (OBU_METADATA_TYPE)type_value; + if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) { + read_metadata_itut_t35(data + type_length, sz - type_length); + } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) { + read_metadata_hdr_cll(data + type_length, sz - type_length); + } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) { + read_metadata_hdr_mdcv(data + type_length, sz - type_length); + } else if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) { + read_metadata_scalability(data + type_length, sz - type_length); + } else if (metadata_type == OBU_METADATA_TYPE_TIMECODE) { + read_metadata_timecode(data + type_length, sz - type_length); + } + + return sz; +} + +static aom_codec_err_t read_obu_size(const uint8_t *data, + size_t bytes_available, + size_t *const obu_size, + size_t *const length_field_size) { + uint64_t u_obu_size = 0; + if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != + 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + + if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; + *obu_size = (size_t)u_obu_size; + return AOM_CODEC_OK; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size = 0, obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + 
read_obu_size(data, bytes_available, &obu_size, &length_field_size); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (is_annexb) { + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size(data + obu_header->size, + bytes_available - obu_header->size, payload_size, + &length_field_size); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = length_field_size + obu_header->size; + return AOM_CODEC_OK; +} + +#define EXT_TILE_DEBUG 0 +// On success, returns a boolean that indicates whether the decoding of the +// current frame is finished. On failure, sets cm->error.error_code and +// returns -1. +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end) { + AV1_COMMON *const cm = &pbi->common; + int frame_decoding_finished = 0; + int is_first_tg_obu_received = 1; + int frame_header_size = 0; + int seq_header_received = 0; + size_t seq_header_size = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + pbi->seen_frame_header = 0; + + if (data_end < data) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // Reset pbi->camera_frame_header_ready to 0 if cm->large_scale_tile = 0. 
+ if (!cm->large_scale_tile) pbi->camera_frame_header_ready = 0; + + // decode frame as a series of OBUs + while (!frame_decoding_finished && !cm->error.error_code) { + struct aom_read_bit_buffer rb; + size_t payload_size = 0; + size_t decoded_payload_size = 0; + size_t obu_payload_offset = 0; + size_t bytes_read = 0; + const size_t bytes_available = data_end - data; + + if (bytes_available == 0 && !pbi->seen_frame_header) { + *p_data_end = data; + cm->error.error_code = AOM_CODEC_OK; + break; + } + + aom_codec_err_t status = + aom_read_obu_header_and_size(data, bytes_available, cm->is_annexb, + &obu_header, &payload_size, &bytes_read); + + if (status != AOM_CODEC_OK) { + cm->error.error_code = status; + return -1; + } + + // Record obu size header information. + pbi->obu_size_hdr.data = data + obu_header.size; + pbi->obu_size_hdr.size = bytes_read - obu_header.size; + + // Note: aom_read_obu_header_and_size() takes care of checking that this + // doesn't cause 'data' to advance past 'data_end'. 
+ data += bytes_read; + + if ((size_t)(data_end - data) < payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->temporal_layer_id = obu_header.temporal_layer_id; + cm->spatial_layer_id = obu_header.spatial_layer_id; + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER && + obu_header.type != OBU_PADDING) { + // don't decode obu if it's not in current operating mode + if (!is_obu_in_current_operating_point(pbi, obu_header)) { + data += payload_size; + continue; + } + } + + av1_init_read_bit_buffer(pbi, &rb, data, data_end); + + switch (obu_header.type) { + case OBU_TEMPORAL_DELIMITER: + decoded_payload_size = read_temporal_delimiter_obu(); + pbi->seen_frame_header = 0; + break; + case OBU_SEQUENCE_HEADER: + if (!seq_header_received) { + decoded_payload_size = read_sequence_header_obu(pbi, &rb); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + + seq_header_size = decoded_payload_size; + seq_header_received = 1; + } else { + // Seeing another sequence header, skip as all sequence headers are + // required to be identical except for the contents of + // operating_parameters_info and the amount of trailing bits. + // TODO(yaowu): verifying redundant sequence headers are identical. 
+ decoded_payload_size = seq_header_size; + } + break; + case OBU_FRAME_HEADER: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_FRAME: + // Only decode first frame header received + if (!pbi->seen_frame_header || + (cm->large_scale_tile && !pbi->camera_frame_header_ready)) { + pbi->seen_frame_header = 1; + frame_header_size = read_frame_header_obu( + pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); + if (cm->large_scale_tile) pbi->camera_frame_header_ready = 1; + } + decoded_payload_size = frame_header_size; + pbi->frame_header_size = (size_t)frame_header_size; + + if (cm->show_existing_frame) { + frame_decoding_finished = 1; + pbi->seen_frame_header = 0; + break; + } + +#if !EXT_TILE_DEBUG + // In large scale tile coding, decode the common camera frame header + // before any tile list OBU. + if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { + frame_decoding_finished = 1; + // Skip the rest of the frame data. + decoded_payload_size = payload_size; + // Update data_end. + *p_data_end = data_end; + break; + } +#endif // EXT_TILE_DEBUG + + if (obu_header.type != OBU_FRAME) break; + obu_payload_offset = frame_header_size; + AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. + case OBU_TILE_GROUP: + if (!pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + if ((size_t)(data_end - data) < obu_payload_offset) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size += read_one_tile_group_obu( + pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, + data + payload_size, p_data_end, &frame_decoding_finished, + obu_header.type == OBU_FRAME); + is_first_tg_obu_received = 0; + if (frame_decoding_finished) pbi->seen_frame_header = 0; + break; + case OBU_METADATA: + decoded_payload_size = read_metadata(data, payload_size); + break; + case OBU_TILE_LIST: + // This OBU type is purely for the large scale tile coding mode. 
+ // The common camera frame header has to be already decoded. + if (!pbi->camera_frame_header_ready) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->large_scale_tile = 1; + av1_set_single_tile_decoding_mode(cm); + decoded_payload_size = + read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, + p_data_end, &frame_decoding_finished); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_PADDING: + default: + // Skip unrecognized OBUs + decoded_payload_size = payload_size; + break; + } + + // Check that the signalled OBU size matches the actual amount of data read + if (decoded_payload_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // If there are extra padding bytes, they should all be zero + while (decoded_payload_size < payload_size) { + uint8_t padding_byte = data[decoded_payload_size++]; + if (padding_byte != 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + + data += payload_size; + } + + return frame_decoding_finished; +} +#undef EXT_TILE_DEBUG diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h new file mode 100644 index 000000000..5f2197058 --- /dev/null +++ b/third_party/aom/av1/decoder/obu.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AV1_DECODER_OBU_H +#define AV1_DECODER_OBU_H + +#include "aom/aom_codec.h" +#include "av1/decoder/decoder.h" + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; + // The following fields come from the OBU extension header and therefore are + // only used if has_extension is true. + int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +// Try to decode one frame from a buffer. +// Returns 1 if we decoded a frame, +// 0 if we didn't decode a frame but that's okay +// (eg, if there was a frame but we skipped it), +// or -1 on error +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end); + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *num_spatial_layers, + unsigned int *num_temporal_layers); + +#endif diff --git a/third_party/aom/av1/decoder/pvq_decoder.c b/third_party/aom/av1/decoder/pvq_decoder.c deleted file mode 100644 index d9a8e8056..000000000 --- a/third_party/aom/av1/decoder/pvq_decoder.c +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include "./aom_config.h" -#include "aom_dsp/bitreader.h" -#include "aom_dsp/entcode.h" -#include "aom_dsp/entdec.h" -#include "av1/common/odintrin.h" -#include "av1/common/partition.h" -#include "av1/common/pvq_state.h" -#include "av1/decoder/decint.h" -#include "av1/decoder/pvq_decoder.h" -#include "aom_ports/system_state.h" - -int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs - ACCT_STR_PARAM) { - if (cdf[0] == 0) - aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs)); - return aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME); -} - -static void aom_decode_pvq_codeword(aom_reader *r, od_pvq_codeword_ctx *ctx, - od_coeff *y, int n, int k) { - int i; - aom_decode_band_pvq_splits(r, ctx, y, n, k, 0); - for (i = 0; i < n; i++) { - if (y[i] && aom_read_bit(r, "pvq:sign")) y[i] = -y[i]; - } -} - -/** Inverse of neg_interleave; decodes the interleaved gain. - * - * @param [in] x quantized/interleaved gain to decode - * @param [in] ref quantized gain of the reference - * @return original quantized gain value - */ -static int neg_deinterleave(int x, int ref) { - if (x < 2*ref-1) { - if (x & 1) return ref - 1 - (x >> 1); - else return ref + (x >> 1); - } - else return x+1; -} - -/** Synthesizes one parition of coefficient values from a PVQ-encoded - * vector. 
- * - * @param [out] xcoeff output coefficient partition (x in math doc) - * @param [in] ypulse PVQ-encoded values (y in math doc); in the noref - * case, this vector has n entries, in the - * reference case it contains n-1 entries - * (the m-th entry is not included) - * @param [in] ref reference vector (prediction) - * @param [in] n number of elements in this partition - * @param [in] gr gain of the reference vector (prediction) - * @param [in] noref indicates presence or lack of prediction - * @param [in] g decoded quantized vector gain - * @param [in] theta decoded theta (prediction error) - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - */ -static void pvq_synthesis(od_coeff *xcoeff, od_coeff *ypulse, od_val16 *r16, - int n, od_val32 gr, int noref, od_val32 g, od_val32 theta, const int16_t *qm_inv, - int shift) { - int s; - int m; - /* Sign of the Householder reflection vector */ - s = 0; - /* Direction of the Householder reflection vector */ - m = noref ? 
0 : od_compute_householder(r16, n, gr, &s, shift); - od_pvq_synthesis_partial(xcoeff, ypulse, r16, n, noref, g, theta, m, s, - qm_inv); -} - -typedef struct { - od_coeff *ref; - int nb_coeffs; - int allow_flip; -} cfl_ctx; - -/** Decodes a single vector of integers (eg, a partition within a - * coefficient block) encoded using PVQ - * - * @param [in,out] ec range encoder - * @param [in] q0 scale/quantizer - * @param [in] n number of coefficients in partition - * @param [in,out] model entropy decoder state - * @param [in,out] adapt adaptation context - * @param [in,out] exg ExQ16 expectation of decoded gain value - * @param [in,out] ext ExQ16 expectation of decoded theta value - * @param [in] ref 'reference' (prediction) vector - * @param [out] out decoded partition - * @param [out] noref boolean indicating absence of reference - * @param [in] beta per-band activity masking beta param - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] pli plane index - * @param [in] cdf_ctx selects which cdf context to use - * @param [in,out] skip_rest whether to skip further bands in each direction - * @param [in] band index of the band being decoded - * @param [in] band index of the band being decoded - * @param [out] skip skip flag with range [0,1] - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - */ -static void pvq_decode_partition(aom_reader *r, - int q0, - int n, - generic_encoder model[3], - od_adapt_ctx *adapt, - int *exg, - int *ext, - od_coeff *ref, - od_coeff *out, - int *noref, - od_val16 beta, - int is_keyframe, - int pli, - int cdf_ctx, - cfl_ctx *cfl, - int has_skip, - int *skip_rest, - int band, - int *skip, - const int16_t *qm, - const int16_t *qm_inv) { - int k; - od_val32 qcg; - int itheta; - od_val32 theta; - od_val32 gr; - od_val32 gain_offset; - od_coeff y[MAXN]; - int qg; - int id; - int i; - od_val16 ref16[MAXN]; - int rshift; - theta = 0; - gr = 0; - gain_offset = 
0; - /* Skip is per-direction. For band=0, we can use any of the flags. */ - if (skip_rest[(band + 2) % 3]) { - qg = 0; - if (is_keyframe) { - itheta = -1; - *noref = 1; - } - else { - itheta = 0; - *noref = 0; - } - } - else { - /* Jointly decode gain, itheta and noref for small values. Then we handle - larger gain. */ - id = aom_read_symbol_pvq(r, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0], - 8 + 7*has_skip, "pvq:gaintheta"); - if (!is_keyframe && id >= 10) id++; - if (is_keyframe && id >= 8) id++; - if (id >= 8) { - id -= 8; - skip_rest[0] = skip_rest[1] = skip_rest[2] = 1; - } - qg = id & 1; - itheta = (id >> 1) - 1; - *noref = (itheta == -1); - } - /* The CfL flip bit is only decoded on the first band that has noref=0. */ - if (cfl->allow_flip && !*noref) { - int flip; - flip = aom_read_bit(r, "cfl:flip"); - if (flip) { - for (i = 0; i < cfl->nb_coeffs; i++) cfl->ref[i] = -cfl->ref[i]; - } - cfl->allow_flip = 0; - } - if (qg > 0) { - int tmp; - tmp = *exg; - qg = 1 + generic_decode(r, &model[!*noref], &tmp, 2, "pvq:gain"); - OD_IIR_DIADIC(*exg, qg << 16, 2); - } - *skip = 0; -#if defined(OD_FLOAT_PVQ) - rshift = 0; -#else - /* Shift needed to make the reference fit in 15 bits, so that the Householder - vector can fit in 16 bits. 
*/ - rshift = OD_MAXI(0, od_vector_log_mag(ref, n) - 14); -#endif - for (i = 0; i < n; i++) { -#if defined(OD_FLOAT_PVQ) - ref16[i] = ref[i]*(double)qm[i]*OD_QM_SCALE_1; -#else - ref16[i] = OD_SHR_ROUND(ref[i]*qm[i], OD_QM_SHIFT + rshift); -#endif - } - if(!*noref){ - /* we have a reference; compute its gain */ - od_val32 cgr; - int icgr; - int cfl_enabled; - cfl_enabled = pli != 0 && is_keyframe && !OD_DISABLE_CFL; - cgr = od_pvq_compute_gain(ref16, n, q0, &gr, beta, rshift); - if (cfl_enabled) cgr = OD_CGAIN_SCALE; -#if defined(OD_FLOAT_PVQ) - icgr = (int)floor(.5 + cgr); -#else - icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT); -#endif - /* quantized gain is interleave encoded when there's a reference; - deinterleave it now */ - if (is_keyframe) qg = neg_deinterleave(qg, icgr); - else { - qg = neg_deinterleave(qg, icgr + 1) - 1; - if (qg == 0) *skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY); - } - if (qg == icgr && itheta == 0 && !cfl_enabled) *skip = OD_PVQ_SKIP_COPY; - gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT); - qcg = OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset; - /* read and decode first-stage PVQ error theta */ - if (itheta > 1) { - int tmp; - tmp = *ext; - itheta = 2 + generic_decode(r, &model[2], &tmp, 2, "pvq:theta"); - OD_IIR_DIADIC(*ext, itheta << 16, 2); - } - theta = od_pvq_compute_theta(itheta, od_pvq_compute_max_theta(qcg, beta)); - } - else{ - itheta = 0; - if (!is_keyframe) qg++; - qcg = OD_SHL(qg, OD_CGAIN_SHIFT); - if (qg == 0) *skip = OD_PVQ_SKIP_ZERO; - } - - k = od_pvq_compute_k(qcg, itheta, *noref, n, beta); - if (k != 0) { - /* when noref==0, y is actually size n-1 */ - aom_decode_pvq_codeword(r, &adapt->pvq.pvq_codeword_ctx, y, - n - !*noref, k); - } - else { - OD_CLEAR(y, n); - } - if (*skip) { - if (*skip == OD_PVQ_SKIP_COPY) OD_COPY(out, ref, n); - else OD_CLEAR(out, n); - } - else { - od_val32 g; - g = od_gain_expand(qcg, q0, beta); - pvq_synthesis(out, y, ref16, n, gr, *noref, g, theta, qm_inv, rshift); - } - /* If OD_PVQ_SKIP_ZERO 
or OD_PVQ_SKIP_COPY, set skip to 1 for visualization */ - if (*skip) *skip = 1; -} - -/** Decodes a coefficient block (except for DC) encoded using PVQ - * - * @param [in,out] dec daala decoder context - * @param [in] ref 'reference' (prediction) vector - * @param [out] out decoded partition - * @param [in] q0 quantizer - * @param [in] pli plane index - * @param [in] bs log of the block size minus two - * @param [in] beta per-band activity masking beta param - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [out] flags bitmask of the per band skip and noref flags - * @param [in] ac_dc_coded skip flag for the block (range 0-3) - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - */ -void od_pvq_decode(daala_dec_ctx *dec, - od_coeff *ref, - od_coeff *out, - int q0, - int pli, - int bs, - const od_val16 *beta, - int is_keyframe, - unsigned int *flags, - PVQ_SKIP_TYPE ac_dc_coded, - const int16_t *qm, - const int16_t *qm_inv){ - - int noref[PVQ_MAX_PARTITIONS]; - int skip[PVQ_MAX_PARTITIONS]; - int *exg; - int *ext; - int nb_bands; - int i; - const int *off; - int size[PVQ_MAX_PARTITIONS]; - generic_encoder *model; - int skip_rest[3] = {0}; - cfl_ctx cfl; - const unsigned char *pvq_qm; - int use_masking; - - aom_clear_system_state(); - - /*Default to skip=1 and noref=0 for all bands.*/ - for (i = 0; i < PVQ_MAX_PARTITIONS; i++) { - noref[i] = 0; - skip[i] = 1; - } - - use_masking = dec->use_activity_masking; - - if (use_masking) - pvq_qm = &dec->state.pvq_qm_q4[pli][0]; - else - pvq_qm = 0; - - exg = &dec->state.adapt->pvq.pvq_exg[pli][bs][0]; - ext = dec->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS; - model = dec->state.adapt->pvq.pvq_param_model; - nb_bands = OD_BAND_OFFSETS[bs][0]; - off = &OD_BAND_OFFSETS[bs][1]; - out[0] = ac_dc_coded & DC_CODED; - if (ac_dc_coded < AC_CODED) { - if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0; - else for (i = 1; i < 1 << 
(2*bs + 4); i++) out[i] = ref[i]; - } - else { - for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i]; - cfl.ref = ref; - cfl.nb_coeffs = off[nb_bands]; - cfl.allow_flip = pli != 0 && is_keyframe; - for (i = 0; i < nb_bands; i++) { - int q; - - if (use_masking) - q = OD_MAXI(1, q0 * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4); - else - q = OD_MAXI(1, q0); - - pvq_decode_partition(dec->r, q, size[i], - model, dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i], - &noref[i], beta[i], is_keyframe, pli, - (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i, - &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i], - qm + off[i], qm_inv + off[i]); - if (i == 0 && !skip_rest[0] && bs > 0) { - int skip_dir; - int j; - skip_dir = aom_read_symbol(dec->r, - &dec->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7, - "pvq:skiprest"); - for (j = 0; j < 3; j++) skip_rest[j] = !!(skip_dir & (1 << j)); - } - } - } - *flags = 0; - for (i = nb_bands - 1; i >= 0; i--) { - *flags <<= 1; - *flags |= noref[i]&1; - *flags <<= 1; - *flags |= skip[i]&1; - } -} diff --git a/third_party/aom/av1/decoder/pvq_decoder.h b/third_party/aom/av1/decoder/pvq_decoder.h deleted file mode 100644 index 98970663b..000000000 --- a/third_party/aom/av1/decoder/pvq_decoder.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_pvq_decoder_H) -# define _pvq_decoder_H (1) -# include "aom_dsp/bitreader.h" -# include "aom_dsp/entdec.h" -# include "av1/common/pvq.h" -# include "av1/decoder/decint.h" - -#define aom_read_symbol_pvq(r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_symbol_pvq_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) - -int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs - ACCT_STR_PARAM); - -void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt, - od_coeff *y, int n, int k, int level); - -#define aom_laplace_decode_special(r, decay, ACCT_STR_NAME) \ - aom_laplace_decode_special_(r, decay ACCT_STR_ARG(ACCT_STR_NAME)) - -int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM); - -void od_pvq_decode(daala_dec_ctx *dec, od_coeff *ref, od_coeff *out, int q0, - int pli, int bs, const od_val16 *beta, int is_keyframe, - unsigned int *flags, PVQ_SKIP_TYPE ac_dc_coded, const int16_t *qm, - const int16_t *qm_inv); - -#endif diff --git a/third_party/aom/av1/decoder/symbolrate.h b/third_party/aom/av1/decoder/symbolrate.h deleted file mode 100644 index 023287732..000000000 --- a/third_party/aom/av1/decoder/symbolrate.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/bitreader.h" - -#ifndef AV1_DECODER_SYMBOLRATE_H_ -#define AV1_DECODER_SYMBOLRATE_H_ - -#if CONFIG_SYMBOLRATE -static INLINE void av1_dump_symbol_rate(struct AV1Common *cm) { - const FRAME_COUNTS *counts = &cm->counts; - printf("%d %d %d %d\n", counts->coeff_num[0], counts->coeff_num[1], - counts->symbol_num[0], counts->symbol_num[1]); -} -static INLINE int av1_read_record_symbol(FRAME_COUNTS *counts, aom_reader *r, - aom_cdf_prob *cdf, int nsymbs, - const char *str) { - (void)str; - if (counts) ++counts->symbol_num[0]; - return aom_read_symbol(r, cdf, nsymbs, str); -} - -#if CONFIG_LV_MAP -static INLINE int av1_read_record_bin(FRAME_COUNTS *counts, aom_reader *r, - aom_cdf_prob *cdf, int nsymbs, - const char *str) { - (void)str; - if (counts) ++counts->symbol_num[0]; - return aom_read_bin(r, cdf, nsymbs, str); -} -#endif - -static INLINE int av1_read_record(FRAME_COUNTS *counts, aom_reader *r, int prob, - const char *str) { - (void)str; - if (counts) ++counts->symbol_num[0]; - return aom_read(r, prob, str); -} - -static INLINE int av1_read_record_cdf(FRAME_COUNTS *counts, aom_reader *r, - const aom_cdf_prob *cdf, int nsymbs, - const char *str) { - (void)str; - if (counts) ++counts->symbol_num[0]; - return aom_read_cdf(r, cdf, nsymbs, str); -} - -static INLINE int av1_read_record_bit(FRAME_COUNTS *counts, aom_reader *r, - const char *str) { - (void)str; - if (counts) ++counts->symbol_num[1]; - return aom_read_bit(r, str); -} - -static INLINE void av1_record_coeff(FRAME_COUNTS *counts, tran_low_t qcoeff) { - assert(qcoeff >= 0); - if (counts) ++counts->coeff_num[qcoeff != 0]; -} -#else // CONFIG_SYMBOLRATE - -#define av1_read_record_symbol(counts, r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) - -#if CONFIG_LV_MAP -#define av1_read_record_bin(counts, r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) -#endif - -#define av1_read_record(counts, r, prob, ACCT_STR_NAME) \ - 
aom_read(r, prob, ACCT_STR_NAME) - -#define av1_read_record_cdf(counts, r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) - -#define av1_read_record_bit(counts, r, ACCT_STR_NAME) \ - aom_read_bit(r, ACCT_STR_NAME) - -#endif // CONFIG_SYMBOLRATE - -#endif // AV1_DECODER_SYMBOLRATE_H_ diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/ab_partition_model_weights.h new file mode 100644 index 000000000..5b918fae2 --- /dev/null +++ b/third_party/aom/av1/encoder/ab_partition_model_weights.h @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ +#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define FEATURE_SIZE 10 +#define LABEL_SIZE 16 +// nn model for ab partition pruning, 128x128. 
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = { + -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f, + 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f, + 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f, + -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f, + -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f, + -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f, + 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f, + 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f, + 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f, + 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f, + -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f, + -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f, + 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f, + 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f, + 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f, + 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f, + 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f, + -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f, + 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f, + -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f, + -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f, + -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f, + -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f, + -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f, + 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f, + 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, + 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, + 
0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, + -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, + -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, + -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, + -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, + 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, + 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 
0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, 
-0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, 
-1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, + -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 
0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 
0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 
0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 
0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + 
-0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 
0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. 
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, 
-0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + 
-1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, 
-0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, 
-0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 
1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, 
-0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, 
-0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 
0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 
0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, 
-0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. 
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 0.063261f, -0.148759f, 
-0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, 
-0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, 
-0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, 
-0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 
0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, 
-0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 
5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 
0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, 
-0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, 
-0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, -0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. 
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, 
-0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, 
-0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 
0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, 
-0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 0.172336f, 
0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, 
-0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 
0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 
0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, 
-0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, 
-0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c index 054b0e062..c5a6bc831 100644 --- a/third_party/aom/av1/encoder/aq_complexity.c +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -39,21 +39,29 @@ static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { { -3.0, -2.0, -1.0, 100.00, 
100.0 } }; -#define DEFAULT_COMPLEXITY 64 - static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { // Approximate base quatizer (truncated to int) - const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4; + const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4; return (base_quant > 10) + (base_quant > 25); } void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; struct segmentation *const seg = &cm->seg; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); // Make SURE use of floating point in this function is safe. aom_clear_system_state(); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -74,9 +82,6 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { av1_enable_segmentation(seg); - // Select delta coding method. - seg->abs_delta = SEGMENT_DELTADATA; - // Default segment "Q" feature is disabled so it defaults to the baseline Q. av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); @@ -107,13 +112,13 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { #define DEFAULT_LV_THRESH 10.0 #define MIN_DEFAULT_LV_THRESH 8.0 -#define VAR_STRENGTH_STEP 0.25 // Select a segment for the current block. // The choice of segment for a block depends on the ratio of the projected // bits for the block vs a target average and its spatial complexity. 
void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, int mi_row, int mi_col, int projected_rate) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int mi_offset = mi_row * cm->mi_cols + mi_col; const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]); @@ -126,9 +131,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, segment = DEFAULT_AQ2_SEG; } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). - // It is converted to bits * 256 units. - const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256; - const int denom = cm->mib_size * cm->mib_size; + // It is converted to bits << AV1_PROB_COST_SHIFT units. + const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) + << AV1_PROB_COST_SHIFT; + const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; @@ -139,7 +145,7 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, MIN_DEFAULT_LV_THRESH) : DEFAULT_LV_THRESH; - av1_setup_src_planes(mb, cpi->source, mi_row, mi_col); + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes); logvar = av1_log_block_var(cpi, mb, bs); segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. 
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index 8f61c7eb8..a1fe37d4a 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -320,7 +320,7 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { double fraction_low = 0.0; int low_content_frame = 0; - MODE_INFO **mi; + MB_MODE_INFO **mi; RATE_CONTROL *const rc = &cpi->rc; const int rows = cm->mi_rows, cols = cm->mi_cols; int cnt1 = 0, cnt2 = 0; @@ -330,12 +330,12 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cols; mi_col++) { - int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 - ? mi[0]->mbmi.mv[0].as_mv.row - : -1 * mi[0]->mbmi.mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 - ? mi[0]->mbmi.mv[0].as_mv.col - : -1 * mi[0]->mbmi.mv[0].as_mv.col; + int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 + ? mi[0]->mv[0].as_mv.row + : -1 * mi[0]->mv[0].as_mv.row; + int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 + ? mi[0]->mv[0].as_mv.col + : -1 * mi[0]->mv[0].as_mv.col; // Calculate the motion of the background. if (abs_mvr <= 16 && abs_mvc <= 16) { @@ -389,8 +389,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); - sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size; - sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size; + sb_cols = + (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size; + sb_rows = + (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size; sbs_in_frame = sb_cols * sb_rows; // Number of target blocks to get the q delta (segment 1). 
block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; @@ -406,8 +408,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; - int mi_row = sb_row_index * cm->mib_size; - int mi_col = sb_col_index * cm->mib_size; + int mi_row = sb_row_index * cm->seq_params.mib_size; + int mi_col = sb_col_index * cm->seq_params.mib_size; int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) @@ -416,14 +418,14 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { assert(mi_col >= 0 && mi_col < cm->mi_cols); bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all MI blocks in superblock and update map. - xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size); - ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size); + xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size); + ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size); for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; // If the block is as a candidate for clean up then mark it // for possible boost/refresh (segment 1). The segment id may get - // reset to 0 later if block gets coded anything other than ZEROMV. + // reset to 0 later if block gets coded anything other than GLOBALMV. 
if (cr->map[bl_index2] == 0) { if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++; } else if (cr->map[bl_index2] < 0) { @@ -479,6 +481,16 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + aom_clear_system_state(); + av1_disable_segmentation(seg); + return; + } if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; // Don't apply refresh on key frame or enhancement layer frames. if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) { @@ -509,8 +521,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { // Clear down the segment map. av1_enable_segmentation(&cm->seg); av1_clearall_segfeatures(seg); - // Select delta coding method. 
- seg->abs_delta = SEGMENT_DELTADATA; // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c index 84d967215..29a311447 100644 --- a/third_party/aom/av1/encoder/aq_variance.c +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -19,6 +19,7 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/segmentation.h" +#include "av1/encoder/dwt.h" #include "aom_ports/system_state.h" #define ENERGY_MIN (-4) @@ -34,10 +35,8 @@ static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; #define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; -#if CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; -#endif unsigned int av1_vaq_segment_id(int energy) { ENERGY_IN_BOUNDS(energy); @@ -49,6 +48,16 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { struct segmentation *seg = &cm->seg; int i; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + aom_clear_system_state(); + av1_disable_segmentation(seg); + return; + } if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -57,8 +66,6 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { av1_enable_segmentation(seg); av1_clearall_segfeatures(seg); - seg->abs_delta = SEGMENT_DELTADATA; - aom_clear_system_state(); for (i = 0; i < MAX_SEGMENTS; ++i) { @@ -74,11 +81,6 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { qindex_delta = -cm->base_qindex + 1; } - // No need to enable SEG_LVL_ALT_Q for this segment. 
- if (rate_ratio[i] == 1.0) { - continue; - } - av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q); } @@ -108,7 +110,6 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, } } -#if CONFIG_HIGHBITDEPTH static void aq_highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint64_t *sse, uint64_t *sum) { @@ -139,7 +140,6 @@ static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, *sse = (unsigned int)sse_long; *sum = (int)sum_long; } -#endif // CONFIG_HIGHBITDEPTH static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bs) { @@ -154,7 +154,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; int avg; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh, @@ -165,14 +164,9 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, bw, bh, &sse, &avg); } -#else - aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, - bw, bh, &sse, &avg); -#endif // CONFIG_HIGHBITDEPTH var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); return (unsigned int)((uint64_t)var * 256) / (bw * bh); } else { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, @@ -181,10 +175,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, &sse); } -#else - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - 
av1_all_zeros, 0, &sse); -#endif // CONFIG_HIGHBITDEPTH return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } } @@ -205,3 +195,53 @@ int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { energy = av1_log_block_var(cpi, x, bs) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } + +unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + const int bw = MI_SIZE * mi_size_wide[bs]; + const int bh = MI_SIZE * mi_size_high[bs]; + int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + + int var = 0; + for (int r = 0; r < bh; r += 8) + for (int c = 0; c < bw; c += 8) { + var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd); + } + + return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; +} + +double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int haar_sad = haar_ac_energy(x, bs); + aom_clear_system_state(); + return log(haar_sad + 1.0); +} + +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + double energy, energy_midpoint; + aom_clear_system_state(); + energy_midpoint = (cpi->oxcf.pass == 2) ? 
cpi->twopass.frame_avg_haar_energy + : DEFAULT_E_MIDPOINT; + energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} + +int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, + int block_var_level) { + ENERGY_IN_BOUNDS(block_var_level); + + const int rate_level = SEGMENT_ID(block_var_level); + const AV1_COMMON *const cm = &cpi->common; + int qindex_delta = + av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + rate_ratio[rate_level], cm->bit_depth); + + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + + return qindex_delta; +} diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h index 05725c5de..b1a8bc38a 100644 --- a/third_party/aom/av1/encoder/aq_variance.h +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -23,6 +23,10 @@ void av1_vaq_frame_setup(AV1_COMP *cpi); int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, + int block_var_level); +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c deleted file mode 100644 index fe5233f89..000000000 --- a/third_party/aom/av1/encoder/arm/neon/error_neon.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./av1_rtcd.h" - -int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); - - assert(block_size >= 8); - assert((block_size % 8) == 0); - - do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; - } while (block_size != 0); - - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); -} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c new file mode 100644 index 000000000..b92b3469f --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c @@ -0,0 +1,1902 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "av1/encoder/av1_fwd_txfm1d.h" + +#if CONFIG_COEFFICIENT_RANGE_CHECKING +void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, + int32_t size, int8_t bit); + +#define range_check(stage, input, buf, size, bit) \ + range_check_func(stage, input, buf, size, bit) +#else // CONFIG_COEFFICIENT_RANGE_CHECKING + +#define range_check(stage, input, buf, size, bit) \ + { \ + (void)stage; \ + (void)input; \ + (void)buf; \ + (void)size; \ + (void)bit; \ + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + +void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 4; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[4]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[3]; + bf1[1] = input[1] + input[2]; + bf1[2] = -input[2] + input[1]; + bf1[3] = -input[3] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[2]; + bf1[2] = bf0[1]; + bf1[3] = bf0[3]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + 
// stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[7]; + bf1[1] = input[1] + input[6]; + bf1[2] = input[2] + input[5]; + bf1[3] = input[3] + input[4]; + bf1[4] = -input[4] + input[3]; + bf1[5] = -input[5] + input[2]; + bf1[6] = -input[6] + input[1]; + bf1[7] = -input[7] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + 
bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = 
bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = 
-bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = 
input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + 
bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + 
bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] 
= bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = 
bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], 
bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + range_check(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + 
output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + range_check(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], 
cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] 
= -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = 
half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + 
// stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = 
bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + range_check(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + range_check(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + range_check(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + range_check(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + 
bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + 
input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], 
cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = 
bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = 
-bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = 
bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + 
bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = 
half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = 
bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = 
half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], 
bf0[41], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], 
cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], 
cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], 
-cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); +} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 000000000..9472af8e6 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_FWD_TXFM1D_H_ +#define AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AV1_FWD_TXFM1D_H_ diff --git 
a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 000000000..174689a14 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_FWD_TXFM2D_CFG_H_ +#define AV1_FWD_TXFM2D_CFG_H_ +#include "av1/common/enums.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t fwd_cos_bit_col[5][5]; +extern const int8_t fwd_cos_bit_row[5][5]; +#endif // AV1_FWD_TXFM2D_CFG_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c new file mode 100644 index 000000000..f25a667cf --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" + +static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_fdct4_new; + case TXFM_TYPE_DCT8: return av1_fdct8_new; + case TXFM_TYPE_DCT16: return av1_fdct16_new; + case TXFM_TYPE_DCT32: return av1_fdct32_new; + case TXFM_TYPE_DCT64: return av1_fdct64_new; + case TXFM_TYPE_ADST4: return av1_fadst4_new; + case TXFM_TYPE_ADST8: return av1_fadst8_new; + case TXFM_TYPE_ADST16: return av1_fadst16_new; + case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; + default: assert(0); return NULL; + } +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd) { + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; + } + + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; + } +} + +static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_FLIP_CFG *cfg, + int32_t *buf, int bd) { + int c, r; + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. 
This is intentionally done to + // accurately perform rectangular transforms. When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf 
+ r * txfm_size_col, output + r * txfm_size_col, + cos_bit_row, stage_range_row); + av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. + for (c = 0; c < txfm_size_col; ++c) { + output[r * txfm_size_col + c] = round_shift( + (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits); + } + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, 
int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + 
TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. 
+ for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. 
+ for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const 
int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t max_fwd_range_mult2_col[5] = { 3, 5, 7, 9, 11 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +#if 0 +const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 }, + { 3, 4, 5, 6, 0 }, + { 4, 5, 6, 7, 8 }, + { 0, 5, 6, 7, 8 }, + { 0, 0, 7, 8, + 9 } }; +#endif + +const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + const int txh_idx = get_txh_idx(cfg->tx_size); + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + if (cfg->txfm_type_col != TXFM_TYPE_INVALID) { + int stage_num_col = cfg->stage_num_col; + const int8_t *range_mult2_col = + fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + for (int i = 0; i < stage_num_col; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + } + + if (cfg->txfm_type_row != TXFM_TYPE_INVALID) { + int stage_num_row = cfg->stage_num_row; + const int8_t *range_mult2_row = + 
fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + for (int i = 0; i < stage_num_row; ++i) + cfg->stage_range_row[i] = + (max_fwd_range_mult2_col[txh_idx] + range_mult2_row[i] + 1) >> 1; + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; + const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0]; + cfg->shift = fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index 033b4ba1a..1c5bdeb25 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -10,7 +10,9 @@ */ #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" @@ -24,413 +26,6 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" -#if CONFIG_NEW_QUANT -static INLINE int quantize_coeff_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int 
tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; - q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_bigtx_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); - q = NUQ_KNOTS + - (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> - // (logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_bigtx_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - ((((int64_t)tmp - - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * - quant) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> - // (logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - 
coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, 
- dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, 
qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0], - quant_shift_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], - &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const 
dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, 
n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 -#endif // CONFIG_NEW_QUANT - void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -439,8 +34,8 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, } static void quantize_fp_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t 
*scan, const int16_t *iscan, const qm_val_t *qm_ptr, @@ -450,12 +45,45 @@ static void quantize_fp_helper_c( // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + if (qm_ptr == NULL && iqm_ptr == NULL) { + const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + { // rc == 0 + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) { + abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX); + const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale; + dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + eob = 0; + } + } + } + const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int32_t thresh1 = (int32_t)(dequant_ptr[1]); + for (i = 1; i < n_coeffs; i++) { + const int coeff = coeff_ptr[i]; + const int coeff_sign = (coeff >> 31); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if ((abs_coeff << (1 + log_scale)) >= thresh1) { + abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX); + const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale; + dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + eob = AOMMAX(iscan[i], eob); + } + } + } + } else { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
for (i = 0; i < n_coeffs; i++) { @@ -476,7 +104,8 @@ static void quantize_fp_helper_c( tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; } if (tmp32) eob = i; @@ -486,15 +115,14 @@ static void quantize_fp_helper_c( } static void highbd_quantize_fp_helper_c( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, int log_scale) { int i; int eob = -1; - const int scale = 1 << log_scale; const int shift = 16 - log_scale; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. @@ -502,10 +130,7 @@ static void highbd_quantize_fp_helper_c( (void)quant_shift_ptr; (void)iscan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { + if (qm_ptr || iqm_ptr) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
for (i = 0; i < count; i++) { @@ -517,150 +142,170 @@ static void highbd_quantize_fp_helper_c( (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const int coeff_sign = (coeff >> 31); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale); - const int abs_qcoeff = - (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale; - if (abs_qcoeff) eob = i; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + 
if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } } } *eob_ptr = eob + 1; } void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); } void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); } -#if CONFIG_TX64X64 void av1_quantize_fp_64x64_c(const 
tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); } -#endif // CONFIG_TX64X64 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr, - iqm_ptr, qparam->log_scale); + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { -#endif switch (qparam->log_scale) { case 0: if (n_coeffs < 16) { // TODO(jingning): Need SIMD implementation for smaller block size // quantization. 
quantize_fp_helper_c( - coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan, NULL, NULL, qparam->log_scale); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, NULL, NULL, 0); } else { - av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); } break; case 1: - av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; -#if CONFIG_TX64X64 case 2: - av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { 
// obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr, - iqm_ptr, qparam->log_scale); + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, + qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - switch (qparam->log_scale) { case 0: - aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; case 1: - aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; -#if CONFIG_TX64X64 case 2: - aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } static void quantize_dc(const tran_low_t *coeff_ptr, int 
n_coeffs, @@ -689,7 +334,8 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) eob = 0; } *eob_ptr = eob + 1; @@ -697,237 +343,97 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; (void)sc; - assert(qparam->log_scale >= 0 && qparam->log_scale < (2 + CONFIG_TX64X64)); -#if CONFIG_AOM_QM + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; -#else - const qm_val_t *qm_ptr = NULL; - const qm_val_t *iqm_ptr = NULL; -#endif - quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, p->quant_fp[0], - qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr, qm_ptr, iqm_ptr, - qparam->log_scale); -} - -#if CONFIG_NEW_QUANT -void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - 
pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - quantize_32x32_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - quantize_64x64_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - quantize_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - quantize_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq 
*)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); } -void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - int dq = qparam->dq; - (void)sc; - - switch (qparam->log_scale) { - case 0: - quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, dqcoeff_ptr, - eob_ptr); - break; - case 1: - quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#if CONFIG_TX64X64 - case 2: - quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} -#endif // CONFIG_NEW_QUANT - void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { highbd_quantize_fp_helper_c( - coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, p->quant_fp, - 
p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - if (n_coeffs < 16) { // TODO(jingning): Need SIMD implementation for smaller block size // quantization. - av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan, qparam->log_scale); + av1_highbd_quantize_fp_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qparam->log_scale); return; } - - av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale); -#if CONFIG_AOM_QM } -#endif } void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + 
highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, + qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - switch (qparam->log_scale) { case 0: if (LIKELY(n_coeffs >= 8)) { - aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan); + aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); } else { // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size // quantization - aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); } break; case 1: - aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; -#if CONFIG_TX64X64 case 2: - aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); 
break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } static INLINE void highbd_quantize_dc( @@ -954,7 +460,8 @@ static INLINE void highbd_quantize_dc( const int dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / (1 << log_scale); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -963,550 +470,33 @@ static INLINE void highbd_quantize_dc( void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; -#else - const qm_val_t *qm_ptr = NULL; - const qm_val_t *iqm_ptr = NULL; -#endif // CONFIG_AOM_QM - (void)sc; - highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, - p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], - eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); -} - -#if CONFIG_NEW_QUANT -static INLINE int highbd_quantize_coeff_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; - q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) 
>> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_bigtx_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - (int)(((tmp - - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * - quant) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_bigtx_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); - q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = 
scan[i]; - if (highbd_quantize_coeff_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = 
eob + 1; -} - -#if CONFIG_TX64X64 -void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); } -void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - 
-void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_32x32_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_32x32_fp_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - 
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, - qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void highbd_quantize_dc_64x64_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_64x64_fp_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, - qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void av1_highbd_quantize_b_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER 
*sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - const int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - highbd_quantize_32x32_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_64x64_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_highbd_quantize_fp_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - const int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - highbd_quantize_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq 
*)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_highbd_quantize_dc_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const int dq = qparam->dq; - (void)sc; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; - case 1: - highbd_quantize_dc_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_dc_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} -#endif // CONFIG_NEW_QUANT - -void av1_highbd_quantize_fp_c( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, int log_scale) { - 
highbd_quantize_fp_helper_c(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, - NULL, NULL, log_scale); + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { @@ -1520,8 +510,7 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { - const int quant = av1_dc_quant(q, 0, bit_depth); -#if CONFIG_HIGHBITDEPTH + const int quant = av1_dc_quant_Q3(q, 0, bit_depth); switch (bit_depth) { case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); @@ -1530,16 +519,13 @@ static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - return q == 0 ? 64 : (quant < 148 ? 84 : 80); -#endif } void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, - int uv_dc_delta_q, int uv_ac_delta_q, - QUANTS *const quants, Dequants *const deq) { - int i, q, quant; + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_Q3, quant_QTX; for (q = 0; q < QINDEX_RANGE; q++) { const int qzbin_factor = get_qzbin_factor(q, bit_depth); @@ -1547,41 +533,51 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, for (i = 0; i < 2; ++i) { int qrounding_factor_fp = 64; - // y - quant = i == 0 ? 
av1_dc_quant(q, y_dc_delta_q, bit_depth) - : av1_ac_quant(q, 0, bit_depth); - invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); - quants->y_quant_fp[q][i] = (1 << 16) / quant; - quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; - quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->y_round[q][i] = (qrounding_factor * quant) >> 7; - deq->y_dequant[q][i] = quant; - - // uv - quant = i == 0 ? av1_dc_quant(q, uv_dc_delta_q, bit_depth) - : av1_ac_quant(q, uv_ac_delta_q, bit_depth); - invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], - quant); - quants->uv_quant_fp[q][i] = (1 << 16) / quant; - quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; - quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; - deq->uv_dequant[q][i] = quant; - } - -#if CONFIG_NEW_QUANT - int dq; - for (dq = 0; dq < QUANT_PROFILES; dq++) { - for (i = 0; i < COEF_BANDS; i++) { - const int y_quant = deq->y_dequant[q][i != 0]; - const int uvquant = deq->uv_dequant[q][i != 0]; - av1_get_dequant_val_nuq(y_quant, i, deq->y_dequant_val_nuq[dq][q][i], - quants->y_cuml_bins_nuq[dq][q][i], dq); - av1_get_dequant_val_nuq(uvquant, i, deq->uv_dequant_val_nuq[dq][q][i], - quants->uv_cuml_bins_nuq[dq][q][i], dq); - } + // y quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, 0, bit_depth); + // y quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + deq->y_dequant_Q3[q][i] = quant_Q3; + + // u quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth); + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + deq->u_dequant_Q3[q][i] = quant_Q3; + + // v quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth); + // v quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth); + invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i], + quant_QTX); + quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->v_dequant_QTX[q][i] = quant_QTX; + deq->v_dequant_Q3[q][i] = quant_Q3; } -#endif // CONFIG_NEW_QUANT for (i = 2; i < 8; i++) { // 8: SIMD width quants->y_quant[q][i] = quants->y_quant[q][1]; @@ -1590,15 +586,25 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; - deq->y_dequant[q][i] = deq->y_dequant[q][1]; - - quants->uv_quant[q][i] = quants->uv_quant[q][1]; - quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; - quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; - quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; - quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; - quants->uv_round[q][i] = quants->uv_round[q][1]; - deq->uv_dequant[q][i] = deq->uv_dequant[q][1]; + deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1]; + deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1]; + + quants->u_quant[q][i] = quants->u_quant[q][1]; + quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1]; + quants->u_round_fp[q][i] = quants->u_round_fp[q][1]; + quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1]; + quants->u_zbin[q][i] = quants->u_zbin[q][1]; + quants->u_round[q][i] = quants->u_round[q][1]; + deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1]; + deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1]; + quants->v_quant[q][i] = quants->u_quant[q][1]; + quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1]; + quants->v_round_fp[q][i] = quants->v_round_fp[q][1]; + 
quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1]; + quants->v_zbin[q][i] = quants->v_zbin[q][1]; + quants->v_round[q][i] = quants->v_round[q][1]; + deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1]; + deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1]; } } } @@ -1607,8 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; QUANTS *const quants = &cpi->quants; Dequants *const dequants = &cpi->dequants; - av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->uv_dc_delta_q, - cm->uv_ac_delta_q, quants, dequants); + av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q, + cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q, + quants, dequants); } void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, @@ -1617,79 +624,68 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const QUANTS *const quants = &cpi->quants; -#if CONFIG_EXT_DELTA_Q - int current_q_index = - AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, - cpi->oxcf.deltaq_mode != NO_DELTA_Q - ? cm->base_qindex + xd->delta_qindex - : cm->base_qindex)); -#else - int current_q_index = AOMMAX( - 0, AOMMIN(QINDEX_RANGE - 1, - cm->delta_q_present_flag ? cm->base_qindex + xd->delta_qindex - : cm->base_qindex)); -#endif - const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index); + int current_qindex = AOMMAX( + 0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q + ? cm->base_qindex + xd->delta_qindex + : cm->base_qindex)); + const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex); const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); - int i; -#if CONFIG_AOM_QM - int minqm = cm->min_qmlevel; - int maxqm = cm->max_qmlevel; - // Quant matrix only depends on the base QP so there is only one set per frame int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) ? 
NUM_QM_LEVELS - 1 - : aom_get_qmlevel(cm->base_qindex, minqm, maxqm); -#endif -#if CONFIG_NEW_QUANT - int dq; -#endif + : cm->qm_y; // Y - x->plane[0].quant = quants->y_quant[qindex]; - x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - x->plane[0].round_fp = quants->y_round_fp[qindex]; - x->plane[0].quant_shift = quants->y_quant_shift[qindex]; - x->plane[0].zbin = quants->y_zbin[qindex]; - x->plane[0].round = quants->y_round[qindex]; -#if CONFIG_AOM_QM + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex]; memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0], sizeof(cm->gqmatrix[qmlevel][0])); memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0], sizeof(cm->giqmatrix[qmlevel][0])); -#endif - xd->plane[0].dequant = cpi->dequants.y_dequant[qindex]; -#if CONFIG_NEW_QUANT - for (dq = 0; dq < QUANT_PROFILES; dq++) { - x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex]; - xd->plane[0].dequant_val_nuq[dq] = - cpi->dequants.y_dequant_val_nuq[dq][qindex]; - } -#endif // CONFIG_NEW_QUANT - - // UV - for (i = 1; i < 3; i++) { - x->plane[i].quant = quants->uv_quant[qindex]; - x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - x->plane[i].round_fp = quants->uv_round_fp[qindex]; - x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; - x->plane[i].zbin = quants->uv_zbin[qindex]; - x->plane[i].round = quants->uv_round[qindex]; -#if CONFIG_AOM_QM - memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1], + xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex]; + + // U + qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) + ? 
NUM_QM_LEVELS - 1 + : cm->qm_u; + { + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex]; + memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1], sizeof(cm->gqmatrix[qmlevel][1])); - memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], + memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], sizeof(cm->giqmatrix[qmlevel][1])); -#endif - xd->plane[i].dequant = cpi->dequants.uv_dequant[qindex]; -#if CONFIG_NEW_QUANT - for (dq = 0; dq < QUANT_PROFILES; dq++) { - x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex]; - xd->plane[i].dequant_val_nuq[dq] = - cpi->dequants.uv_dequant_val_nuq[dq][qindex]; - } -#endif // CONFIG_NEW_QUANT + x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex]; + xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex]; + } + // V + qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) + ? 
NUM_QM_LEVELS - 1 + : cm->qm_v; + { + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex]; + memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2], + sizeof(cm->gqmatrix[qmlevel][2])); + memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2], + sizeof(cm->giqmatrix[qmlevel][2])); + x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex]; + xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex]; } - x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->qindex = qindex; @@ -1701,16 +697,27 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, void av1_frame_init_quantizer(AV1_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); } void av1_set_quantizer(AV1_COMMON *cm, int q) { // quantizer has to be reinitialized with av1_init_quantizer() if any // delta_q changes. 
- cm->base_qindex = q; + cm->base_qindex = AOMMAX(cm->delta_q_present_flag, q); cm->y_dc_delta_q = 0; - cm->uv_dc_delta_q = 0; - cm->uv_ac_delta_q = 0; + cm->u_dc_delta_q = 0; + cm->u_ac_delta_q = 0; + cm->v_dc_delta_q = 0; + cm->v_ac_delta_q = 0; + cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel); + cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q, + cm->min_qmlevel, cm->max_qmlevel); + + if (!cm->separate_uv_delta_q) + cm->qm_v = cm->qm_u; + else + cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q, + cm->min_qmlevel, cm->max_qmlevel); } // Table that converts 0-63 Q-range values passed in outside to the Qindex diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index e5fc8b528..eaf8374de 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_QUANTIZE_H_ #define AV1_ENCODER_QUANTIZE_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/quant_common.h" #include "av1/common/scan.h" #include "av1/encoder/block.h" @@ -23,33 +24,22 @@ extern "C" { typedef struct QUANT_PARAM { int log_scale; -#if CONFIG_NEW_QUANT TX_SIZE tx_size; - int dq; -#endif // CONFIG_NEW_QUANT -#if CONFIG_AOM_QM const qm_val_t *qmatrix; const qm_val_t *iqmatrix; -#endif // CONFIG_AOM_QM } QUANT_PARAM; typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. 
typedef struct { -#if CONFIG_NEW_QUANT - DECLARE_ALIGNED( - 16, tran_low_t, - y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); - DECLARE_ALIGNED( - 16, tran_low_t, - uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); -#endif // CONFIG_NEW_QUANT // 0: dc 1: ac 2-8: ac repeated to SIMD width DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); @@ -59,25 +49,36 @@ typedef struct { // TODO(jingning): in progress of re-working the quantization. will decide // if we want to deprecate the current use of y_quant. DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); - - DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); } QUANTS; +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. 
+// Fields are sufffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. typedef struct { - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width -#if CONFIG_NEW_QUANT - DECLARE_ALIGNED(16, dequant_val_type_nuq, - y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); - DECLARE_ALIGNED(16, dequant_val_type_nuq, - uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); -#endif // CONFIG_NEW_QUANT + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width } Dequants; struct AV1_COMP; @@ -89,8 +90,9 @@ void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, int segment_id); void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, - int uv_dc_delta_q, int uv_ac_delta_q, - QUANTS *const quants, Dequants *const deq); + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); void av1_init_quantizer(struct AV1_COMP *cpi); @@ -105,51 +107,22 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, 
- const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); -#if CONFIG_NEW_QUANT -void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, const QUANT_PARAM *qparam); - -void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); -#endif // CONFIG_NEW_QUANT - void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); @@ -157,7 +130,6 @@ void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); @@ -165,31 +137,10 @@ void av1_highbd_quantize_b_facade(const 
tran_low_t *coeff_ptr, void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); -#if CONFIG_NEW_QUANT -void av1_highbd_quantize_fp_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_highbd_quantize_b_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_highbd_quantize_dc_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); -#endif // CONFIG_NEW_QUANT - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/bgsprite.c b/third_party/aom/av1/encoder/bgsprite.c deleted file mode 100644 index ae2cb1d40..000000000 --- a/third_party/aom/av1/encoder/bgsprite.c +++ /dev/null @@ -1,1257 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#define _POSIX_C_SOURCE 200112L // rand_r() -#include -#include -#include -#include -#include -#include - -#include "av1/encoder/bgsprite.h" - -#include "aom_mem/aom_mem.h" -#include "./aom_scale_rtcd.h" -#include "av1/common/mv.h" -#include "av1/common/warped_motion.h" -#include "av1/encoder/encoder.h" -#include "av1/encoder/global_motion.h" -#include "av1/encoder/mathutils.h" -#include "av1/encoder/temporal_filter.h" - -/* Blending Modes: - * 0 = Median - * 1 = Mean - */ -#define BGSPRITE_BLENDING_MODE 1 - -// Enable removal of outliers from mean blending mode. -#if BGSPRITE_BLENDING_MODE == 1 -#define BGSPRITE_MEAN_REMOVE_OUTLIERS 0 -#endif // BGSPRITE_BLENDING_MODE == 1 - -/* Interpolation for panorama alignment sampling: - * 0 = Nearest neighbor - * 1 = Bilinear - */ -#define BGSPRITE_INTERPOLATION 0 - -// Enable turning off bgsprite from firstpass metrics in define_gf_group. -#define BGSPRITE_ENABLE_METRICS 1 - -// Enable foreground/backgrond segmentation and combine with temporal filter. -#define BGSPRITE_ENABLE_SEGMENTATION 1 - -// Enable alignment using global motion. -#define BGSPRITE_ENABLE_GME 0 - -// Block size for foreground mask. -#define BGSPRITE_MASK_BLOCK_SIZE 4 - -typedef struct { -#if CONFIG_HIGHBITDEPTH - uint16_t y; - uint16_t u; - uint16_t v; -#else - uint8_t y; - uint8_t u; - uint8_t v; -#endif // CONFIG_HIGHBITDEPTH - uint8_t exists; -} YuvPixel; - -typedef struct { - int curr_model; - double mean[2]; - double var[2]; - int age[2]; - double u_mean[2]; - double v_mean[2]; - -#if CONFIG_HIGHBITDEPTH - uint16_t y; - uint16_t u; - uint16_t v; -#else - uint8_t y; - uint8_t u; - uint8_t v; -#endif // CONFIG_HIGHBITDEPTH - double final_var; -} YuvPixelGaussian; - -// Maps to convert from matrix form to param vector form. -static const int params_to_matrix_map[] = { 2, 3, 0, 4, 5, 1, 6, 7 }; -static const int matrix_to_params_map[] = { 2, 5, 0, 1, 3, 4, 6, 7 }; - -// Convert the parameter array to a 3x3 matrix form. 
-static void params_to_matrix(const double *const params, double *target) { - for (int i = 0; i < MAX_PARAMDIM - 1; i++) { - assert(params_to_matrix_map[i] < MAX_PARAMDIM - 1); - target[i] = params[params_to_matrix_map[i]]; - } - target[8] = 1; -} - -// Convert a 3x3 matrix to a parameter array form. -static void matrix_to_params(const double *const matrix, double *target) { - for (int i = 0; i < MAX_PARAMDIM - 1; i++) { - assert(matrix_to_params_map[i] < MAX_PARAMDIM - 1); - target[i] = matrix[matrix_to_params_map[i]]; - } -} - -#define TRANSFORM_MAT_DIM 3 - -// Do matrix multiplication on params. -static void multiply_params(double *const m1, double *const m2, - double *target) { - double m1_matrix[MAX_PARAMDIM]; - double m2_matrix[MAX_PARAMDIM]; - double result[MAX_PARAMDIM]; - - params_to_matrix(m1, m1_matrix); - params_to_matrix(m2, m2_matrix); - multiply_mat(m2_matrix, m1_matrix, result, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, TRANSFORM_MAT_DIM); - matrix_to_params(result, target); -} - -// Finds x and y limits of a single transformed image. -// Width and height are the size of the input video. -static void find_frame_limit(int width, int height, - const double *const transform, int *x_min, - int *x_max, int *y_min, int *y_max) { - double transform_matrix[MAX_PARAMDIM]; - double xy_matrix[3] = { 0, 0, 1 }; - double uv_matrix[3] = { 0 }; -// Macro used to update frame limits based on transformed coordinates. 
-#define UPDATELIMITS(u, v, x_min, x_max, y_min, y_max) \ - { \ - if ((int)ceil(u) > *x_max) { \ - *x_max = (int)ceil(u); \ - } \ - if ((int)floor(u) < *x_min) { \ - *x_min = (int)floor(u); \ - } \ - if ((int)ceil(v) > *y_max) { \ - *y_max = (int)ceil(v); \ - } \ - if ((int)floor(v) < *y_min) { \ - *y_min = (int)floor(v); \ - } \ - } - - params_to_matrix(transform, transform_matrix); - xy_matrix[0] = 0; - xy_matrix[1] = 0; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - *x_max = (int)ceil(uv_matrix[0]); - *x_min = (int)floor(uv_matrix[0]); - *y_max = (int)ceil(uv_matrix[1]); - *y_min = (int)floor(uv_matrix[1]); - - xy_matrix[0] = width - 1; - xy_matrix[1] = 0; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - - xy_matrix[0] = width - 1; - xy_matrix[1] = height - 1; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - - xy_matrix[0] = 0; - xy_matrix[1] = height - 1; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - -#undef UPDATELIMITS -} - -// Finds x and y limits for arrays. 
Also finds the overall max and minimums -static void find_limits(int width, int height, const double **const params, - int num_frames, int *x_min, int *x_max, int *y_min, - int *y_max, int *pano_x_min, int *pano_x_max, - int *pano_y_min, int *pano_y_max) { - *pano_x_max = INT_MIN; - *pano_x_min = INT_MAX; - *pano_y_max = INT_MIN; - *pano_y_min = INT_MAX; - for (int i = 0; i < num_frames; ++i) { - find_frame_limit(width, height, (const double *const)params[i], &x_min[i], - &x_max[i], &y_min[i], &y_max[i]); - if (x_max[i] > *pano_x_max) { - *pano_x_max = x_max[i]; - } - if (x_min[i] < *pano_x_min) { - *pano_x_min = x_min[i]; - } - if (y_max[i] > *pano_y_max) { - *pano_y_max = y_max[i]; - } - if (y_min[i] < *pano_y_min) { - *pano_y_min = y_min[i]; - } - } -} - -// Inverts a 3x3 matrix that is in the parameter form. -static void invert_params(const double *const params, double *target) { - double temp[MAX_PARAMDIM] = { 0 }; - params_to_matrix(params, temp); - - // Find determinant of matrix (expansion by minors). - const double det = temp[0] * ((temp[4] * temp[8]) - (temp[5] * temp[7])) - - temp[1] * ((temp[3] * temp[8]) - (temp[5] * temp[6])) + - temp[2] * ((temp[3] * temp[7]) - (temp[4] * temp[6])); - assert(det != 0); - - // inverse is transpose of cofactor * 1/det. 
- double inverse[MAX_PARAMDIM] = { 0 }; - inverse[0] = (temp[4] * temp[8] - temp[7] * temp[5]) / det; - inverse[1] = (temp[2] * temp[7] - temp[1] * temp[8]) / det; - inverse[2] = (temp[1] * temp[5] - temp[2] * temp[4]) / det; - inverse[3] = (temp[5] * temp[6] - temp[3] * temp[8]) / det; - inverse[4] = (temp[0] * temp[8] - temp[2] * temp[6]) / det; - inverse[5] = (temp[3] * temp[2] - temp[0] * temp[5]) / det; - inverse[6] = (temp[3] * temp[7] - temp[6] * temp[4]) / det; - inverse[7] = (temp[6] * temp[1] - temp[0] * temp[7]) / det; - inverse[8] = (temp[0] * temp[4] - temp[3] * temp[1]) / det; - - matrix_to_params(inverse, target); -} - -static void build_image_stack(YV12_BUFFER_CONFIG **const frames, - const int num_frames, const double **const params, - const int *const x_min, const int *const x_max, - const int *const y_min, const int *const y_max, - int pano_x_min, int pano_y_min, - YuvPixel ***img_stack) { - // Re-sample images onto panorama (pre-filtering). - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - const int frame_width = frames[0]->y_width; - const int frame_height = frames[0]->y_height; - for (int i = 0; i < num_frames; ++i) { - // Find transforms from panorama coordinate system back to single image - // coordinate system for sampling. - int transformed_width = x_max[i] - x_min[i] + 1; - int transformed_height = y_max[i] - y_min[i] + 1; - - double transform_matrix[MAX_PARAMDIM]; - double transform_params[MAX_PARAMDIM - 1]; - invert_params(params[i], transform_params); - params_to_matrix(transform_params, transform_matrix); - -#if CONFIG_HIGHBITDEPTH - const uint16_t *y_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->y_buffer); - const uint16_t *u_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->u_buffer); - const uint16_t *v_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->v_buffer); -#endif // CONFIG_HIGHBITDEPTH - - for (int y = 0; y < transformed_height; ++y) { - for (int x = 0; x < transformed_width; ++x) { - // Do transform. 
- double xy_matrix[3] = { x + x_min[i], y + y_min[i], 1 }; - double uv_matrix[3] = { 0 }; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - - // Coordinates used for nearest neighbor interpolation. - int image_x = (int)round(uv_matrix[0]); - int image_y = (int)round(uv_matrix[1]); - - // Temporary values for bilinear interpolation - double interpolated_yvalue = 0.0; - double interpolated_uvalue = 0.0; - double interpolated_vvalue = 0.0; - double interpolated_fraction = 0.0; - int interpolation_count = 0; - -#if BGSPRITE_INTERPOLATION == 1 - // Coordintes used for bilinear interpolation. - double x_base; - double y_base; - double x_decimal = modf(uv_matrix[0], &x_base); - double y_decimal = modf(uv_matrix[1], &y_base); - - if ((x_decimal > 0.2 && x_decimal < 0.8) || - (y_decimal > 0.2 && y_decimal < 0.8)) { - for (int u = 0; u < 2; ++u) { - for (int v = 0; v < 2; ++v) { - int interp_x = (int)x_base + u; - int interp_y = (int)y_base + v; - if (interp_x >= 0 && interp_x < frame_width && interp_y >= 0 && - interp_y < frame_height) { - interpolation_count++; - - interpolated_fraction += - fabs(u - x_decimal) * fabs(v - y_decimal); - int ychannel_idx = interp_y * frames[i]->y_stride + interp_x; - int uvchannel_idx = (interp_y >> frames[i]->subsampling_y) * - frames[i]->uv_stride + - (interp_x >> frames[i]->subsampling_x); -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - interpolated_yvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - y_buffer16[ychannel_idx]; - interpolated_uvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - u_buffer16[uvchannel_idx]; - interpolated_vvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - interpolated_yvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->y_buffer[ychannel_idx]; - interpolated_uvalue += (1 - fabs(u 
- x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->u_buffer[uvchannel_idx]; - interpolated_vvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } -#endif // BGSPRITE_INTERPOLATION == 1 - - if (BGSPRITE_INTERPOLATION && interpolation_count > 2) { - if (interpolation_count != 4) { - interpolated_yvalue /= interpolated_fraction; - interpolated_uvalue /= interpolated_fraction; - interpolated_vvalue /= interpolated_fraction; - } - int pano_x = x + x_min[i] + x_offset; - int pano_y = y + y_min[i] + y_offset; - -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - img_stack[pano_y][pano_x][i].y = (uint16_t)interpolated_yvalue; - img_stack[pano_y][pano_x][i].u = (uint16_t)interpolated_uvalue; - img_stack[pano_y][pano_x][i].v = (uint16_t)interpolated_vvalue; - img_stack[pano_y][pano_x][i].exists = 1; - } else { -#endif // CONFIG_HIGHBITDEPTH - img_stack[pano_y][pano_x][i].y = (uint8_t)interpolated_yvalue; - img_stack[pano_y][pano_x][i].u = (uint8_t)interpolated_uvalue; - img_stack[pano_y][pano_x][i].v = (uint8_t)interpolated_vvalue; - img_stack[pano_y][pano_x][i].exists = 1; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else if (image_x >= 0 && image_x < frame_width && image_y >= 0 && - image_y < frame_height) { - // Place in panorama stack. 
- int pano_x = x + x_min[i] + x_offset; - int pano_y = y + y_min[i] + y_offset; - - int ychannel_idx = image_y * frames[i]->y_stride + image_x; - int uvchannel_idx = - (image_y >> frames[i]->subsampling_y) * frames[i]->uv_stride + - (image_x >> frames[i]->subsampling_x); -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - img_stack[pano_y][pano_x][i].y = y_buffer16[ychannel_idx]; - img_stack[pano_y][pano_x][i].u = u_buffer16[uvchannel_idx]; - img_stack[pano_y][pano_x][i].v = v_buffer16[uvchannel_idx]; - img_stack[pano_y][pano_x][i].exists = 1; - } else { -#endif // CONFIG_HIGHBITDEPTH - img_stack[pano_y][pano_x][i].y = frames[i]->y_buffer[ychannel_idx]; - img_stack[pano_y][pano_x][i].u = frames[i]->u_buffer[uvchannel_idx]; - img_stack[pano_y][pano_x][i].v = frames[i]->v_buffer[uvchannel_idx]; - img_stack[pano_y][pano_x][i].exists = 1; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } -} - -#if BGSPRITE_BLENDING_MODE == 0 -// swaps two YuvPixels. -static void swap_yuv(YuvPixel *a, YuvPixel *b) { - const YuvPixel temp = *b; - *b = *a; - *a = temp; -} - -// Partitions array to find pivot index in qselect. -static int partition(YuvPixel arr[], int left, int right, int pivot_idx) { - YuvPixel pivot = arr[pivot_idx]; - - // Move pivot to the end. - swap_yuv(&arr[pivot_idx], &arr[right]); - - int p_idx = left; - for (int i = left; i < right; ++i) { - if (arr[i].y <= pivot.y) { - swap_yuv(&arr[i], &arr[p_idx]); - p_idx++; - } - } - - swap_yuv(&arr[p_idx], &arr[right]); - - return p_idx; -} - -// Returns the kth element in array, partially sorted in place (quickselect). 
-static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) { - if (left >= right) { - return arr[left]; - } - unsigned int seed = (int)time(NULL); - int pivot_idx = left + rand_r(&seed) % (right - left + 1); - pivot_idx = partition(arr, left, right, pivot_idx); - - if (k == pivot_idx) { - return arr[k]; - } else if (k < pivot_idx) { - return qselect(arr, left, pivot_idx - 1, k); - } else { - return qselect(arr, pivot_idx + 1, right, k); - } -} - -// Blends image stack together using a temporal median. -static void blend_median(const int width, const int height, - const int num_frames, const YuvPixel ***image_stack, - YuvPixel **blended_img) { - // Allocate stack of pixels - YuvPixel *pixel_stack = aom_calloc(num_frames, sizeof(*pixel_stack)); - - // Apply median filtering using quickselect. - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - int count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - pixel_stack[count] = image_stack[y][x][i]; - ++count; - } - } - if (count == 0) { - // Just make the pixel black. - // TODO(toddnguyen): Color the pixel with nearest neighbor - blended_img[y][x].exists = 0; - } else { - const int median_idx = (int)floor(count / 2); - YuvPixel median = qselect(pixel_stack, 0, count - 1, median_idx); - - // Make the median value the 0th index for UV subsampling later - blended_img[y][x] = median; - blended_img[y][x].exists = 1; - } - } - } - - aom_free(pixel_stack); -} -#endif // BGSPRITE_BLENDING_MODE == 0 - -#if BGSPRITE_BLENDING_MODE == 1 -// Blends image stack together using a temporal mean. 
-static void blend_mean(const int width, const int height, const int num_frames, - const YuvPixel ***image_stack, YuvPixel **blended_img, - int highbitdepth) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - // Find - uint32_t y_sum = 0; - uint32_t u_sum = 0; - uint32_t v_sum = 0; - uint32_t count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - y_sum += image_stack[y][x][i].y; - u_sum += image_stack[y][x][i].u; - v_sum += image_stack[y][x][i].v; - ++count; - } - } - -#if BGSPRITE_MEAN_REMOVE_OUTLIERS - if (count > 1) { - double stdev = 0; - double y_mean = (double)y_sum / count; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - stdev += pow(y_mean - image_stack[y][x][i].y, 2); - } - } - stdev = sqrt(stdev / count); - - uint32_t inlier_y_sum = 0; - uint32_t inlier_u_sum = 0; - uint32_t inlier_v_sum = 0; - uint32_t inlier_count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists && - fabs(image_stack[y][x][i].y - y_mean) <= 1.5 * stdev) { - inlier_y_sum += image_stack[y][x][i].y; - inlier_u_sum += image_stack[y][x][i].u; - inlier_v_sum += image_stack[y][x][i].v; - ++inlier_count; - } - } - count = inlier_count; - y_sum = inlier_y_sum; - u_sum = inlier_u_sum; - v_sum = inlier_v_sum; - } -#endif // BGSPRITE_MEAN_REMOVE_OUTLIERS - - if (count != 0) { - blended_img[y][x].exists = 1; -#if CONFIG_HIGHBITDEPTH - if (highbitdepth) { - blended_img[y][x].y = (uint16_t)OD_DIVU(y_sum, count); - blended_img[y][x].u = (uint16_t)OD_DIVU(u_sum, count); - blended_img[y][x].v = (uint16_t)OD_DIVU(v_sum, count); - } else { -#endif // CONFIG_HIGHBITDEPTH - (void)highbitdepth; - blended_img[y][x].y = (uint8_t)OD_DIVU(y_sum, count); - blended_img[y][x].u = (uint8_t)OD_DIVU(u_sum, count); - blended_img[y][x].v = (uint8_t)OD_DIVU(v_sum, count); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else { - blended_img[y][x].exists = 0; - } - } - } -} -#endif // 
BGSPRITE_BLENDING_MODE == 1 - -#if BGSPRITE_ENABLE_SEGMENTATION -// Builds dual-mode single gaussian model from image stack. -static void build_gaussian(const YuvPixel ***image_stack, const int num_frames, - const int width, const int height, - const int x_block_width, const int y_block_height, - const int block_size, YuvPixelGaussian **gauss) { - const double initial_variance = 10.0; - const double s_theta = 2.0; - - // Add images to dual-mode single gaussian model - for (int y_block = 0; y_block < y_block_height; ++y_block) { - for (int x_block = 0; x_block < x_block_width; ++x_block) { - // Process all blocks. - YuvPixelGaussian *model = &gauss[y_block][x_block]; - - // Process all frames. - for (int i = 0; i < num_frames; ++i) { - // Add block to the Gaussian model. - double max_variance[2] = { 0.0, 0.0 }; - double temp_y_mean = 0.0; - double temp_u_mean = 0.0; - double temp_v_mean = 0.0; - - // Find mean/variance of a block of pixels. - int temp_count = 0; - for (int sub_y = 0; sub_y < block_size; ++sub_y) { - for (int sub_x = 0; sub_x < block_size; ++sub_x) { - const int y = y_block * block_size + sub_y; - const int x = x_block * block_size + sub_x; - if (y < height && x < width && image_stack[y][x][i].exists) { - ++temp_count; - temp_y_mean += (double)image_stack[y][x][i].y; - temp_u_mean += (double)image_stack[y][x][i].u; - temp_v_mean += (double)image_stack[y][x][i].v; - - const double variance_0 = - pow((double)image_stack[y][x][i].y - model->mean[0], 2); - const double variance_1 = - pow((double)image_stack[y][x][i].y - model->mean[1], 2); - - if (variance_0 > max_variance[0]) { - max_variance[0] = variance_0; - } - if (variance_1 > max_variance[1]) { - max_variance[1] = variance_1; - } - } - } - } - - // If pixels exist in the block, add to the model. 
- if (temp_count > 0) { - assert(temp_count <= block_size * block_size); - temp_y_mean /= temp_count; - temp_u_mean /= temp_count; - temp_v_mean /= temp_count; - - // Switch the background model to the oldest model. - if (model->age[0] > model->age[1]) { - model->curr_model = 0; - } else if (model->age[1] > model->age[0]) { - model->curr_model = 1; - } - - // If model is empty, initialize model. - if (model->age[model->curr_model] == 0) { - model->mean[model->curr_model] = temp_y_mean; - model->u_mean[model->curr_model] = temp_u_mean; - model->v_mean[model->curr_model] = temp_v_mean; - model->var[model->curr_model] = initial_variance; - model->age[model->curr_model] = 1; - } else { - // Constants for current model and foreground model (0 or 1). - const int opposite = 1 - model->curr_model; - const int current = model->curr_model; - const double j = i; - - // Put block into the appropriate model. - if (pow(temp_y_mean - model->mean[current], 2) < - s_theta * model->var[current]) { - // Add block to the current background model - model->age[current] += 1; - const double prev_weight = 1 / j; - const double curr_weight = (j - 1) / j; - model->mean[current] = prev_weight * model->mean[current] + - curr_weight * temp_y_mean; - model->u_mean[current] = prev_weight * model->u_mean[current] + - curr_weight * temp_u_mean; - model->v_mean[current] = prev_weight * model->v_mean[current] + - curr_weight * temp_v_mean; - model->var[current] = prev_weight * model->var[current] + - curr_weight * max_variance[current]; - } else { - // Block does not fit into current background candidate. Add to - // foreground candidate and reinitialize if necessary. 
- const double var_fg = pow(temp_y_mean - model->mean[opposite], 2); - - if (var_fg <= s_theta * model->var[opposite]) { - model->age[opposite] += 1; - const double prev_weight = 1 / j; - const double curr_weight = (j - 1) / j; - model->mean[opposite] = prev_weight * model->mean[opposite] + - curr_weight * temp_y_mean; - model->u_mean[opposite] = - prev_weight * model->u_mean[opposite] + - curr_weight * temp_u_mean; - model->v_mean[opposite] = - prev_weight * model->v_mean[opposite] + - curr_weight * temp_v_mean; - model->var[opposite] = prev_weight * model->var[opposite] + - curr_weight * max_variance[opposite]; - } else if (model->age[opposite] == 0 || - var_fg > s_theta * model->var[opposite]) { - model->mean[opposite] = temp_y_mean; - model->u_mean[opposite] = temp_u_mean; - model->v_mean[opposite] = temp_v_mean; - model->var[opposite] = initial_variance; - model->age[opposite] = 1; - } else { - // This case should never happen. - assert(0); - } - } - } - } - } - - // Select the oldest candidate as the background model. - if (model->age[0] == 0 && model->age[1] == 0) { - model->y = 0; - model->u = 0; - model->v = 0; - model->final_var = 0; - } else if (model->age[0] > model->age[1]) { - model->y = (uint8_t)model->mean[0]; - model->u = (uint8_t)model->u_mean[0]; - model->v = (uint8_t)model->v_mean[0]; - model->final_var = model->var[0]; - } else { - model->y = (uint8_t)model->mean[1]; - model->u = (uint8_t)model->u_mean[1]; - model->v = (uint8_t)model->v_mean[1]; - model->final_var = model->var[1]; - } - } - } -} - -// Builds foreground mask based on reference image and gaussian model. -// In mask[][], 1 is foreground and 0 is background. 
-static void build_mask(const int x_min, const int y_min, const int x_offset, - const int y_offset, const int x_block_width, - const int y_block_height, const int block_size, - const YuvPixelGaussian **gauss, - YV12_BUFFER_CONFIG *const reference, - YV12_BUFFER_CONFIG *const panorama, uint8_t **mask) { - const int crop_x_offset = x_min + x_offset; - const int crop_y_offset = y_min + y_offset; - const double d_theta = 4.0; - - for (int y_block = 0; y_block < y_block_height; ++y_block) { - for (int x_block = 0; x_block < x_block_width; ++x_block) { - // Create mask to determine if ARF is background for foreground. - const YuvPixelGaussian *model = &gauss[y_block][x_block]; - double temp_y_mean = 0.0; - int temp_count = 0; - - for (int sub_y = 0; sub_y < block_size; ++sub_y) { - for (int sub_x = 0; sub_x < block_size; ++sub_x) { - // x and y are panorama coordinates. - const int y = y_block * block_size + sub_y; - const int x = x_block * block_size + sub_x; - - const int arf_y = y - crop_y_offset; - const int arf_x = x - crop_x_offset; - - if (arf_y >= 0 && arf_y < panorama->y_height && arf_x >= 0 && - arf_x < panorama->y_width) { - ++temp_count; - const int ychannel_idx = arf_y * panorama->y_stride + arf_x; - temp_y_mean += (double)reference->y_buffer[ychannel_idx]; - } - } - } - if (temp_count > 0) { - assert(temp_count <= block_size * block_size); - temp_y_mean /= temp_count; - - if (pow(temp_y_mean - model->y, 2) > model->final_var * d_theta) { - // Mark block as foreground. - mask[y_block][x_block] = 1; - } - } - } - } -} -#endif // BGSPRITE_ENABLE_SEGMENTATION - -// Resamples blended_img into panorama, including UV subsampling. 
-static void resample_panorama(YuvPixel **blended_img, const int center_idx, - const int *const x_min, const int *const y_min, - int pano_x_min, int pano_x_max, int pano_y_min, - int pano_y_max, YV12_BUFFER_CONFIG *panorama) { - const int width = pano_x_max - pano_x_min + 1; - const int height = pano_y_max - pano_y_min + 1; - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - const int crop_x_offset = x_min[center_idx] + x_offset; - const int crop_y_offset = y_min[center_idx] + y_offset; -#if CONFIG_HIGHBITDEPTH - if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) { - // Use median Y value. - uint16_t *pano_y_buffer16 = CONVERT_TO_SHORTPTR(panorama->y_buffer); - uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer); - uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer); - - for (int y = 0; y < panorama->y_height; ++y) { - for (int x = 0; x < panorama->y_width; ++x) { - const int ychannel_idx = y * panorama->y_stride + x; - if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) { - pano_y_buffer16[ychannel_idx] = - blended_img[y + crop_y_offset][x + crop_x_offset].y; - } else { - pano_y_buffer16[ychannel_idx] = 0; - } - } - } - - // UV subsampling with median UV values - for (int y = 0; y < panorama->uv_height; ++y) { - for (int x = 0; x < panorama->uv_width; ++x) { - uint32_t avg_count = 0; - uint32_t u_sum = 0; - uint32_t v_sum = 0; - - // Look at surrounding pixels for subsampling - for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { - for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { - int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; - int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; - if (y_sample > 0 && y_sample < height && x_sample > 0 && - x_sample < width && blended_img[y_sample][x_sample].exists) { - u_sum += blended_img[y_sample][x_sample].u; - v_sum += blended_img[y_sample][x_sample].v; - avg_count++; - } - } - } - - const int uvchannel_idx 
= y * panorama->uv_stride + x; - if (avg_count != 0) { - pano_u_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(u_sum, avg_count); - pano_v_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(v_sum, avg_count); - } else { - pano_u_buffer16[uvchannel_idx] = 0; - pano_v_buffer16[uvchannel_idx] = 0; - } - } - } - } else { -#endif // CONFIG_HIGHBITDEPTH - // Use blended Y value. - for (int y = 0; y < panorama->y_height; ++y) { - for (int x = 0; x < panorama->y_width; ++x) { - const int ychannel_idx = y * panorama->y_stride + x; - // Use filtered background. - if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) { - panorama->y_buffer[ychannel_idx] = - blended_img[y + crop_y_offset][x + crop_x_offset].y; - } else { - panorama->y_buffer[ychannel_idx] = 0; - } - } - } - - // UV subsampling with blended UV values. - for (int y = 0; y < panorama->uv_height; ++y) { - for (int x = 0; x < panorama->uv_width; ++x) { - uint16_t avg_count = 0; - uint16_t u_sum = 0; - uint16_t v_sum = 0; - - // Look at surrounding pixels for subsampling. 
- for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { - for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { - int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; - int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; - if (y_sample > 0 && y_sample < height && x_sample > 0 && - x_sample < width && blended_img[y_sample][x_sample].exists) { - u_sum += blended_img[y_sample][x_sample].u; - v_sum += blended_img[y_sample][x_sample].v; - avg_count++; - } - } - } - - const int uvchannel_idx = y * panorama->uv_stride + x; - if (avg_count != 0) { - panorama->u_buffer[uvchannel_idx] = - (uint8_t)OD_DIVU(u_sum, avg_count); - panorama->v_buffer[uvchannel_idx] = - (uint8_t)OD_DIVU(v_sum, avg_count); - } else { - panorama->u_buffer[uvchannel_idx] = 0; - panorama->v_buffer[uvchannel_idx] = 0; - } - } - } -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -} - -#if BGSPRITE_ENABLE_SEGMENTATION -// Combines temporal filter output and bgsprite output to make final ARF output -static void combine_arf(YV12_BUFFER_CONFIG *const temporal_arf, - YV12_BUFFER_CONFIG *const bgsprite, - uint8_t **const mask, const int block_size, - const int x_offset, const int y_offset, - YV12_BUFFER_CONFIG *target) { - const int height = temporal_arf->y_height; - const int width = temporal_arf->y_width; - - YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img)); - for (int i = 0; i < height; ++i) { - blended_img[i] = aom_malloc(width * sizeof(**blended_img)); - } - - const int block_2_height = (height / BGSPRITE_MASK_BLOCK_SIZE) + - (height % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0); - const int block_2_width = (width / BGSPRITE_MASK_BLOCK_SIZE) + - (width % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 
1 : 0); - - for (int block_y = 0; block_y < block_2_height; ++block_y) { - for (int block_x = 0; block_x < block_2_width; ++block_x) { - int count = 0; - int total = 0; - for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) { - for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) { - const int img_y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y; - const int img_x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x; - const int mask_y = (y_offset + img_y) / block_size; - const int mask_x = (x_offset + img_x) / block_size; - - if (img_y < height && img_x < width) { - if (mask[mask_y][mask_x]) { - ++count; - } - ++total; - } - } - } - - const double threshold = 0.30; - const int amount = (int)(threshold * total); - for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) { - for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) { - const int y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y; - const int x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x; - if (y < height && x < width) { - blended_img[y][x].exists = 1; - const int ychannel_idx = y * temporal_arf->y_stride + x; - const int uvchannel_idx = - (y >> temporal_arf->subsampling_y) * temporal_arf->uv_stride + - (x >> temporal_arf->subsampling_x); - - if (count > amount) { -// Foreground; use temporal arf. 
-#if CONFIG_HIGHBITDEPTH - if (temporal_arf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *pano_y_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->y_buffer); - uint16_t *pano_u_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->u_buffer); - uint16_t *pano_v_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->v_buffer); - blended_img[y][x].y = pano_y_buffer16[ychannel_idx]; - blended_img[y][x].u = pano_u_buffer16[uvchannel_idx]; - blended_img[y][x].v = pano_v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - blended_img[y][x].y = temporal_arf->y_buffer[ychannel_idx]; - blended_img[y][x].u = temporal_arf->u_buffer[uvchannel_idx]; - blended_img[y][x].v = temporal_arf->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else { -// Background; use bgsprite arf. -#if CONFIG_HIGHBITDEPTH - if (bgsprite->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *pano_y_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->y_buffer); - uint16_t *pano_u_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->u_buffer); - uint16_t *pano_v_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->v_buffer); - blended_img[y][x].y = pano_y_buffer16[ychannel_idx]; - blended_img[y][x].u = pano_u_buffer16[uvchannel_idx]; - blended_img[y][x].v = pano_v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - blended_img[y][x].y = bgsprite->y_buffer[ychannel_idx]; - blended_img[y][x].u = bgsprite->u_buffer[uvchannel_idx]; - blended_img[y][x].v = bgsprite->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } - } - } - - const int x_min = 0; - const int y_min = 0; - resample_panorama(blended_img, 0, &x_min, &y_min, 0, width - 1, 0, height - 1, - target); - - for (int i = 0; i < height; ++i) { - aom_free(blended_img[i]); - } - aom_free(blended_img); -} -#endif // BGSPRITE_ENABLE_SEGMENTATION - -// Stitches images together to create ARF and stores it in 'panorama'. 
-static void stitch_images(AV1_COMP *cpi, YV12_BUFFER_CONFIG **const frames, - const int num_frames, const int distance, - const int center_idx, const double **const params, - const int *const x_min, const int *const x_max, - const int *const y_min, const int *const y_max, - int pano_x_min, int pano_x_max, int pano_y_min, - int pano_y_max, YV12_BUFFER_CONFIG *panorama) { - const int width = pano_x_max - pano_x_min + 1; - const int height = pano_y_max - pano_y_min + 1; - - // Create pano_stack[y][x][num_frames] stack of pixel values - YuvPixel ***pano_stack = aom_malloc(height * sizeof(*pano_stack)); - for (int i = 0; i < height; ++i) { - pano_stack[i] = aom_malloc(width * sizeof(**pano_stack)); - for (int j = 0; j < width; ++j) { - pano_stack[i][j] = aom_calloc(num_frames, sizeof(***pano_stack)); - } - } - - build_image_stack(frames, num_frames, params, x_min, x_max, y_min, y_max, - pano_x_min, pano_y_min, pano_stack); - - // Create blended_img[y][x] of combined panorama pixel values. - YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img)); - for (int i = 0; i < height; ++i) { - blended_img[i] = aom_malloc(width * sizeof(**blended_img)); - } - -// Blending and saving result in blended_img. -#if BGSPRITE_BLENDING_MODE == 1 - blend_mean(width, height, num_frames, (const YuvPixel ***)pano_stack, - blended_img, panorama->flags & YV12_FLAG_HIGHBITDEPTH); -#else // BGSPRITE_BLENDING_MODE != 1 - blend_median(width, height, num_frames, (const YuvPixel ***)pano_stack, - blended_img); -#endif // BGSPRITE_BLENDING_MODE == 1 - - // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed size at - // the same size as the frames. For now, we crop the generated panorama. - assert(panorama->y_width <= width && panorama->y_height <= height); - - // Resamples the blended_img into the panorama buffer. 
- YV12_BUFFER_CONFIG bgsprite; - memset(&bgsprite, 0, sizeof(bgsprite)); - aom_alloc_frame_buffer(&bgsprite, frames[0]->y_width, frames[0]->y_height, - frames[0]->subsampling_x, frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &bgsprite); - bgsprite.bit_depth = frames[0]->bit_depth; - resample_panorama(blended_img, center_idx, x_min, y_min, pano_x_min, - pano_x_max, pano_y_min, pano_y_max, &bgsprite); - -#if BGSPRITE_ENABLE_SEGMENTATION - YV12_BUFFER_CONFIG temporal_bgsprite; - memset(&temporal_bgsprite, 0, sizeof(temporal_bgsprite)); - aom_alloc_frame_buffer(&temporal_bgsprite, frames[0]->y_width, - frames[0]->y_height, frames[0]->subsampling_x, - frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &temporal_bgsprite); - temporal_bgsprite.bit_depth = frames[0]->bit_depth; - - av1_temporal_filter(cpi, &bgsprite, &temporal_bgsprite, distance); - - // Block size constants for gaussian model. - const int N_1 = 2; - const int y_block_height = (height / N_1) + (height % N_1 != 0 ? 1 : 0); - const int x_block_width = (width / N_1) + (height % N_1 != 0 ? 1 : 0); - YuvPixelGaussian **gauss = aom_malloc(y_block_height * sizeof(*gauss)); - for (int i = 0; i < y_block_height; ++i) { - gauss[i] = aom_calloc(x_block_width, sizeof(**gauss)); - } - - // Build Gaussian model. - build_gaussian((const YuvPixel ***)pano_stack, num_frames, width, height, - x_block_width, y_block_height, N_1, gauss); - - // Select background model and build foreground mask. 
- uint8_t **mask = aom_malloc(y_block_height * sizeof(*mask)); - for (int i = 0; i < y_block_height; ++i) { - mask[i] = aom_calloc(x_block_width, sizeof(**mask)); - } - - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - build_mask(x_min[center_idx], y_min[center_idx], x_offset, y_offset, - x_block_width, y_block_height, N_1, - (const YuvPixelGaussian **)gauss, - (YV12_BUFFER_CONFIG * const) frames[center_idx], panorama, mask); - - YV12_BUFFER_CONFIG temporal_arf; - memset(&temporal_arf, 0, sizeof(temporal_arf)); - aom_alloc_frame_buffer(&temporal_arf, frames[0]->y_width, frames[0]->y_height, - frames[0]->subsampling_x, frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &temporal_arf); - temporal_arf.bit_depth = frames[0]->bit_depth; - av1_temporal_filter(cpi, NULL, &temporal_arf, distance); - - combine_arf(&temporal_arf, &temporal_bgsprite, mask, N_1, x_offset, y_offset, - panorama); - - aom_free_frame_buffer(&temporal_arf); - aom_free_frame_buffer(&temporal_bgsprite); - for (int i = 0; i < y_block_height; ++i) { - aom_free(gauss[i]); - aom_free(mask[i]); - } - aom_free(gauss); - aom_free(mask); -#else // !BGSPRITE_ENABLE_SEGMENTATION - av1_temporal_filter(cpi, &bgsprite, panorama, distance); -#endif // BGSPRITE_ENABLE_SEGMENTATION - - aom_free_frame_buffer(&bgsprite); - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - aom_free(pano_stack[i][j]); - } - aom_free(pano_stack[i]); - aom_free(blended_img[i]); - } - aom_free(pano_stack); - aom_free(blended_img); -} - -int av1_background_sprite(AV1_COMP *cpi, int distance) { -#if BGSPRITE_ENABLE_METRICS - // Do temporal filter if firstpass stats disable bgsprite. 
- if (!cpi->bgsprite_allowed) { - return 1; - } -#endif // BGSPRITE_ENABLE_METRICS - - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; - static const double identity_params[MAX_PARAMDIM - 1] = { - 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 - }; - - const int frames_after_arf = - av1_lookahead_depth(cpi->lookahead) - distance - 1; - int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; - int frames_bwd; - - // Define the forward and backwards filter limits for this arnr group. - if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; - if (frames_fwd > distance) frames_fwd = distance; - frames_bwd = frames_fwd; - -#if CONFIG_EXT_REFS - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) { - cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1; - frames_fwd = 0; - frames_bwd = 0; - } else { - cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0; - } -#endif // CONFIG_EXT_REFS - - const int start_frame = distance + frames_fwd; - const int frames_to_stitch = frames_bwd + 1 + frames_fwd; - - // Get frames to be included in background sprite. - for (int frame = 0; frame < frames_to_stitch; ++frame) { - const int which_buffer = start_frame - frame; - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, which_buffer); - frames[frames_to_stitch - 1 - frame] = &buf->img; - } - - // Allocate empty arrays for parameters between frames. - double **params = aom_malloc(frames_to_stitch * sizeof(*params)); - for (int i = 0; i < frames_to_stitch; ++i) { - params[i] = aom_malloc(sizeof(identity_params)); - memcpy(params[i], identity_params, sizeof(identity_params)); - } - -// Use global motion to find affine transformations between frames. -// params[i] will have the transform from frame[i] to frame[i-1]. -// params[0] will have the identity matrix (has no previous frame). 
-#if BGSPRITE_ENABLE_GME - TransformationType model = AFFINE; - int inliers_by_motion[RANSAC_NUM_MOTIONS]; - for (int frame = 0; frame < frames_to_stitch - 1; ++frame) { - const int global_motion_ret = compute_global_motion_feature_based( - model, frames[frame + 1], frames[frame], -#if CONFIG_HIGHBITDEPTH - cpi->common.bit_depth, -#endif // CONFIG_HIGHBITDEPTH - inliers_by_motion, params[frame + 1], RANSAC_NUM_MOTIONS); - - // Quit if global motion had an error. - if (global_motion_ret == 0) { - for (int i = 0; i < frames_to_stitch; ++i) { - aom_free(params[i]); - } - aom_free(params); - return 1; - } - } -#endif // BGSPRITE_ENABLE_GME - - // Compound the transformation parameters. - for (int i = 1; i < frames_to_stitch; ++i) { - multiply_params(params[i - 1], params[i], params[i]); - } - - // Compute frame limits for final stitched images. - int pano_x_max = INT_MIN; - int pano_x_min = INT_MAX; - int pano_y_max = INT_MIN; - int pano_y_min = INT_MAX; - int *x_max = aom_malloc(frames_to_stitch * sizeof(*x_max)); - int *x_min = aom_malloc(frames_to_stitch * sizeof(*x_min)); - int *y_max = aom_malloc(frames_to_stitch * sizeof(*y_max)); - int *y_min = aom_malloc(frames_to_stitch * sizeof(*y_min)); - - find_limits(frames[0]->y_width, frames[0]->y_height, - (const double **const)params, frames_to_stitch, x_min, x_max, - y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); - - // Center panorama on the ARF. - const int center_idx = frames_bwd; - assert(center_idx >= 0 && center_idx < frames_to_stitch); - - // Recompute transformations to adjust to center image. - // Invert center image's transform. - double inverse[MAX_PARAMDIM - 1] = { 0 }; - invert_params(params[center_idx], inverse); - - // Multiply the inverse to all transformation parameters. - for (int i = 0; i < frames_to_stitch; ++i) { - multiply_params(inverse, params[i], params[i]); - } - - // Recompute frame limits for new adjusted center. 
- find_limits(frames[0]->y_width, frames[0]->y_height, - (const double **const)params, frames_to_stitch, x_min, x_max, - y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); - - // Stitch Images and apply bgsprite filter. - stitch_images(cpi, frames, frames_to_stitch, distance, center_idx, - (const double **const)params, x_min, x_max, y_min, y_max, - pano_x_min, pano_x_max, pano_y_min, pano_y_max, - &cpi->alt_ref_buffer); - - // Free memory. - for (int i = 0; i < frames_to_stitch; ++i) { - aom_free(params[i]); - } - aom_free(params); - aom_free(x_max); - aom_free(x_min); - aom_free(y_max); - aom_free(y_min); - - return 0; -} - -#undef _POSIX_C_SOURCE -#undef BGSPRITE_BLENDING_MODE -#undef BGSPRITE_INTERPOLATION -#undef BGSPRITE_ENABLE_METRICS -#undef BGSPRITE_ENABLE_SEGMENTATION -#undef BGSPRITE_ENABLE_GME -#undef BGSPRITE_MASK_BLOCK_SIZE -#undef TRANSFORM_MAT_DIM diff --git a/third_party/aom/av1/encoder/bgsprite.h b/third_party/aom/av1/encoder/bgsprite.h deleted file mode 100644 index 711b00e40..000000000 --- a/third_party/aom/av1/encoder/bgsprite.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_BGSPRITE_H_ -#define AV1_ENCODER_BGSPRITE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "av1/encoder/encoder.h" - -// Creates alternate reference frame staring from source image + frames up to -// 'distance' past source frame. -// Returns 0 on success and 1 on failure. 
-int av1_background_sprite(AV1_COMP *cpi, int distance); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_BGSPRITE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index 08f605f10..cdd7c2492 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -24,9 +24,8 @@ #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif // CONFIG_CDEF +#include "av1/common/cfl.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" @@ -34,38 +33,21 @@ #include "av1/common/odintrin.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" -#if CONFIG_EXT_INTRA #include "av1/common/reconintra.h" -#endif // CONFIG_EXT_INTRA #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" -#if CONFIG_LV_MAP -#include "av1/encoder/encodetxb.h" -#endif // CONFIG_LV_MAP #include "av1/encoder/bitstream.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encodetxb.h" #include "av1/encoder/mcomp.h" -#if CONFIG_PALETTE_DELTA_ENCODING #include "av1/encoder/palette.h" -#endif // CONFIG_PALETTE_DELTA_ENCODING #include "av1/encoder/segmentation.h" -#include "av1/encoder/subexp.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/encoder/pvq_encoder.h" -#endif #define ENC_MISMATCH_DEBUG 0 -#if CONFIG_COMPOUND_SINGLEREF -static struct av1_token - inter_singleref_comp_mode_encodings[INTER_SINGLEREF_COMP_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF - -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ || CONFIG_EXT_INTRA static INLINE void write_uniform(aom_writer *w, int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; @@ -77,110 +59,38 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) { aom_write_literal(w, (v - m) & 1, 
1); } } -#endif // !CONFIG_PVQ || CONFIG_EXT_INTRA - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -static struct av1_token intra_filter_encodings[INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_INTERINTRA -static struct av1_token interintra_mode_encodings[INTERINTRA_MODES]; -#endif -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -static struct av1_token compound_type_encodings[COMPOUND_TYPES]; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_LOOP_RESTORATION -static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES]; + static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, + const RestorationUnitInfo *rui, aom_writer *const w, int plane, - int rtile_idx); -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_OBU -static void write_uncompressed_header_obu(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb); -#else -static void write_uncompressed_header_frame(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb); -#endif - -static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data); - -#if !CONFIG_OBU || CONFIG_EXT_TILE -static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, - const uint32_t data_size, const uint32_t max_tile_size, - const uint32_t max_tile_col_size, - int *const tile_size_bytes, - int *const tile_col_size_bytes); -#endif -void av1_encode_token_init(void) { -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree); -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -#if CONFIG_INTERINTRA - av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree); -#endif // CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SINGLEREF - av1_tokens_from_tree(inter_singleref_comp_mode_encodings, - av1_inter_singleref_comp_mode_tree); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree); -#endif 
// CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_LOOP_RESTORATION - av1_tokens_from_tree(switchable_restore_encodings, - av1_switchable_restore_tree); -#endif // CONFIG_LOOP_RESTORATION -} + FRAME_COUNTS *counts); -static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, - const MODE_INFO *mi, const MODE_INFO *above_mi, - const MODE_INFO *left_mi, int block, +static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx, + const MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, PREDICTION_MODE mode, aom_writer *w) { -#if CONFIG_INTRABC - assert(!is_intrabc_block(&mi->mbmi)); -#endif // CONFIG_INTRABC - aom_write_symbol(w, mode, - get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block), + assert(!is_intrabc_block(mi)); + (void)mi; + aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), INTRA_MODES); - (void)cm; } static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) { const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); -#else - aom_write(w, mode != NEWMV, ec_ctx->newmv_prob[newmv_ctx]); -#endif if (mode != NEWMV) { - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - assert(mode == ZEROMV); - return; - } - - const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mode != ZEROMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); -#else - aom_write(w, mode != ZEROMV, ec_ctx->zeromv_prob[zeromv_ctx]); -#endif + const int16_t zeromv_ctx = + (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); - if (mode != ZEROMV) { + if (mode != GLOBALMV) { int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - - if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; - if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx 
= 7; - if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); -#else - aom_write(w, mode != NEARESTMV, ec_ctx->refmv_prob[refmv_ctx]); -#endif } } } @@ -191,24 +101,16 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, assert(mbmi->ref_mv_idx < 3); -#if CONFIG_COMPOUND_SINGLEREF - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || - mbmi->mode == SR_NEW_NEWMV) { -#else // !CONFIG_COMPOUND_SINGLEREF - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { -#endif // CONFIG_COMPOUND_SINGLEREF + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], 2); -#else - aom_write(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_prob[drl_ctx]); -#endif if (mbmi->ref_mv_idx == idx) return; } } @@ -222,12 +124,8 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_cdf[drl_ctx], 2); -#else - aom_write(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_prob[drl_ctx]); -#endif if (mbmi->ref_mv_idx == (idx - 1)) return; } } @@ -235,52 +133,22 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, } } -static void write_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_writer *w, PREDICTION_MODE mode, +static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, const int16_t mode_ctx) { assert(is_inter_compound_mode(mode)); - (void)cm; aom_write_symbol(w, 
INTER_COMPOUND_OFFSET(mode), xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], INTER_COMPOUND_MODES); } -#if CONFIG_COMPOUND_SINGLEREF -static void write_inter_singleref_comp_mode(MACROBLOCKD *xd, aom_writer *w, - PREDICTION_MODE mode, - const int16_t mode_ctx) { - assert(is_inter_singleref_comp_mode(mode)); - aom_cdf_prob *const inter_singleref_comp_cdf = - xd->tile_ctx->inter_singleref_comp_mode_cdf[mode_ctx]; - - aom_write_symbol(w, INTER_SINGLEREF_COMP_OFFSET(mode), - inter_singleref_comp_cdf, INTER_SINGLEREF_COMP_MODES); -} -#endif // CONFIG_COMPOUND_SINGLEREF - -static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data, - int max) { - aom_wb_write_literal(wb, data, get_unsigned_bits(max)); -} - -#if CONFIG_VAR_TX -static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, TX_SIZE tx_size, - int depth, int blk_row, int blk_col, - aom_writer *w) { -#if CONFIG_NEW_MULTISYMBOL +static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, int blk_row, + int blk_col, aom_writer *w) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; -#endif - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, - mbmi->sb_type, tx_size); - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (depth == MAX_VARTX_DEPTH) { @@ -289,31 +157,25 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, return; } -#if CONFIG_RECT_TX_EXT - if (tx_size == mbmi->inter_tx_size[tx_row][tx_col] || - mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) { -#else - if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) { -#endif -#if CONFIG_NEW_MULTISYMBOL + const int ctx = txfm_partition_context(xd->above_txfm_context + 
blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); -#else - aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]); -#endif txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); // TODO(yuec): set correct txfm partition update for qttx } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); -#else - aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]); -#endif if (sub_txs == TX_4X4) { txfm_partition_update(xd->above_txfm_context + blk_col, @@ -321,185 +183,115 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, return; } - assert(bsl > 0); - for (i = 0; i < 4; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; - write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, - w); - } + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts, int probwt) { - int k; - for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k) - av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k], - counts->txfm_partition[k], probwt); -} -#endif // CONFIG_NEW_MULTISYMBOL -#endif // 
CONFIG_VAR_TX - -static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, - aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; if (block_signals_txsize(bsize)) { const TX_SIZE tx_size = mbmi->tx_size; - const int is_inter = is_inter_block(mbmi); const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); -#if CONFIG_EXT_TX && CONFIG_RECT_TX + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], - tx_size_cat + 2); -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, tx_size == quarter_txsize_lookup[bsize], - cm->fc->quarter_tx_size_cdf, 2); -#else - aom_write(w, tx_size == quarter_txsize_lookup[bsize], - cm->fc->quarter_tx_size_prob); -#endif -#endif + max_depths + 1); } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int i; - const int probwt = cm->num_tg; - for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i], - probwt); - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) 
- av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i], - counts->zeromv_mode[i], probwt); - for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i], - probwt); - for (i = 0; i < DRL_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i], - probwt); -} -#endif - static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int segment_id, const MODE_INFO *mi, aom_writer *w) { + int segment_id, const MB_MODE_INFO *mi, aom_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip = mi->mbmi.skip; -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip = mi->skip; const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2); -#else - aom_write(w, skip, av1_get_skip_prob(cm, xd)); -#endif return skip; } } +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->sb_type)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. 
+ assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, aom_writer *w, const int is_inter) { if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); -#else - aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd)); -#endif } } -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, - const MODE_INFO *mi, aom_writer *w) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - -#if !CONFIG_GLOBAL_MOTION - // The cm parameter is only used with global_motion or with - // motion_var and warped_motion. In other cases, explicitly ignore - // it to avoid a compiler warning. 
- (void)cm; -#endif - MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); - if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return; -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) { - aom_write_symbol(w, mbmi->motion_mode, - xd->tile_ctx->ncobmc_cdf[mbmi->sb_type], - OBMC_FAMILY_MODES); - } else if (last_motion_mode_allowed == OBMC_CAUSAL) { - aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, - xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); - } else { -#else - if (last_motion_mode_allowed == OBMC_CAUSAL) { -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, - xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); -#else - aom_write(w, mbmi->motion_mode == OBMC_CAUSAL, - cm->fc->obmc_prob[mbmi->sb_type]); -#endif - } else { -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - aom_write_symbol(w, mbmi->motion_mode, - xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], - MOTION_MODES); -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void write_ncobmc_mode(MACROBLOCKD *xd, const MODE_INFO *mi, - aom_writer *w) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type]; - if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return; - - aom_write_symbol(w, mbmi->ncobmc_mode[0], - xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); - if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { - aom_write_symbol(w, mbmi->ncobmc_mode[1], - xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); + const MB_MODE_INFO *mbmi, aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->switchable_motion_mode + ? 
motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES); } } -#endif -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int delta_qindex, aom_writer *w) { +static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex, + aom_writer *w) { int sign = delta_qindex < 0; int abs = sign ? -delta_qindex : delta_qindex; int rem_bits, thr; int smallval = abs < DELTA_Q_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1); @@ -515,32 +307,23 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, } } -#if CONFIG_EXT_DELTA_Q static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, -#if CONFIG_LOOPFILTER_LEVEL - int lf_id, -#endif - int delta_lflevel, aom_writer *w) { + int lf_id, int delta_lflevel, aom_writer *w) { int sign = delta_lflevel < 0; int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; -#if CONFIG_LOOPFILTER_LEVEL if (cm->delta_lf_multi) { - assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT); + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); } else { aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1); } -#else - aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, - DELTA_LF_PROBS + 1); -#endif // CONFIG_LOOPFILTER_LEVEL if (!smallval) { rem_bits = OD_ILOG_NZ(abs - 1) - 1; @@ -552,22 +335,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_bit(w, sign); } } -#endif // CONFIG_EXT_DELTA_Q - -#if !CONFIG_NEW_MULTISYMBOL -static void update_skip_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int k; - const int probwt = cm->num_tg; - for (k = 0; k < SKIP_CONTEXTS; ++k) { - av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k], - probwt); - } -} -#endif -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, int num) { const TOKENEXTRA *p = *tp; @@ -580,423 +348,142 @@ static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, } *tp = p; } -#endif // !CONFIG_PVQ - -#if !CONFIG_PVQ -#if CONFIG_SUPERTX -static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) { - const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - - av1_cost_zero(GROUP_DIFF_UPDATE_PROB); - int i, j; - int savings = 0; - int do_update = 0; - for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - savings += av1_cond_prob_diff_update_savings( - &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt); - } - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j], - 
cm->counts.supertx[i][j], probwt); - } - } - } -} -#endif // CONFIG_SUPERTX - -#if !CONFIG_LV_MAP -#if CONFIG_NEW_MULTISYMBOL -static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val, - int n, aom_writer *w) { - // Code the extra bits from LSB to MSB in groups of 4 - int i = 0; - int count = 0; - while (count < n) { - const int size = AOMMIN(n - count, 4); - const int mask = (1 << size) - 1; - aom_write_cdf(w, val & mask, cdf[i++], 1 << size); - val >>= size; - count += size; - } -} -#else -static INLINE void write_coeff_extra(const aom_prob *pb, int value, - int num_bits, int skip_bits, aom_writer *w, - TOKEN_STATS *token_stats) { - // Code the extra bits from MSB to LSB 1 bit at a time - int index; - for (index = skip_bits; index < num_bits; ++index) { - const int shift = num_bits - index - 1; - const int bb = (value >> shift) & 1; - aom_write_record(w, bb, pb[index], token_stats); - } -} -#endif // CONFIG_NEW_MULTISYMBOL - -static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, - const TOKENEXTRA *const stop, - aom_bit_depth_t bit_depth, const TX_SIZE tx_size, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type, int is_inter, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TOKEN_STATS *token_stats) { - const TOKENEXTRA *p = *tp; -#if CONFIG_VAR_TX - int count = 0; - const int seg_eob = tx_size_2d[tx_size]; -#endif - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) || - (!is_inter && SIGNAL_MRC_MASK_INTRA))) { - int rows = tx_size_high[tx_size]; - int cols = tx_size_wide[tx_size]; - assert(tx_size == TX_32X32); - assert(p < stop); - pack_map_tokens(w, &p, 2, rows * cols); - } -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - while (p < stop && p->token != EOSB_TOKEN) { - const int token = p->token; - const int eob_val = p->eob_val; - if (token == BLOCK_Z_TOKEN) { - aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1); - p++; -#if CONFIG_VAR_TX - break; -#endif - 
continue; - } - - const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; - if (eob_val == LAST_EOB) { - // Just code a flag indicating whether the value is >1 or 1. - aom_write_bit(w, token != ONE_TOKEN); - } else { - int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + p->first_val; - aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val); - } - if (token > ONE_TOKEN) { - aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS); - } - - if (extra_bits->base_val) { - const int bit_string = p->extra; - const int bit_string_length = extra_bits->len; // Length of extra bits to - const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL); - // be written excluding - // the sign bit. - int skip_bits = is_cat6 - ? (int)sizeof(av1_cat6_prob) - - av1_get_cat6_extrabits_size(tx_size, bit_depth) - : 0; - - assert(!(bit_string >> (bit_string_length - skip_bits + 1))); - if (bit_string_length > 0) -#if CONFIG_NEW_MULTISYMBOL - write_coeff_extra(extra_bits->cdf, bit_string >> 1, - bit_string_length - skip_bits, w); -#else - write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length, - skip_bits, w, token_stats); -#endif - - aom_write_bit_record(w, bit_string & 1, token_stats); - } - ++p; - -#if CONFIG_VAR_TX - ++count; - if (eob_val == EARLY_EOB || count == seg_eob) break; -#endif - } - - *tp = p; -} -#endif // !CONFIG_LV_MAP -#else // !CONFIG_PVQ -static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) { - PVQ_INFO *pvq; - - assert(pvq_q->curr_pos <= pvq_q->last_pos); - assert(pvq_q->curr_pos < pvq_q->buf_len); - - pvq = pvq_q->buf + pvq_q->curr_pos; - ++pvq_q->curr_pos; - - return pvq; -} - -static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x, - MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize, - const TX_SIZE tx_size) { - PVQ_INFO *pvq; - int idx, idy; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - od_adapt_ctx *adapt; - int max_blocks_wide; - int max_blocks_high; - int step = (1 << tx_size); - -#if 
CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif - adapt = x->daala_enc.state.adapt; - - max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - max_blocks_high = max_block_high(xd, plane_bsize, plane); - - for (idy = 0; idy < max_blocks_high; idy += step) { - for (idx = 0; idx < max_blocks_wide; idx += step) { - const int is_keyframe = 0; - const int encode_flip = 0; - const int flip = 0; - int i; - const int has_dc_skip = 1; - int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0]; - int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS; - generic_encoder *model = adapt->pvq.pvq_param_model; - - pvq = get_pvq_block(x->pvq_q); - - // encode block skip info - aom_write_symbol(w, pvq->ac_dc_coded, - adapt->skip_cdf[2 * tx_size + (plane != 0)], 4); - - // AC coeffs coded? - if (pvq->ac_dc_coded & AC_CODED) { - assert(pvq->bs == tx_size); - for (i = 0; i < pvq->nb_bands; i++) { - if (i == 0 || - (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) { - pvq_encode_partition( - w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i], - pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i, - (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS + - pvq->bs * PVQ_MAX_PARTITIONS + i, - is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest, - encode_flip, flip); - } - if (i == 0 && !pvq->skip_rest && pvq->bs > 0) { - aom_write_symbol( - w, pvq->skip_dir, - &adapt->pvq - .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0], - 7); - } - } - } - // Encode residue of DC coeff, if exist. 
- if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) { - generic_encode(w, &adapt->model_dc[plane], - abs(pvq->dq_dc_residue) - has_dc_skip, - &adapt->ex_dc[plane][pvq->bs][0], 2); - } - if ((pvq->ac_dc_coded & DC_CODED)) { - aom_write_bit(w, pvq->dq_dc_residue < 0); - } - } - } // for (idy = 0; -} -#endif // !CONFIG_PVG - -#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE -#if CONFIG_LV_MAP -static void pack_txb_tokens(aom_writer *w, -#if CONFIG_LV_MAP - AV1_COMMON *cm, -#endif // CONFIG_LV_MAP +static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp, - const TOKENEXTRA *const tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - MACROBLOCK *const x, -#endif - MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, + const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; - - if (tx_size == plane_tx_size) { - TOKEN_STATS tmp_token_stats; - init_token_stats(&tmp_token_stats); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; -#if !CONFIG_PVQ + if (tx_size == plane_tx_size || plane) { tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - uint16_t eob = x->mbmi_ext->eobs[plane][block]; + const uint16_t eob = x->mbmi_ext->eobs[plane][block]; TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, block, plane, tx_size, - tcoeff, eob, &txb_ctx); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); -#endif + av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, + eob, &txb_ctx); #if CONFIG_RD_DEBUG - token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; - token_stats->cost += tmp_token_stats.cost; -#endif - } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { - const int offsetr = blk_row + (i >> 1) * bsl; - const int offsetc = blk_col + (i & 0x01) * bsl; - const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - pack_txb_tokens(w, -#if CONFIG_LV_MAP - cm, -#endif - tp, tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - x, -#endif - xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, - offsetc, sub_txs, token_stats); - block += step; - } - } -} -#else // CONFIG_LV_MAP -static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, - const TOKENEXTRA *const tok_end, -#if CONFIG_PVQ - MACROBLOCK *const x, -#endif - MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, - BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, - int block, int blk_row, int blk_col, - TX_SIZE tx_size, TOKEN_STATS *token_stats) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const 
BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd, - blk_row, blk_col, block, tx_size); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; - - if (tx_size == plane_tx_size) { TOKEN_STATS tmp_token_stats; init_token_stats(&tmp_token_stats); -#if !CONFIG_PVQ - pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &tmp_token_stats); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); -#endif -#if CONFIG_RD_DEBUG token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; token_stats->cost += tmp_token_stats.cost; #endif } else { -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; -#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; -#endif - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + (i >> 1) * bsl; - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + (i & 0x01) * bsl; -#else - const int offsetr = blk_row + (i >> 1) * bsl; - const int offsetc = blk_col + (i & 0x01) * bsl; -#endif - const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - pack_txb_tokens(w, tp, tok_end, -#if CONFIG_PVQ - x, -#endif - xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, - offsetc, sub_txs, token_stats); - block += step; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm, + uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int segment_id) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_rows - mi_row, bh); + int x, y; + + for (y = 0; y < ymis; ++y) + for (x = 0; x < xmis; ++x) + segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id; +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); } + return x; + } else { + if (abs(diff) < (max - ref)) { + if (diff > 0) + 
return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return (max - x) - 1; } } -#endif // CONFIG_LV_MAP -#endif // CONFIG_VAR_TX -static void write_segment_id(aom_writer *w, const struct segmentation *seg, - struct segmentation_probs *segp, int segment_id) { - if (seg->enabled && seg->update_map) { - aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS); +static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, + aom_writer *w, const struct segmentation *seg, + struct segmentation_probs *segp, int mi_row, + int mi_col, int skip) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num); + + if (skip) { + // Still need to transmit tx size for intra blocks even if skip is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. + assert(is_inter_block(mbmi) || !cpi->has_lossless_segment); + + set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + mi_col, pred); + set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row, + mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + mi_col, mbmi->segment_id); } -#if CONFIG_NEW_MULTISYMBOL #define WRITE_REF_BIT(bname, pname) \ - aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(cm, xd), 2) -#define WRITE_REF_BIT2(bname, pname) \ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) -#else -#define WRITE_REF_BIT(bname, pname) \ - aom_write(w, bname, 
av1_get_pred_prob_##pname(cm, xd)) -#define WRITE_REF_BIT2(bname, pname) \ - aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd)) -#endif // This function encodes the reference frame static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); const int segment_id = mbmi->segment_id; @@ -1006,75 +493,40 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, assert(!is_compound); assert(mbmi->ref_frame[0] == get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) if (cm->reference_mode == REFERENCE_MODE_SELECT) { if (is_comp_ref_allowed(mbmi->sb_type)) -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(cm, xd), 2); -#else - aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); -#endif // CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); } else { assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); } if (is_compound) { -#if CONFIG_EXT_COMP_REFS const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? 
UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; -#if USE_UNI_COMP_REFS -#if CONFIG_VAR_REFS - if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) - if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm)) -#endif // CONFIG_VAR_REFS -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, comp_ref_type, - av1_get_comp_reference_type_cdf(xd), 2); -#else - aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd)); -#endif -#if CONFIG_VAR_REFS - else - assert(comp_ref_type == BIDIR_COMP_REFERENCE); - else - assert(comp_ref_type == UNIDIR_COMP_REFERENCE); -#endif // CONFIG_VAR_REFS -#else // !USE_UNI_COMP_REFS - // NOTE: uni-directional comp refs disabled - assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // USE_UNI_COMP_REFS + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; -#if CONFIG_VAR_REFS - if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT2(bit, uni_comp_ref_p); + WRITE_REF_BIT(bit, uni_comp_ref_p); if (!bit) { assert(mbmi->ref_frame[0] == LAST_FRAME); -#if CONFIG_VAR_REFS - if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm))) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || - mbmi->ref_frame[1] == GOLDEN_FRAME; - WRITE_REF_BIT2(bit1, uni_comp_ref_p1); - if (bit1) { -#if CONFIG_VAR_REFS - if (L_AND_L3(cm) && L_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; - WRITE_REF_BIT2(bit2, uni_comp_ref_p2); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS - } -#if CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); } -#endif // CONFIG_VAR_REFS } else { assert(mbmi->ref_frame[1] == 
ALTREF_FRAME); } @@ -1083,213 +535,81 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, } assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // CONFIG_EXT_COMP_REFS -#if CONFIG_EXT_REFS const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || mbmi->ref_frame[0] == LAST3_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit, comp_ref_p); + WRITE_REF_BIT(bit, comp_ref_p); if (!bit) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[0] == LAST_FRAME; - WRITE_REF_BIT(bit1, comp_ref_p1); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); } else { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; - WRITE_REF_BIT(bit2, comp_ref_p2); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); } -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree - if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) { -#endif // CONFIG_VAR_REFS - const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; - WRITE_REF_BIT(bit_bwd, comp_bwdref_p); - - if (!bit_bwd) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in - // tree - if (BWD_AND_ALT2(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); - } -#if CONFIG_VAR_REFS + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, 
comp_bwdref_p1); } -#endif // CONFIG_VAR_REFS -#else // !CONFIG_EXT_REFS - const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME; - WRITE_REF_BIT(bit, comp_ref_p); -#endif // CONFIG_EXT_REFS } else { -#if CONFIG_EXT_REFS const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && mbmi->ref_frame[0] >= BWDREF_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node - // in tree - if ((L_OR_L2(cm) || L3_OR_G(cm)) && - (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm))) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit0, single_ref_p1); + WRITE_REF_BIT(bit0, single_ref_p1); if (bit0) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree - if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; - WRITE_REF_BIT(bit1, single_ref_p2); - - if (!bit1) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD) vs (ALT2) branch node in tree - if (BWD_AND_ALT2(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); - } -#if CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); } -#endif // CONFIG_VAR_REFS } else { const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[0] == GOLDEN_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit2, single_ref_p3); + WRITE_REF_BIT(bit2, single_ref_p3); if (!bit2) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) { -#endif // CONFIG_VAR_REFS - const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; - WRITE_REF_BIT(bit3, single_ref_p4); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit3 = 
mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); } else { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; - WRITE_REF_BIT(bit4, single_ref_p5); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); } } -#else // !CONFIG_EXT_REFS - const int bit0 = mbmi->ref_frame[0] != LAST_FRAME; - WRITE_REF_BIT(bit0, single_ref_p1); - - if (bit0) { - const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME; - WRITE_REF_BIT(bit1, single_ref_p2); - } -#endif // CONFIG_EXT_REFS } } } -#if CONFIG_FILTER_INTRA -static void write_filter_intra_mode_info(const AV1_COMMON *const cm, +static void write_filter_intra_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, - int mi_row, int mi_col, aom_writer *w) { - if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) { - aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0], - cm->fc->filter_intra_probs[0]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - const FILTER_INTRA_MODE mode = - mbmi->filter_intra_mode_info.filter_intra_mode[0]; - write_uniform(w, FILTER_INTRA_MODES, mode); - } - } - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, - xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) - return; -#else - (void)xd; - (void)mi_row; - (void)mi_col; -#endif // CONFIG_CB4X4 - - if (mbmi->uv_mode == UV_DC_PRED && - mbmi->palette_mode_info.palette_size[1] == 0) { - aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1], - cm->fc->filter_intra_probs[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + 
xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { const FILTER_INTRA_MODE mode = - mbmi->filter_intra_mode_info.filter_intra_mode[1]; - write_uniform(w, FILTER_INTRA_MODES, mode); + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); } } } -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_EXT_INTRA -static void write_intra_angle_info(const MACROBLOCKD *xd, - FRAME_CONTEXT *const ec_ctx, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - int p_angle; -#endif // CONFIG_INTRA_INTERP - - (void)ec_ctx; - if (!av1_use_angle_delta(bsize)) return; - - if (av1_is_directional_mode(mbmi->mode, bsize)) { - write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); -#if CONFIG_INTRA_INTERP - p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - aom_write_symbol(w, mbmi->intra_filter, - ec_ctx->intra_filter_cdf[intra_filter_ctx], - INTRA_FILTERS); - } -#endif // CONFIG_INTRA_INTERP - } - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) { - write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } +static void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); } -#endif // CONFIG_EXT_INTRA static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!av1_is_interp_needed(xd)) { @@ -1299,36 +619,19 @@ static void 
write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, return; } if (cm->interp_filter == SWITCHABLE) { -#if CONFIG_DUAL_FILTER int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, dir); - aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], - SWITCHABLE_FILTERS); - ++cpi->interp_filter_selected[0][filter]; - } else { - assert(av1_extract_interp_filter(mbmi->interp_filters, dir) == - EIGHTTAP_REGULAR); - } - } -#else - { - const int ctx = av1_get_pred_context_switchable_interp(xd); - InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, 0); + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); ++cpi->interp_filter_selected[0][filter]; + if (cm->seq_params.enable_dual_filter == 0) return; } -#endif // CONFIG_DUAL_FILTER } } -#if CONFIG_PALETTE_DELTA_ENCODING // Transmit color values with delta encoding. Write the first value as // literal, and the deltas between each value and the previous one. "min_val" is // the smallest possible value of the deltas. 
@@ -1446,207 +749,90 @@ static void write_palette_colors_uv(const MACROBLOCKD *const xd, } } } -#endif // CONFIG_PALETTE_DELTA_ENCODING static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const MODE_INFO *const mi, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; + const MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, aom_writer *w) { + const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->allow_screen_content_tools, bsize)); const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - - assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST); - const int block_palette_idx = bsize - BLOCK_8X8; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); if (mbmi->mode == DC_PRED) { const int n = pmi->palette_size[0]; - int palette_y_mode_ctx = 0; - if (above_mi) { - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - if (left_mi) { - palette_y_mode_ctx += - (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } -#if CONFIG_NEW_MULTISYMBOL + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); aom_write_symbol( w, n > 0, - xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx], - 2); -#else - aom_write( - w, n > 0, - av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx]); -#endif + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, - xd->tile_ctx->palette_y_size_cdf[block_palette_idx], + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); -#if CONFIG_PALETTE_DELTA_ENCODING write_palette_colors_y(xd, pmi, cm->bit_depth, w); -#else - for (int i = 0; i < n; ++i) { - assert(pmi->palette_colors[i] < (1 << cm->bit_depth)); - aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth); - } -#endif // 
CONFIG_PALETTE_DELTA_ENCODING } } - if (mbmi->uv_mode == UV_DC_PRED) { + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); + if (uv_dc_pred) { const int n = pmi->palette_size[1]; const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, n > 0, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); -#else - aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]); -#endif if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, - xd->tile_ctx->palette_uv_size_cdf[block_palette_idx], + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); -#if CONFIG_PALETTE_DELTA_ENCODING write_palette_colors_uv(xd, pmi, cm->bit_depth, w); -#else - for (int i = 0; i < n; ++i) { - assert(pmi->palette_colors[PALETTE_MAX_SIZE + i] < - (1 << cm->bit_depth)); - assert(pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] < - (1 << cm->bit_depth)); - aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i], - cm->bit_depth); - aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i], - cm->bit_depth); - } -#endif // CONFIG_PALETTE_DELTA_ENCODING } } } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, -#if CONFIG_SUPERTX - const int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif + int blk_row, int blk_col, int plane, TX_SIZE tx_size, aom_writer *w) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); -#if !CONFIG_TXK_SEL -#if CONFIG_VAR_TX - const TX_SIZE tx_size = is_inter ? 
mbmi->min_tx_size : mbmi->tx_size; -#else - const TX_SIZE tx_size = mbmi->tx_size; -#endif // CONFIG_VAR_TX -#endif // !CONFIG_TXK_SEL FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#if !CONFIG_TXK_SEL - TX_TYPE tx_type = mbmi->tx_type; -#else // Only y plane's tx_type is transmitted if (plane > 0) return; PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); -#endif - - if (!FIXED_TX_TYPE) { -#if CONFIG_EXT_TX - const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; - const BLOCK_SIZE bsize = mbmi->sb_type; - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > - 1 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { -#if CONFIG_MRC_TX - if (tx_type == MRC_DCT) - assert(mbmi->valid_mrc_mask && "Invalid MRC mask"); -#endif // CONFIG_MRC_TX - const TxSetType tx_set_type = get_ext_tx_set_type( - tx_size, bsize, is_inter, cm->reduced_tx_set_used); - const int eset = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - // eset == 0 should correspond to a set with only DCT_DCT and there - // is no need to send the tx_type - assert(eset > 0); - assert(av1_ext_tx_used[tx_set_type][tx_type]); -#if !CONFIG_LGT_FROM_PRED - if (is_inter) { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } else if (ALLOW_INTRA_EXT_TX) { - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } -#else - // only signal tx_type when lgt is not allowed or not selected - if (is_inter) { - if (LGT_FROM_PRED_INTER) { - if (is_lgt_allowed(mbmi->mode, tx_size) && 
!cm->reduced_tx_set_used) - aom_write(w, mbmi->use_lgt, ec_ctx->inter_lgt_prob[square_tx_size]); - if (!mbmi->use_lgt) - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } else { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } - } else if (ALLOW_INTRA_EXT_TX) { - if (LGT_FROM_PRED_INTRA) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - aom_write(w, mbmi->use_lgt, - ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode]); - if (!mbmi->use_lgt) - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } else { - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } - } -#endif // CONFIG_LGT_FROM_PRED - } -#else // CONFIG_EXT_TX - if (tx_size < TX_32X32 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - if (is_inter) { - aom_write_symbol(w, av1_ext_tx_ind[tx_type], - ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES); - } else { - aom_write_symbol( - w, av1_ext_tx_ind[tx_type], - ec_ctx->intra_ext_tx_cdf[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]], - TX_TYPES); - } + TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + 
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); } -#endif // CONFIG_EXT_TX } } @@ -1658,14 +844,12 @@ static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, UV_PREDICTION_MODE uv_mode, - PREDICTION_MODE y_mode, aom_writer *w) { -#if !CONFIG_CFL - uv_mode = get_uv_mode(uv_mode); -#endif - aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES); + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); } -#if CONFIG_CFL static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx, int joint_sign, aom_writer *w) { aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); @@ -1679,23 +863,85 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx, aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); } } -#endif -static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, - const int mi_col, -#if CONFIG_SUPERTX - 
int supertx_enabled, -#endif - aom_writer *w) { - AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const MODE_INFO *mi = xd->mi[0]; +static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w, + int skip, int mi_col, int mi_row) { + if (cm->coded_lossless || cm->allow_intrabc) { + // Initialize to indicate no CDEF for safety. + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; + return; + } - const struct segmentation *const seg = &cm->seg; - struct segmentation_probs *const segp = &ec_ctx->seg; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1); + const MB_MODE_INFO *mbmi = + cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)]; + // Initialise when at top left part of the superblock + if (!(mi_row & (cm->seq_params.mib_size - 1)) && + !(mi_col & (cm->seq_params.mib_size - 1))) { // Top left? + xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] = + xd->cdef_preset[3] = -1; + } + + // Emit CDEF param at first non-skip coding block + const int mask = 1 << (6 - MI_SIZE_LOG2); + const int index = cm->seq_params.sb_size == BLOCK_128X128 + ? 
!!(mi_col & mask) + 2 * !!(mi_row & mask) + : 0; + if (xd->cdef_preset[index] == -1 && !skip) { + aom_write_literal(w, mbmi->cdef_strength, cm->cdef_bits); + xd->cdef_preset[index] = mbmi->cdef_strength; + } +} + +static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, + const struct segmentation *const seg, + struct segmentation_probs *const segp, + int mi_row, int mi_col, int skip, + int preskip) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + AV1_COMMON *const cm = &cpi->common; + + if (seg->update_map) { + if (preskip) { + if (!seg->segid_preskip) return; + } else { + if (seg->segid_preskip) return; + if (skip) { + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1); + if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0; + return; + } + } + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); + if (!pred_flag) { + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); + } + if (pred_flag) { + set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, + mi_row, mi_col, mbmi->segment_id); + } + } else { + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); + } + } +} + +static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, + const int mi_col, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const PREDICTION_MODE mode = mbmi->mode; const int segment_id = mbmi->segment_id; @@ -1704,595 +950,323 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, const int is_inter = 
is_inter_block(mbmi); const int is_compound = has_second_ref(mbmi); int skip, ref; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif (void)mi_row; (void)mi_col; - if (seg->update_map) { - if (seg->temporal_update) { - const int pred_flag = mbmi->seg_id_predicted; -#if CONFIG_NEW_MULTISYMBOL - aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); - aom_write_symbol(w, pred_flag, pred_cdf, 2); -#else - aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd); - aom_write(w, pred_flag, pred_prob); -#endif - if (!pred_flag) write_segment_id(w, seg, segp, segment_id); - } else { - write_segment_id(w, seg, segp, segment_id); - } - } + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); -#if CONFIG_SUPERTX - if (supertx_enabled) - skip = mbmi->skip; - else - skip = write_skip(cm, xd, segment_id, mi, w); -#else - skip = write_skip(cm, xd, segment_id, mi, w); -#endif // CONFIG_SUPERTX if (cm->delta_q_present_flag) { int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); - if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { - assert(mbmi->current_q_index > 0); + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); int reduced_delta_qindex = - (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; - write_delta_qindex(cm, xd, reduced_delta_qindex, w); - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL + (mbmi->current_qindex - xd->current_qindex) / 
cm->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { int reduced_delta_lflevel = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } -#else - if (cm->delta_lf_present_flag) { - int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_EXT_DELTA_Q } } -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); - if (cm->tx_mode == TX_MODE_SELECT && -#if CONFIG_CB4X4 && CONFIG_VAR_TX && !CONFIG_RECT_TX - (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) && -#else - block_signals_txsize(bsize) && -#endif -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !(is_inter && skip) && !xd->lossless[segment_id]) { -#if CONFIG_VAR_TX - if (is_inter) { // This 
implies skip flag is 0. - const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize, 0); - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_wide_log2[0]; - int init_depth = - (height != width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; - int idx, idy; - for (idy = 0; idy < height; idy += bh) - for (idx = 0; idx < width; idx += bw) - write_tx_size_vartx(cm, xd, mbmi, max_tx_size, init_depth, idy, idx, - w); -#if CONFIG_RECT_TX_EXT - if (is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && - quarter_txsize_lookup[bsize] != max_tx_size && - (mbmi->tx_size == quarter_txsize_lookup[bsize] || - mbmi->tx_size == max_tx_size)) { -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mbmi->tx_size != max_tx_size, - cm->fc->quarter_tx_size_cdf, 2); -#else - aom_write(w, mbmi->tx_size != max_tx_size, - cm->fc->quarter_tx_size_prob); -#endif - } -#endif - } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); - write_selected_tx_size(cm, xd, w); - } - } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); -#else - write_selected_tx_size(cm, xd, w); -#endif - } + if (mbmi->skip_mode) return; if (!is_inter) { - if (bsize >= BLOCK_8X8 || unify_bsize) { - write_intra_mode(ec_ctx, bsize, mode, w); - } else { - int idx, idy; - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; - write_intra_mode(ec_ctx, bsize, b_mode, w); - } - } + write_intra_mode(ec_ctx, bsize, mode, w); + const int use_angle_delta = av1_use_angle_delta(bsize); + + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + 
ec_ctx->angle_delta_cdf[mode - V_PRED]); } -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); -#else // !CONFIG_CB4X4 - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); -#endif // CONFIG_CB4X4 -#if CONFIG_CFL - if (mbmi->uv_mode == UV_CFL_PRED) { + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } -#endif - -#if CONFIG_CB4X4 } -#endif -#if CONFIG_EXT_INTRA - write_intra_angle_info(xd, ec_ctx, w); -#endif // CONFIG_EXT_INTRA if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mi, w); -#if CONFIG_FILTER_INTRA - if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#endif // CONFIG_FILTER_INTRA + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + + write_filter_intra_mode_info(cm, xd, mbmi, w); } else { int16_t mode_ctx; - write_ref_frames(cm, xd, w); -#if CONFIG_COMPOUND_SINGLEREF - if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // NOTE: Handle single ref comp mode - if (!is_compound) - aom_write(w, is_inter_singleref_comp_mode(mode), - av1_get_inter_mode_prob(cm, xd)); - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_COMPOUND_SINGLEREF - if (is_compound || is_inter_singleref_comp_mode(mode)) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_compound) -#endif // CONFIG_COMPOUND_SINGLEREF - mode_ctx = 
mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - else + av1_collect_neighbors_ref_counts(xd); + + write_ref_frames(cm, xd, w); - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); + mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); // If segment skip is not enabled code the mode. if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - if (bsize >= BLOCK_8X8 || unify_bsize) { - if (is_inter_compound_mode(mode)) - write_inter_compound_mode(cm, xd, w, mode, mode_ctx); -#if CONFIG_COMPOUND_SINGLEREF - else if (is_inter_singleref_comp_mode(mode)) - write_inter_singleref_comp_mode(xd, w, mode, mode_ctx); -#endif // CONFIG_COMPOUND_SINGLEREF - else if (is_inter_singleref_mode(mode)) - write_inter_mode(w, mode, ec_ctx, mode_ctx); - - if (mode == NEWMV || mode == NEW_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - mbmi->mode == SR_NEW_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF - have_nearmv_in_inter_mode(mode)) - write_drl_idx(ec_ctx, mbmi, mbmi_ext, w); - else - assert(mbmi->ref_mv_idx == 0); - } + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext, w); + else + assert(mbmi->ref_mv_idx == 0); } -#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION - write_mb_interp_filter(cpi, xd, w); -#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION - - if (bsize < BLOCK_8X8 && !unify_bsize) { -#if CONFIG_COMPOUND_SINGLEREF - /// NOTE: Single ref comp mode does not support sub8x8. 
- assert(is_compound || !is_inter_singleref_comp_mode(mbmi->mode)); -#endif // CONFIG_COMPOUND_SINGLEREF - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; - if (!is_compound) - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, j); - if (is_inter_compound_mode(b_mode)) - write_inter_compound_mode(cm, xd, w, b_mode, mode_ctx); - else if (is_inter_singleref_mode(b_mode)) - write_inter_mode(w, b_mode, ec_ctx, mode_ctx); - - if (b_mode == NEWMV || b_mode == NEW_NEWMV) { - for (ref = 0; ref < 1 + is_compound; ++ref) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, - &mi->bmi[j].ref_mv[ref].as_mv, nmvc, allow_hp); - } - } else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv, - &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp); - } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv, - &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp); - } - } - } - } else { - if (mode == NEWMV || mode == NEW_NEWMV) { - int_mv 
ref_mv; - for (ref = 0; ref < 1 + is_compound; ++ref) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0]; - av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, - allow_hp); - } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, - &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc, - allow_hp); - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, - &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc, - allow_hp); -#if CONFIG_COMPOUND_SINGLEREF - } else if ( // mode == SR_NEAREST_NEWMV || - mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || - mode == SR_NEW_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - int_mv ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0]; - if (mode == SR_NEW_NEWMV) - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, - allow_hp); - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + 
av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); -#endif // CONFIG_COMPOUND_SINGLEREF } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, 1); + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, 0); + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } -#if CONFIG_INTERINTRA if (cpi->common.reference_mode != COMPOUND_REFERENCE && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - cpi->common.allow_interintra_compound && is_interintra_allowed(mbmi)) { + cpi->common.seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); -#else - aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]); -#endif if (interintra) { aom_write_symbol(w, mbmi->interintra_mode, ec_ctx->interintra_mode_cdf[bsize_group], INTERINTRA_MODES); if (is_interintra_wedge_used(bsize)) { -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->use_wedge_interintra, ec_ctx->wedge_interintra_cdf[bsize], 2); -#else - aom_write(w, mbmi->use_wedge_interintra, - cm->fc->wedge_interintra_prob[bsize]); -#endif if (mbmi->use_wedge_interintra) { - aom_write_literal(w, mbmi->interintra_wedge_index, - get_wedge_bits_lookup(bsize)); + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], 16); assert(mbmi->interintra_wedge_sign == 0); } } } } -#endif // CONFIG_INTERINTRA - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - if (mbmi->ref_frame[1] != INTRA_FRAME) 
write_motion_mode(cm, xd, mi, w); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - write_ncobmc_mode(xd, mi, w); -#endif -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - - if ( -#if CONFIG_COMPOUND_SINGLEREF - is_inter_anyref_comp_mode(mbmi->mode) && -#else // !CONFIG_COMPOUND_SINGLEREF - cpi->common.reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) && -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_MOTION_VAR - mbmi->motion_mode == SIMPLE_TRANSLATION && -#endif // CONFIG_MOTION_VAR - is_any_masked_compound_used(bsize)) { -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - if (cm->allow_masked_compound) { -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - aom_write_bit(w, mbmi->interinter_compound_type == COMPOUND_AVERAGE); - else -#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - aom_write_symbol(w, mbmi->interinter_compound_type, - ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES); -#if CONFIG_WEDGE - if (is_interinter_compound_used(COMPOUND_WEDGE, bsize) && - mbmi->interinter_compound_type == COMPOUND_WEDGE) { - aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize)); - aom_write_bit(w, mbmi->wedge_sign); + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): jnt_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if 
(cm->seq_params.enable_jnt_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); } -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - if (mbmi->interinter_compound_type == COMPOUND_SEG) { - aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS); + } else { + assert(cpi->common.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - 1, + ec_ctx->compound_type_cdf[bsize], + COMPOUND_TYPES - 1); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], 16); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); } -#endif // CONFIG_COMPOUND_SEGMENT } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE } -#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION write_mb_interp_filter(cpi, xd, w); -#endif // CONFIG_DUAL_FILTE || CONFIG_WARPED_MOTION } +} -#if !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - w); -#endif // !CONFIG_TXK_SEL +static void write_intrabc_info(MACROBLOCKD *xd, + const MB_MODE_INFO_EXT *mbmi_ext, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); 
+ if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } } -static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, -#if CONFIG_INTRABC +static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, const MB_MODE_INFO_EXT *mbmi_ext, -#endif // CONFIG_INTRABC const int mi_row, const int mi_col, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; - const MODE_INFO *const mi = xd->mi[0]; - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - (void)mi_row; - (void)mi_col; + const PREDICTION_MODE mode = mbmi->mode; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); + + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); - if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id); + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); - const int skip = write_skip(cm, xd, mbmi->segment_id, mi, w); if (cm->delta_q_present_flag) { int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); - if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { - assert(mbmi->current_q_index > 0); + ((mi_row 
& (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); int reduced_delta_qindex = - (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; - write_delta_qindex(cm, xd, reduced_delta_qindex, w); - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL + (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { int reduced_delta_lflevel = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } -#else - if (cm->delta_lf_present_flag) { - int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_EXT_DELTA_Q } } - int enable_tx_size = cm->tx_mode == TX_MODE_SELECT && - 
block_signals_txsize(bsize) && - !xd->lossless[mbmi->segment_id]; - -#if CONFIG_INTRABC - if (av1_allow_intrabc(bsize, cm)) { - int use_intrabc = is_intrabc_block(mbmi); - aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); - if (use_intrabc) { - assert(mbmi->mode == DC_PRED); - assert(mbmi->uv_mode == UV_DC_PRED); - if (enable_tx_size && !mbmi->skip) write_selected_tx_size(cm, xd, w); - int_mv dv_ref = mbmi_ext->ref_mvs[INTRA_FRAME][0]; - av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); -#if CONFIG_EXT_TX && !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - w); -#endif // CONFIG_EXT_TX && !CONFIG_TXK_SEL - return; - } + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext, w); + if (is_intrabc_block(mbmi)) return; } -#endif // CONFIG_INTRABC - if (enable_tx_size) write_selected_tx_size(cm, xd, w); - if (bsize >= BLOCK_8X8 || unify_bsize) { - write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int block = idy * 2 + idx; - write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block, - mi->bmi[block].as_mode, w); - } - } + write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); } -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y)) { - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); -#else // !CONFIG_CB4X4 - 
write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); -#endif // CONFIG_CB4X4 - -#if CONFIG_CFL - if (mbmi->uv_mode == UV_CFL_PRED) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } -#endif - -#if CONFIG_CB4X4 } -#endif -#if CONFIG_EXT_INTRA - write_intra_angle_info(xd, ec_ctx, w); -#endif // CONFIG_EXT_INTRA + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mi, w); -#if CONFIG_FILTER_INTRA - if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#endif // CONFIG_FILTER_INTRA - -#if !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - w); -#endif // !CONFIG_TXK_SEL -} + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#if CONFIG_SUPERTX -#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col) \ - write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col) -#else -#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col) \ - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col) -#endif // CONFIG_SUPERTX + write_filter_intra_mode_info(cm, xd, mbmi, w); +} #if CONFIG_RD_DEBUG static void dump_mode_info(MODE_INFO *mi) { - printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row); - printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col); - printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type); - printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size); - if (mi->mbmi.sb_type >= BLOCK_8X8) { - printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode); - } else { - printf("&& mi->bmi[0].as_mode == %d\n", 
mi->bmi[0].as_mode); - } + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->sb_type == %d\n", mi->sb_type); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); } static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { -#if CONFIG_VAR_TX int r, c; -#endif printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); -#if CONFIG_VAR_TX printf("rd txb_coeff_cost_map\n"); for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { @@ -2308,7 +1282,6 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, } printf("\n"); } -#endif return 1; } return 0; @@ -2319,128 +1292,139 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - MODE_INFO *m; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); - m = xd->mi[0]; - if (is_inter_block(&m->mbmi)) { -#define FRAME_TO_CHECK 1 + const MB_MODE_INFO *const *mbmi = xd->mi[0]; + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) { - const MB_MODE_INFO *const mbmi = &m->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; int_mv mv[2]; - int is_comp_ref = has_second_ref(&m->mbmi); + int is_comp_ref = has_second_ref(mbmi); int ref; for (ref = 0; ref < 1 + is_comp_ref; ++ref) - mv[ref].as_mv = m->mbmi.mv[ref].as_mv; + mv[ref].as_mv = mbmi->mv[ref].as_mv; if (!is_comp_ref) { -#if CONFIG_COMPOUND_SINGLEREF - if (is_inter_singleref_comp_mode(m->mbmi.mode)) - mv[1].as_mv = m->mbmi.mv[1].as_mv; - else -#endif // CONFIG_COMPOUND_SINGLEREF - mv[1].as_int = 0; + 
mv[1].as_int = 0; } MACROBLOCK *const x = &cpi->td.mb; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const int16_t mode_ctx = av1_mode_context_analyzer( - mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); + const int16_t mode_ctx = + is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]] + : av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame); + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; int16_t zeromv_ctx = -1; int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { - zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - assert(mbmi->mode == ZEROMV); - } - if (mbmi->mode != ZEROMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; - if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; - if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; - } } - int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); printf( "=== ENCODER ===: " - "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " - "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, " - "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n", - cm->current_video_frame, mi_row, mi_col, mbmi->mode, bsize, - cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, - mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], - mbmi->motion_mode, mbmi_ext->mode_context[ref_frame_type], mode_ctx, - newmv_ctx, zeromv_ctx, refmv_ctx); + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode, + bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, 
mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); } } } #endif // ENC_MISMATCH_DEBUG static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col) { + aom_writer *w, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - MODE_INFO *m; int bh, bw; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); - m = xd->mi[0]; + MB_MODE_INFO *m = xd->mi[0]; - assert(m->mbmi.sb_type <= cm->sb_size || - (m->mbmi.sb_type >= BLOCK_SIZES && m->mbmi.sb_type < BLOCK_SIZES_ALL)); + assert(m->sb_type <= cm->seq_params.sb_size || + (m->sb_type >= BLOCK_SIZES && m->sb_type < BLOCK_SIZES_ALL)); - bh = mi_size_high[m->mbmi.sb_type]; - bw = mi_size_wide[m->mbmi.sb_type]; + bh = mi_size_high[m->sb_type]; + bw = mi_size_wide[m->sb_type]; cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cm, xd, -#if CONFIG_INTRABC - cpi->td.mb.mbmi_ext, -#endif // CONFIG_INTRABC - mi_row, mi_col, w); + write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w); } else { -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION // has_subpel_mv_component needs 
the ref frame buffers set up to look // up if they are scaled. has_subpel_mv_component is in turn needed by // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. - set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(&m->mbmi) && is_inter_singleref_comp_mode(m->mbmi.mode)) - xd->block_refs[1] = xd->block_refs[0]; -#endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION + set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); #if ENC_MISMATCH_DEBUG enc_dump_logs(cpi, mi_row, mi_col); #endif // ENC_MISMATCH_DEBUG - pack_inter_mode_mvs(cpi, mi_row, mi_col, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - w); + pack_inter_mode_mvs(cpi, mi_row, mi_col, w); + } +} + +static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x, + MB_MODE_INFO *const mbmi, aom_writer *w, + const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, + TOKEN_STATS *token_stats, const int row, + const int col, int *block, const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); + + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); + + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + int blk_row, blk_col; + + const int num_4x4_w = 
block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + + const int unit_height = + AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h); + const int unit_width = + AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w); + for (blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += bkh) { + for (blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->bit_depth, *block, blk_row, blk_col, max_tx_size, + token_stats); + *block += step; + } } } @@ -2449,167 +1433,48 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int mi_offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO *const m = *(cm->mi_grid_visible + mi_offset); - MB_MODE_INFO *const mbmi = &m->mbmi; + MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset); int plane; int bh, bw; -#if CONFIG_PVQ || CONFIG_LV_MAP MACROBLOCK *const x = &cpi->td.mb; (void)tok; (void)tok_end; -#endif xd->mi = cm->mi_grid_visible + mi_offset; - assert(mbmi->sb_type <= cm->sb_size || + assert(mbmi->sb_type <= cm->seq_params.sb_size || (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL)); bh = mi_size_high[mbmi->sb_type]; bw = mi_size_wide[mbmi->sb_type]; cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - for (plane = 0; plane <= 1; ++plane) { - const uint8_t palette_size_plane = - 
mbmi->palette_mode_info.palette_size[plane]; - if (palette_size_plane > 0) { -#if CONFIG_INTRABC - assert(mbmi->use_intrabc == 0); -#endif - int rows, cols; - assert(mbmi->sb_type >= BLOCK_8X8); - av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, - &cols); - assert(*tok < tok_end); - pack_map_tokens(w, tok, palette_size_plane, rows * cols); -#if !CONFIG_LV_MAP - assert(*tok < tok_end + mbmi->skip); -#endif // !CONFIG_LV_MAP - } - } -#endif // !CONFIG_PVQ - -#if CONFIG_COEF_INTERLEAVE - if (!mbmi->skip) { - const struct macroblockd_plane *const pd_y = &xd->plane[0]; - const struct macroblockd_plane *const pd_c = &xd->plane[1]; - const TX_SIZE tx_log2_y = mbmi->tx_size; - const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c); - const int tx_sz_y = (1 << tx_log2_y); - const int tx_sz_c = (1 << tx_log2_c); - - const BLOCK_SIZE plane_bsize_y = - get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y); - const BLOCK_SIZE plane_bsize_c = - get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c); - - const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y]; - const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c]; - const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y]; - const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c]; - - const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, - pd_y->subsampling_x); - const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, - pd_y->subsampling_y); - const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, - pd_c->subsampling_x); - const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, - pd_c->subsampling_y); - - // The max_4x4_w/h may be smaller than tx_sz under some corner cases, - // i.e. when the SB is splitted by tile boundaries. 
- const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_y = tu_num_w_y * tu_num_h_y; - const int tu_num_c = tu_num_w_c * tu_num_h_c; - - int tu_idx_y = 0, tu_idx_c = 0; - TOKEN_STATS token_stats; - init_token_stats(&token_stats); - - assert(*tok < tok_end); - - while (tu_idx_y < tu_num_y) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - tu_idx_y++; - - if (tu_idx_c < tu_num_c) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - tu_idx_c++; - } - } - - // In 422 case, it's possilbe that Chroma has more TUs than Luma - while (tu_idx_c < tu_num_c) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - tu_idx_c++; - } - } -#else // CONFIG_COEF_INTERLEAVE if (!mbmi->skip) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(*tok < tok_end); -#endif - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) { -#if !CONFIG_LV_MAP - (*tok)++; -#endif // !CONFIG_LV_MAP - continue; - } -#endif -#if CONFIG_VAR_TX - const struct macroblockd_plane *const pd = &xd->plane[plane]; - 
BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); -#endif - + if (!is_inter_block(mbmi)) + av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type); + + if (is_inter_block(mbmi)) { + int block[MAX_MB_PLANE] = { 0 }; + const BLOCK_SIZE plane_bsize = mbmi->sb_type; + assert(plane_bsize == get_plane_block_size(mbmi->sb_type, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int num_4x4_h = - block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + block_size_high[plane_bsize] >> tx_size_high_log2[0]; int row, col; TOKEN_STATS token_stats; init_token_stats(&token_stats); - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == + get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -2618,37 +1483,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); - if (is_inter_block(mbmi)) { - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); - int block = 0; - const int step = - tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - const int bkw = tx_size_wide_unit[max_tx_size]; - const int bkh = tx_size_high_unit[max_tx_size]; - assert(bkw <= mu_blocks_wide); - assert(bkh <= mu_blocks_high); - for (row = 0; row < num_4x4_h; row += mu_blocks_high) { - const int unit_height = 
AOMMIN(mu_blocks_high + row, num_4x4_h); - for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { - int blk_row, blk_col; - const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); - for (blk_row = row; blk_row < unit_height; blk_row += bkh) { - for (blk_col = col; blk_col < unit_width; blk_col += bkw) { - pack_txb_tokens(w, -#if CONFIG_LV_MAP - cm, -#endif - tok, tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - x, -#endif - xd, mbmi, plane, plane_bsize, cm->bit_depth, - block, blk_row, blk_col, max_tx_size, - &token_stats); - block += step; - } + for (row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, + pd->subsampling_x, pd->subsampling_y)) { + continue; } + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, + row, col, &block[plane], plane); } } #if CONFIG_RD_DEBUG @@ -2658,607 +1502,196 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, assert(0); } #endif // CONFIG_RD_DEBUG - } else { -#if CONFIG_LV_MAP - av1_write_coeffs_mb(cm, x, w, plane); -#else - const TX_SIZE tx = av1_get_tx_size(plane, xd); - const int bkw = tx_size_wide_unit[tx]; - const int bkh = tx_size_high_unit[tx]; - int blk_row, blk_col; - - for (row = 0; row < num_4x4_h; row += mu_blocks_high) { - for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { - const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h); - const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); - - for (blk_row = row; blk_row < unit_height; blk_row += bkh) { - for (blk_col = col; blk_col < unit_width; blk_col += bkw) { -#if !CONFIG_PVQ -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = - av1_get_tx_type(plane ? 
PLANE_TYPE_UV : PLANE_TYPE_Y, xd, - blk_row, blk_col, 0, tx); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx); -#endif - } - } - } - } -#endif // CONFIG_LV_MAP - } -#else - const TX_SIZE tx = av1_get_tx_size(plane, xd); - TOKEN_STATS token_stats; -#if !CONFIG_PVQ - init_token_stats(&token_stats); -#if CONFIG_LV_MAP - (void)tx; - av1_write_coeffs_mb(cm, x, w, plane); -#else // CONFIG_LV_MAP -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, - xd, blk_row, blk_col, 0, tx); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); -#endif // CONFIG_LV_MAP - -#else - (void)token_stats; - pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx); -#endif -#if CONFIG_RD_DEBUG - if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 && - rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { - dump_mode_info(m); - assert(0); } -#endif // CONFIG_RD_DEBUG -#endif // CONFIG_VAR_TX - -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; -#endif } } -#endif // CONFIG_COEF_INTERLEAVE } -#if CONFIG_MOTION_VAR && NC_MODE_INFO -static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - PARTITION_TYPE partition; - BLOCK_SIZE subsize; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - 
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - partition = get_partition(cm, mi_row, mi_col, bsize); - subsize = get_subsize(bsize, partition); - - if (subsize < BLOCK_8X8 && !unify_bsize) { - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - } else { - switch (partition) { - case PARTITION_NONE: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - break; - case PARTITION_HORZ: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - if (mi_row + hbs < cm->mi_rows) - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - break; - case PARTITION_VERT: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - if (mi_col + hbs < cm->mi_cols) - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - break; - case PARTITION_SPLIT: - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, - subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, - subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, - subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions -#endif - case PARTITION_HORZ_A: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); - break; - case PARTITION_VERT_A: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - 
break; - case PARTITION_VERT_B: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); +static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col) { + write_mbmi_b(cpi, tile, w, mi_row, mi_col); + + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type)); + int rows, cols; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + pack_map_tokens(w, tok, palette_size_plane, rows * cols); } } -} -#endif -static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col) { - write_mbmi_b(cpi, tile, w, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - mi_row, mi_col); + BLOCK_SIZE bsize = mbmi->sb_type; + int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + int skip = mbmi->skip; + int segment_id = mbmi->segment_id; + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. 
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int height = block_size_high[bsize] >> tx_size_high_log2[0]; + int idx, idy; + for (idy = 0; idy < height; idy += txbh) + for (idx = 0; idx < width; idx += txbw) + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, + skip && is_inter_block(mbmi), xd); + } -#if CONFIG_MOTION_VAR && NC_MODE_INFO - (void)tok; - (void)tok_end; -#else -#if !CONFIG_PVQ && CONFIG_SUPERTX - if (!supertx_enabled) -#endif - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); -#endif + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); } static void write_partition(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; - const int is_partition_point = bsize >= BLOCK_8X8; - const int ctx = is_partition_point - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize) - : 0; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - if (!is_partition_point) return; + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } if (has_rows && has_cols) { -#if CONFIG_EXT_PARTITION_TYPES - const int num_partition_types = - (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8]) - ? 
EXT_PARTITION_TYPES - : PARTITION_TYPES; -#else - const int num_partition_types = PARTITION_TYPES; -#endif - aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], num_partition_types); + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); } else if (!has_rows && has_cols) { assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx]); + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); - } else if (has_rows && !has_cols) { + } else { + assert(has_rows && !has_cols); assert(p == PARTITION_SPLIT || p == PARTITION_VERT); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx]); + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); - } else { - assert(p == PARTITION_SPLIT); } } -#if CONFIG_SUPERTX -#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col, bsize) \ - write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \ - bsize) -#else -#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col, bsize) \ - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize) -#endif // CONFIG_SUPERTX - static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col, BLOCK_SIZE bsize) { + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_EXT_PARTITION_TYPES const int quarter_step = mi_size_wide[bsize] / 4; int i; -#if 
CONFIG_EXT_PARTITION_TYPES_AB - const int qbs = mi_size_wide[bsize] / 4; -#endif // CONFIG_EXT_PARTITION_TYPES_AB -#endif // CONFIG_EXT_PARTITION_TYPES const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - -#if CONFIG_SUPERTX - const int mi_offset = mi_row * cm->mi_stride + mi_col; - MB_MODE_INFO *mbmi; - const int pack_token = !supertx_enabled; - TX_SIZE supertx_size; -#endif + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); -#if CONFIG_SUPERTX - mbmi = &cm->mi_grid_visible[mi_offset]->mbmi; - xd->mi = cm->mi_grid_visible + mi_offset; - set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col, - mi_size_wide[bsize], -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - if (!supertx_enabled && !frame_is_intra_only(cm) && - partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]) { - aom_prob prob; - supertx_size = max_txsize_lookup[bsize]; - prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size]; - supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size); - aom_write(w, supertx_enabled, prob); - } -#endif // CONFIG_SUPERTX - if (subsize < BLOCK_8X8 && !unify_bsize) { - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, - mi_col); - } else { - switch (partition) { - case PARTITION_NONE: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - break; - case PARTITION_HORZ: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - if (mi_row + hbs < cm->mi_rows) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, 
supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_VERT: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - if (mi_col + hbs < cm->mi_cols) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_SPLIT: - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs, subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + qbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - if (mi_row + 3 * qbs < cm->mi_rows) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + 3 * qbs, mi_col); - break; - case PARTITION_VERT_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + qbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_VERT_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - if 
(mi_col + 3 * qbs < cm->mi_cols) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + 3 * qbs); - break; -#else - case PARTITION_HORZ_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs); - break; - case PARTITION_VERT_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_VERT_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs); - break; -#endif - case PARTITION_HORZ_4: - for (i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && this_mi_row >= cm->mi_rows) break; - - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - this_mi_row, mi_col); - } - break; - case PARTITION_VERT_4: - for (i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, this_mi_col); + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; 
++plane) { + int rcol0, rcol1, rrow0, rrow1, tile_tl_idx; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1, + &tile_tl_idx)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = tile_tl_idx + rcol + rrow * rstride; + const RestorationUnitInfo *rui = + &cm->rst_info[plane].unit_info[runit_idx]; + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, + cpi->td.counts); } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); + } } } -#if CONFIG_SUPERTX - if (partition != PARTITION_NONE && supertx_enabled && pack_token) { - int skip; - const int bsw = mi_size_wide[bsize]; - const int bsh = mi_size_high[bsize]; - xd->mi = cm->mi_grid_visible + mi_offset; - supertx_size = mbmi->tx_size; - set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - - assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0)); - assert(mbmi->segment_id_supertx < MAX_SEGMENTS); - - skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w); - - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < cm->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < cm->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + 
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; -#if CONFIG_EXT_TX - if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && - !skip) { - const int eset = - get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); - const int tx_set_type = - get_ext_tx_set_type(supertx_size, bsize, 1, cm->reduced_tx_set_used); - if (eset > 0) { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][mbmi->tx_type], - ec_ctx->inter_ext_tx_cdf[eset][supertx_size], - av1_num_ext_tx_set[tx_set_type]); + write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); } - } -#else - if (supertx_size < TX_32X32 && !skip) { - aom_write_symbol(w, mbmi->tx_type, ec_ctx->inter_ext_tx_cdf[supertx_size], - TX_TYPES); - } 
-#endif // CONFIG_EXT_TX + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; - if (!skip) { - assert(*tok < tok_end); - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, - xd, blk_row, blk_col, block, tx_size); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd); - - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - - int row, col; - const TX_SIZE tx = av1_get_tx_size(plane, xd); - BLOCK_SIZE txb_size = txsize_to_bsize[tx]; - - const int stepr = tx_size_high_unit[txb_size]; - const int stepc = tx_size_wide_unit[txb_size]; - - TOKEN_STATS token_stats; - token_stats.cost = 0; - for (row = 0; row < max_blocks_high; row += stepr) - for (col = 0; col < max_blocks_wide; col += stepc) - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); } - } -#if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); - set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd); -#endif + break; + default: assert(0); } -#endif // CONFIG_SUPERTX -// update partition context -#if CONFIG_EXT_PARTITION_TYPES + // update partition context update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - 
if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_LPF_SB - // send filter level for each superblock (64x64) - if (bsize == cm->sb_size) { - if (mi_row == 0 && mi_col == 0) { - aom_write_literal(w, cm->mi_grid_visible[0]->mbmi.filt_lvl, 6); - cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0; - cm->mi_grid_visible[0]->mbmi.delta = 0; - cm->mi_grid_visible[0]->mbmi.sign = 0; - } else { - int prev_mi_row, prev_mi_col; - if (mi_col - MAX_MIB_SIZE < 0) { - prev_mi_row = mi_row - MAX_MIB_SIZE; - prev_mi_col = mi_col; - } else { - prev_mi_row = mi_row; - prev_mi_col = mi_col - MAX_MIB_SIZE; - } - MB_MODE_INFO *curr_mbmi = - &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi; - MB_MODE_INFO *prev_mbmi = - &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi; - - const uint8_t curr_lvl = curr_mbmi->filt_lvl; - const uint8_t prev_lvl = prev_mbmi->filt_lvl; - - const int reuse_prev_lvl = curr_lvl == prev_lvl; - const int reuse_ctx = prev_mbmi->reuse_sb_lvl; - curr_mbmi->reuse_sb_lvl = reuse_prev_lvl; - aom_write_symbol(w, reuse_prev_lvl, - xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2); - - if (reuse_prev_lvl) { - curr_mbmi->delta = 0; - curr_mbmi->sign = 0; - } else { - const unsigned int delta = abs(curr_lvl - prev_lvl) / LPF_STEP; - const int delta_ctx = prev_mbmi->delta; - curr_mbmi->delta = delta; - aom_write_symbol(w, delta, xd->tile_ctx->lpf_delta_cdf[delta_ctx], - DELTA_RANGE); - - if (delta) { - const int sign = curr_lvl > prev_lvl; - const int sign_ctx = prev_mbmi->sign; - curr_mbmi->sign = sign; - aom_write_symbol(w, sign, - xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2); - } else { - curr_mbmi->sign = 0; - } - } - } - } -#endif - -#if CONFIG_CDEF - if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) { - int width_step = mi_size_wide[BLOCK_64X64]; - int height_step = 
mi_size_high[BLOCK_64X64]; - int width, height; - for (height = 0; (height < mi_size_high[cm->sb_size]) && - (mi_row + height < cm->mi_rows); - height += height_step) { - for (width = 0; (width < mi_size_wide[cm->sb_size]) && - (mi_col + width < cm->mi_cols); - width += width_step) { - if (!sb_all_skip(cm, mi_row + height, mi_col + width)) - aom_write_literal( - w, - cm->mi_grid_visible[(mi_row + height) * cm->mi_stride + - (mi_col + width)] - ->mbmi.cdef_strength, - cm->cdef_bits); - } - } - } -#endif -#if CONFIG_LOOP_RESTORATION - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { - int rcol0, rcol1, rrow0, rrow1, nhtiles; - if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, - &rcol0, &rcol1, &rrow0, &rrow1, - &nhtiles)) { - for (int rrow = rrow0; rrow < rrow1; ++rrow) { - for (int rcol = rcol0; rcol < rcol1; ++rcol) { - int rtile_idx = rcol + rrow * nhtiles; - loop_restoration_write_sb_coeffs(cm, xd, w, plane, rtile_idx); - } - } - } - } -#endif } static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, @@ -3272,78 +1705,46 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, const int mi_col_end = tile->mi_col_end; int mi_row, mi_col; -#if CONFIG_DEPENDENT_HORZTILES - if (!cm->dependent_horz_tiles || mi_row_start == 0 || - tile->tg_horz_boundary) { - av1_zero_above_context(cm, mi_col_start, mi_col_end); - } -#else - av1_zero_above_context(cm, mi_col_start, mi_col_end); -#endif -#if CONFIG_PVQ - assert(cpi->td.mb.pvq_q->curr_pos == 0); -#endif + av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(cm, xd, tile->tile_row); + if (cpi->common.delta_q_present_flag) { - xd->prev_qindex = cpi->common.base_qindex; -#if CONFIG_EXT_DELTA_Q + xd->current_qindex = cpi->common.base_qindex; if (cpi->common.delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // 
CONFIG_LOOPFILTER_LEVEL - xd->prev_delta_lf_from_base = 0; + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } -#endif // CONFIG_EXT_DELTA_Q } - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) { + for (mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params.mib_size) { av1_zero_left_context(xd); - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) { - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col, - cm->sb_size); -#if CONFIG_MOTION_VAR && NC_MODE_INFO - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size); -#endif + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += cm->seq_params.mib_size) { + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, + cm->seq_params.sb_size); } } -#if CONFIG_PVQ - // Check that the number of PVQ blocks encoded and written to the bitstream - // are the same - assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos); - // Reset curr_pos in case we repack the bitstream - cpi->td.mb.pvq_q->curr_pos = 0; -#endif } -#if CONFIG_LOOP_RESTORATION static void encode_restoration_mode(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - int p; - RestorationInfo *rsi = &cm->rst_info[0]; - switch (rsi->frame_restoration_type) { - case RESTORE_NONE: - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, 0); - break; - case RESTORE_WIENER: - aom_wb_write_bit(wb, 1); - aom_wb_write_bit(wb, 0); - break; - case RESTORE_SGRPROJ: - aom_wb_write_bit(wb, 1); - aom_wb_write_bit(wb, 1); - break; - case RESTORE_SWITCHABLE: - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, 1); - break; - default: assert(0); - } - for (p = 1; p < MAX_MB_PLANE; ++p) { - rsi = &cm->rst_info[p]; + assert(!cm->all_lossless); + if (!cm->seq_params.enable_restoration) return; + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = 
&cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } switch (rsi->frame_restoration_type) { - case RESTORE_NONE: aom_wb_write_bit(wb, 0); break; + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; case RESTORE_WIENER: aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 0); @@ -3352,40 +1753,52 @@ static void encode_restoration_mode(AV1_COMMON *cm, aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 1); break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; default: assert(0); } } - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - rsi = &cm->rst_info[0]; - aom_wb_write_bit(wb, rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX); - if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) { - aom_wb_write_bit( - wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1)); + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 
128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); } } - int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); - if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { - aom_wb_write_bit(wb, - cm->rst_info[1].restoration_tilesize != - cm->rst_info[0].restoration_tilesize); - assert(cm->rst_info[1].restoration_tilesize == - cm->rst_info[0].restoration_tilesize || - cm->rst_info[1].restoration_tilesize == - (cm->rst_info[0].restoration_tilesize >> s)); - assert(cm->rst_info[2].restoration_tilesize == - cm->rst_info[1].restoration_tilesize); - } else if (!s) { - assert(cm->rst_info[1].restoration_tilesize == - cm->rst_info[0].restoration_tilesize); - assert(cm->rst_info[2].restoration_tilesize == - cm->rst_info[1].restoration_tilesize); + + if (num_planes > 1) { + int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } } } -static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info, +static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info, WienerInfo *ref_wiener_info, aom_writer 
*wb) { if (wiener_win == WIENER_WIN) aom_write_primitive_refsubexpfin( @@ -3428,78 +1841,106 @@ static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info, memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); } -static void write_sgrproj_filter(SgrprojInfo *sgrproj_info, +static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info, aom_writer *wb) { aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); - aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, - SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, - sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); - aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, - SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, - sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + const sgr_params_type *params = &sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, + const 
RestorationUnitInfo *rui, aom_writer *const w, int plane, - int rtile_idx) { + FRAME_COUNTS *counts) { const RestorationInfo *rsi = cm->rst_info + plane; - if (rsi->frame_restoration_type == RESTORE_NONE) return; + RestorationType frame_rtype = rsi->frame_restoration_type; + if (frame_rtype == RESTORE_NONE) return; + + (void)counts; + assert(!cm->all_lossless); const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; WienerInfo *wiener_info = xd->wiener_info + plane; SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + RestorationType unit_rtype = rui->restoration_type; - if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { - assert(plane == 0); - av1_write_token( - w, av1_switchable_restore_tree, cm->fc->switchable_restore_prob, - &switchable_restore_encodings[rsi->restoration_type[rtile_idx]]); - if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) { - write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - w); - } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) { - write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w); + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: + write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); + break; + case RESTORE_SGRPROJ: + write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; } - } else if (rsi->frame_restoration_type == RESTORE_WIENER) { - aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE, - RESTORE_NONE_WIENER_PROB); - if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) { - write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - w); + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != 
RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); } - } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { - aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE, - RESTORE_NONE_SGRPROJ_PROB); - if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) { - write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w); + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); } } } -#endif // CONFIG_LOOP_RESTORATION - static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->coded_lossless); + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); int i; struct loopfilter *lf = &cm->lf; -// Encode the loop filter level and type -#if !CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL + // Encode the loop filter level and type aom_wb_write_literal(wb, lf->filter_level[0], 6); aom_wb_write_literal(wb, lf->filter_level[1], 6); - if (lf->filter_level[0] || lf->filter_level[1]) { - aom_wb_write_literal(wb, lf->filter_level_u, 6); - aom_wb_write_literal(wb, lf->filter_level_v, 6); - } -#else - aom_wb_write_literal(wb, lf->filter_level, 6); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } aom_wb_write_literal(wb, lf->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or @@ -3508,48 +1949,58 @@ static void 
encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { if (lf->mode_ref_delta_enabled) { aom_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { - for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) { + const int prime_idx = cm->primary_ref_frame; + const int buf_idx = + prime_idx == PRIMARY_REF_NONE ? -1 : cm->frame_refs[prime_idx].idx; + int8_t last_ref_deltas[REF_FRAMES]; + if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) { + av1_set_default_ref_deltas(last_ref_deltas); + } else { + memcpy(last_ref_deltas, cm->buffer_pool->frame_bufs[buf_idx].ref_deltas, + REF_FRAMES); + } + for (i = 0; i < REF_FRAMES; i++) { const int delta = lf->ref_deltas[i]; - const int changed = delta != lf->last_ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; aom_wb_write_bit(wb, changed); - if (changed) { - lf->last_ref_deltas[i] = delta; - aom_wb_write_inv_signed_literal(wb, delta, 6); - } + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) { + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_mode_deltas, + cm->buffer_pool->frame_bufs[buf_idx].mode_deltas, + MAX_MODE_LF_DELTAS); + } for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { const int delta = lf->mode_deltas[i]; - const int changed = delta != lf->last_mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; aom_wb_write_bit(wb, changed); - if (changed) { - lf->last_mode_deltas[i] = delta; - aom_wb_write_inv_signed_literal(wb, delta, 6); - } + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } } } } -#if CONFIG_CDEF static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->coded_lossless); + if (!cm->seq_params.enable_cdef) return; + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); int i; -#if CONFIG_CDEF_SINGLEPASS aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2); 
assert(cm->cdef_pri_damping == cm->cdef_sec_damping); -#else - aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1); - aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2); -#endif aom_wb_write_literal(wb, cm->cdef_bits, 2); for (i = 0; i < cm->nb_cdef_strengths; i++) { aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS); - if (cm->subsampling_x == cm->subsampling_y) + if (num_planes > 1) aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS); } } -#endif static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { if (delta_q != 0) { @@ -3562,63 +2013,71 @@ static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { static void encode_quantization(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + const int num_planes = av1_num_planes(cm); + aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); - write_delta_q(wb, cm->uv_dc_delta_q); - write_delta_q(wb, cm->uv_ac_delta_q); -#if CONFIG_AOM_QM + if (num_planes > 1) { + int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) || + (cm->u_ac_delta_q != cm->v_ac_delta_q); + if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, cm->u_dc_delta_q); + write_delta_q(wb, cm->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, cm->v_dc_delta_q); + write_delta_q(wb, cm->v_ac_delta_q); + } + } aom_wb_write_bit(wb, cm->using_qmatrix); if (cm->using_qmatrix) { - aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS); - aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS); + aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS); + if (!cm->separate_uv_delta_q) + assert(cm->qm_u == cm->qm_v); + else + aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS); } -#endif } static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, struct aom_write_bit_buffer *wb) { int i, j; - const struct segmentation *seg = &cm->seg; + struct 
segmentation *seg = &cm->seg; aom_wb_write_bit(wb, seg->enabled); if (!seg->enabled) return; - // Segmentation map - if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { - aom_wb_write_bit(wb, seg->update_map); - } else { + // Write update flags + if (cm->primary_ref_frame == PRIMARY_REF_NONE) { assert(seg->update_map == 1); - } - if (seg->update_map) { - // Select the coding strategy (temporal or spatial) - av1_choose_segmap_coding_method(cm, xd); - - // Write out the chosen coding method. - if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { + seg->temporal_update = 0; + assert(seg->update_data == 1); + } else { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) { + // Select the coding strategy (temporal or spatial) + av1_choose_segmap_coding_method(cm, xd); aom_wb_write_bit(wb, seg->temporal_update); - } else { - assert(seg->temporal_update == 0); } + aom_wb_write_bit(wb, seg->update_data); } // Segmentation data - aom_wb_write_bit(wb, seg->update_data); if (seg->update_data) { - aom_wb_write_bit(wb, seg->abs_delta); - for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { const int active = segfeature_active(seg, i, j); aom_wb_write_bit(wb, active); if (active) { - const int data = get_segdata(seg, i, j); const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + const int data = clamp(get_segdata(seg, i, j), data_min, data_max); if (av1_is_segfeature_signed(j)) { - encode_unsigned_max(wb, abs(data), data_max); - aom_wb_write_bit(wb, data < 0); + aom_wb_write_inv_signed_literal(wb, data, ubits); } else { - encode_unsigned_max(wb, data, data_max); + aom_wb_write_literal(wb, data, ubits); } } } @@ -3628,26 +2087,11 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode, struct aom_write_bit_buffer *wb) { - if (cm->all_lossless) { + if (cm->coded_lossless) { *mode = 
ONLY_4X4; return; } -#if CONFIG_VAR_TX_NO_TX_MODE - (void)wb; - *mode = TX_MODE_SELECT; - return; -#else -#if CONFIG_TX64X64 - aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); - if (*mode != TX_MODE_SELECT) { - aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2); - if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64); - } -#else aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); - if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2); -#endif // CONFIG_TX64X64 -#endif // CONFIG_VAR_TX_NO_TX_MODE } static void write_frame_interp_filter(InterpFilter filter, @@ -3672,14 +2116,7 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { // Only one filter is used. So set the filter at frame level for (i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { -#if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) -#if CONFIG_WARPED_MOTION - if (i == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC) -#else - if (i == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC) -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) - cm->interp_filter = i; + if (i == EIGHTTAP_REGULAR) cm->interp_filter = i; break; } } @@ -3687,8 +2124,6 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { } } -#if CONFIG_MAX_TILE - // Same function as write_uniform but writing to uncompresses header wb static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { const int l = get_unsigned_bits(n); @@ -3704,10 +2139,10 @@ static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { static void write_tile_info_max_tile(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int width_sb = width_mi >> MAX_MIB_SIZE_LOG2; - int height_sb = height_mi >> MAX_MIB_SIZE_LOG2; + int width_mi = 
ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int width_sb = width_mi >> cm->seq_params.mib_size_log2; + int height_sb = height_mi >> cm->seq_params.mib_size_log2; int size_sb, i; aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag); @@ -3736,7 +2171,8 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm, // columns for (i = 0; i < cm->tile_cols; i++) { size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; - wb_write_uniform(wb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB), size_sb - 1); + wb_write_uniform(wb, AOMMIN(width_sb, cm->max_tile_width_sb), + size_sb - 1); width_sb -= size_sb; } assert(width_sb == 0); @@ -3751,72 +2187,45 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm, assert(height_sb == 0); } } -#endif static void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - const int tile_width = - ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> - cm->mib_size_log2; - const int tile_height = - ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> - cm->mib_size_log2; - - assert(tile_width > 0); - assert(tile_height > 0); - -// Write the tile sizes -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128) { - assert(tile_width <= 32); - assert(tile_height <= 32); - aom_wb_write_literal(wb, tile_width - 1, 5); - aom_wb_write_literal(wb, tile_height - 1, 5); - } else { -#endif // CONFIG_EXT_PARTITION - assert(tile_width <= 64); - assert(tile_height <= 64); - aom_wb_write_literal(wb, tile_width - 1, 6); - aom_wb_write_literal(wb, tile_height - 1, 6); -#if CONFIG_EXT_PARTITION - } -#endif // CONFIG_EXT_PARTITION - } else { -#endif // CONFIG_EXT_TILE - -#if CONFIG_MAX_TILE - write_tile_info_max_tile(cm, wb); -#else - int min_log2_tile_cols, max_log2_tile_cols, ones; - 
av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + write_tile_info_max_tile(cm, wb); - // columns - ones = cm->log2_tile_cols - min_log2_tile_cols; - while (ones--) aom_wb_write_bit(wb, 1); + *saved_wb = *wb; + if (cm->tile_rows * cm->tile_cols > 1) { + // tile id used for cdf update + aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 3, 2); + } +} - if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0); +static void write_ext_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + // This information is stored as a separate byte. + int mod = wb->bit_offset % CHAR_BIT; + if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); + assert(aom_wb_is_byte_aligned(wb)); - // rows - aom_wb_write_bit(wb, cm->log2_tile_rows != 0); - if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); -#endif -#if CONFIG_DEPENDENT_HORZTILES - if (cm->tile_rows > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles); -#endif -#if CONFIG_EXT_TILE + *saved_wb = *wb; + if (cm->tile_rows * cm->tile_cols > 1) { + // Note that the last item in the uncompressed header is the data + // describing tile configuration. 
+ // Number of bytes in tile column size - 1 + aom_wb_write_literal(wb, 0, 2); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 0, 2); } -#endif // CONFIG_EXT_TILE - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES } -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER static int get_refresh_mask_gf16(AV1_COMP *cpi) { + if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) + return 0xFF; + int refresh_mask = 0; if (cpi->refresh_last_frame || cpi->refresh_golden_frame || @@ -3829,11 +2238,12 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) { return refresh_mask; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static int get_refresh_mask(AV1_COMP *cpi) { + if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) + return 0xFF; + int refresh_mask = 0; -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi); #endif // USE_GF16_MULTI_LAYER @@ -3847,13 +2257,12 @@ static int get_refresh_mask(AV1_COMP *cpi) { // shifted and become the new virtual indexes for LAST2_FRAME and // LAST3_FRAME. 
refresh_mask |= - (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]); + (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]); - refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx); - refresh_mask |= (cpi->refresh_alt2_ref_frame << cpi->alt2_fb_idx); -#else // !CONFIG_EXT_REFS - refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx); -#endif // CONFIG_EXT_REFS + refresh_mask |= + (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]); + refresh_mask |= + (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]); if (av1_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our @@ -3866,26 +2275,19 @@ static int get_refresh_mask(AV1_COMP *cpi) { // Note: This is highly specific to the use of ARF as a forward reference, // and this needs to be generalized as other uses are implemented // (like RTC/temporal scalability). - return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx); + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); } else { -#if CONFIG_EXT_REFS - const int arf_idx = cpi->alt_fb_idx; -#else // !CONFIG_EXT_REFS - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } -#endif // CONFIG_EXT_REFS - return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[GOLDEN_FRAME - 1]) | (cpi->refresh_alt_ref_frame << arf_idx); } } -#if CONFIG_EXT_TILE static INLINE int find_identical_tile( const int tile_row, const int tile_col, - TileBufferEnc (*const tile_buffers)[1024]) { + TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { const MV32 candidate_offset[1] = { { 1, 0 } }; const uint8_t *const cur_tile_data = 
tile_buffers[tile_row][tile_col].data + 4; @@ -3933,577 +2335,486 @@ static INLINE int find_identical_tile( // No identical tile found return 0; } -#endif // CONFIG_EXT_TILE -#if !CONFIG_OBU || CONFIG_EXT_TILE -static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, - unsigned int *max_tile_size, - unsigned int *max_tile_col_size) { - const AV1_COMMON *const cm = &cpi->common; - aom_writer mode_bc; - int tile_row, tile_col; - TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; - TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; - uint32_t total_size = 0; - const int tile_cols = cm->tile_cols; - const int tile_rows = cm->tile_rows; - unsigned int tile_size = 0; - const int have_tiles = tile_cols * tile_rows > 1; - struct aom_write_bit_buffer wb = { dst, 0 }; - const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; - uint32_t compressed_hdr_size; - // Fixed size tile groups for the moment - const int num_tg_hdrs = cm->num_tg; - const int tg_size = -#if CONFIG_EXT_TILE - (cm->large_scale_tile) - ? 
1 - : -#endif // CONFIG_EXT_TILE - (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; - int tile_count = 0; - int tg_count = 1; - int tile_size_bytes = 4; - int tile_col_size_bytes; - uint32_t uncompressed_hdr_size = 0; - struct aom_write_bit_buffer tg_params_wb; - struct aom_write_bit_buffer tile_size_bytes_wb; - uint32_t saved_offset; - int mtu_size = cpi->oxcf.mtu; - int curr_tg_data_size = 0; - int hdr_size; +static void write_render_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + const int scaling_active = av1_resize_scaled(cm); + aom_wb_write_bit(wb, scaling_active); + if (scaling_active) { + aom_wb_write_literal(wb, cm->render_width - 1, 16); + aom_wb_write_literal(wb, cm->render_height - 1, 16); + } +} - *max_tile_size = 0; - *max_tile_col_size = 0; +static void write_superres_scale(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + const SequenceHeader *const seq_params = &cm->seq_params; + if (!seq_params->enable_superres) { + assert(cm->superres_scale_denominator == SCALE_NUMERATOR); + return; + } -// All tile size fields are output on 4 bytes. A call to remux_tiles will -// later compact the data if smaller headers are adequate. 
+ // First bit is whether to to scale or not + if (cm->superres_scale_denominator == SCALE_NUMERATOR) { + aom_wb_write_bit(wb, 0); // no scaling + } else { + aom_wb_write_bit(wb, 1); // scaling, write scale factor + assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN); + assert(cm->superres_scale_denominator < + SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS)); + aom_wb_write_literal( + wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN, + SUPERRES_SCALE_BITS); + } +} -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile_info; - const int is_last_col = (tile_col == tile_cols - 1); - const uint32_t col_offset = total_size; +static void write_frame_size(const AV1_COMMON *cm, int frame_size_override, + struct aom_write_bit_buffer *wb) { + const int coded_width = cm->superres_upscaled_width - 1; + const int coded_height = cm->superres_upscaled_height - 1; - av1_tile_set_col(&tile_info, cm, tile_col); + if (frame_size_override) { + const SequenceHeader *seq_params = &cm->seq_params; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + aom_wb_write_literal(wb, coded_width, num_bits_width); + aom_wb_write_literal(wb, coded_height, num_bits_height); + } - // The last column does not have a column header - if (!is_last_col) total_size += 4; + write_superres_scale(cm, wb); + write_render_size(cm, wb); +} - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int data_offset = have_tiles ? 
4 : 0; - const int tile_idx = tile_row * tile_cols + tile_col; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - av1_tile_set_row(&tile_info, cm, tile_row); +static void write_frame_size_with_refs(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int found = 0; - buf->data = dst + total_size; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); - // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, - // even for the last one, unless no tiling is used at all. - total_size += data_offset; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif - aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); - assert(tok == tok_end); - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif - buf->size = tile_size; - - // Record the maximum tile size we see, so we can compact headers later. - *max_tile_size = AOMMAX(*max_tile_size, tile_size); + if (cfg != NULL) { + found = cm->superres_upscaled_width == cfg->y_crop_width && + cm->superres_upscaled_height == cfg->y_crop_height; + found &= cm->render_width == cfg->render_width && + cm->render_height == cfg->render_height; + } + aom_wb_write_bit(wb, found); + if (found) { + write_superres_scale(cm, wb); + break; + } + } - if (have_tiles) { - // tile header: size of this tile, or copy offset - uint32_t tile_header = tile_size; - const int tile_copy_mode = - ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) - ? 
1 - : 0; + if (!found) { + int frame_size_override = 1; // Always equal to 1 in this function + write_frame_size(cm, frame_size_override, wb); + } +} - // If tile_copy_mode = 1, check if this tile is a copy tile. - // Very low chances to have copy tiles on the key frames, so don't - // search on key frames to reduce unnecessary search. - if (cm->frame_type != KEY_FRAME && tile_copy_mode) { - const int idendical_tile_offset = - find_identical_tile(tile_row, tile_col, tile_buffers); +static void write_profile(BITSTREAM_PROFILE profile, + struct aom_write_bit_buffer *wb) { + assert(profile >= PROFILE_0 && profile < MAX_PROFILES); + aom_wb_write_literal(wb, profile, PROFILE_BITS); +} - if (idendical_tile_offset > 0) { - tile_size = 0; - tile_header = idendical_tile_offset | 0x80; - tile_header <<= 24; - } - } +static void write_bitdepth(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + // Profile 0/1: [0] for 8 bit, [1] 10-bit + // Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit + aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1); + if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 
0 : 1); + } +} - mem_put_le32(buf->data, tile_header); +static void write_color_config(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + write_bitdepth(cm, wb); + const int is_monochrome = cm->seq_params.monochrome; + // monochrome bit + if (cm->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED && + cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, cm->color_primaries, 8); + aom_wb_write_literal(wb, cm->transfer_characteristics, 8); + aom_wb_write_literal(wb, cm->matrix_coefficients, 8); + } + if (is_monochrome) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, cm->color_range); + return; + } + if (cm->color_primaries == AOM_CICP_CP_BT_709 && + cm->transfer_characteristics == AOM_CICP_TC_SRGB && + cm->matrix_coefficients == + AOM_CICP_MC_IDENTITY) { // it would be better to remove this + // dependency too + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + assert(cm->profile == PROFILE_1 || + (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, cm->color_range); + if (cm->profile == PROFILE_0) { + // 420 only + assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } else if (cm->profile == PROFILE_1) { + // 444 only + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + } else if (cm->profile == PROFILE_2) { + if (cm->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, cm->subsampling_x); + if (cm->subsampling_x == 0) { + assert(cm->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, cm->subsampling_y); } - - total_size += tile_size; - } - - if (!is_last_col) { - uint32_t col_size = total_size - col_offset - 4; - mem_put_le32(dst + col_offset, col_size); - - // If it is not final packing, record the maximum tile column size we - // see, otherwise, check if the tile size is out of the range. - *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + } else { + // 422 only + assert(cm->subsampling_x == 1 && cm->subsampling_y == 0); } } - } else { -#endif // CONFIG_EXT_TILE - write_uncompressed_header_frame(cpi, &wb); - -#if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - total_size = aom_wb_bytes_written(&wb); - return (uint32_t)total_size; + if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + } + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { + aom_wb_write_literal(wb, cm->chroma_sample_position, 2); } -#endif // CONFIG_EXT_REFS + } + aom_wb_write_bit(wb, cm->separate_uv_delta_q); +} - // Write the tile length code - tile_size_bytes_wb = wb; - aom_wb_write_literal(&wb, 3, 2); +static void write_timing_info_header(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick, + 32); // Number of units in tick + aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale, + 32); // Time scale + aom_wb_write_bit( + wb, + 
cm->timing_info.equal_picture_interval); // Equal picture interval bit + if (cm->timing_info.equal_picture_interval) { + aom_wb_write_uvlc( + wb, + cm->timing_info.num_ticks_per_picture - 1); // ticks per picture + } +} - /* Write a placeholder for the number of tiles in each tile group */ - tg_params_wb = wb; - saved_offset = wb.bit_offset; - if (have_tiles) { - aom_wb_overwrite_literal(&wb, 3, n_log2_tiles); - aom_wb_overwrite_literal(&wb, (1 << n_log2_tiles) - 1, n_log2_tiles); - } +static void write_decoder_model_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick, + 32); // Number of units in decoding tick + aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5); + aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1, + 5); +} - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = 0; - } else { - /* Write a placeholder for the compressed header length */ - struct aom_write_bit_buffer comp_hdr_len_wb = wb; - aom_wb_write_literal(&wb, 0, 16); - - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = - write_compressed_header(cpi, dst + uncompressed_hdr_size); - aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(compressed_hdr_size), - 16); - } +static void write_dec_model_op_parameters(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb, + int op_num) { + if (op_num > MAX_NUM_OPERATING_POINTS) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Encoder does not support %d decoder model operating points", op_num); - hdr_size = uncompressed_hdr_size + compressed_hdr_size; - total_size += hdr_size; + // aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters); + // if (!cm->op_params[op_num].has_parameters) return; - for (tile_row = 0; 
tile_row < tile_rows; tile_row++) { - TileInfo tile_info; - const int is_last_row = (tile_row == tile_rows - 1); - av1_tile_set_row(&tile_info, cm, tile_row); + aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay, + cm->buffer_model.encoder_decoder_buffer_delay_length); - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int is_last_col = (tile_col == tile_cols - 1); - const int is_last_tile = is_last_col && is_last_row; - - if ((!mtu_size && tile_count > tg_size) || - (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) { - // New tile group - tg_count++; - // We've exceeded the packet size - if (tile_count > 1) { - /* The last tile exceeded the packet size. The tile group size - should therefore be tile_count-1. 
- Move the last tile and insert headers before it - */ - uint32_t old_total_size = total_size - tile_size - 4; - memmove(dst + old_total_size + hdr_size, dst + old_total_size, - (tile_size + 4) * sizeof(uint8_t)); - // Copy uncompressed header - memmove(dst + old_total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header before the one we've just inserted - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, - n_log2_tiles); - // Update the pointer to the last TG params - tg_params_wb.bit_offset = saved_offset + 8 * old_total_size; - // Copy compressed header - memmove(dst + old_total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, - compressed_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 1; - curr_tg_data_size = hdr_size + tile_size + 4; - } else { - // We exceeded the packet size in just one tile - // Copy uncompressed header - memmove(dst + total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, - n_log2_tiles); - tg_params_wb.bit_offset = saved_offset + 8 * total_size; - // Copy compressed header - memmove(dst + total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, - compressed_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 0; - curr_tg_data_size = hdr_size; - } - } - tile_count++; - av1_tile_set_col(&tile_info, cm, tile_col); + aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay, + cm->buffer_model.encoder_decoder_buffer_delay_length); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif - buf->data = dst + total_size; + 
aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag); - // The last tile does not have a header. - if (!is_last_tile) total_size += 4; + cm->op_frame_timing[op_num].buffer_removal_delay = + 0; // reset the decoded frame counter +} - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif // CONFIG_ANS -#if CONFIG_LOOP_RESTORATION - for (int p = 0; p < MAX_MB_PLANE; ++p) { - set_default_wiener(cpi->td.mb.e_mbd.wiener_info + p); - set_default_sgrproj(cpi->td.mb.e_mbd.sgrproj_info + p); - } -#endif // CONFIG_LOOP_RESTORATION +static void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, (uint32_t)cm->tu_presentation_delay, + cm->buffer_model.frame_presentation_delay_length); +} - aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); -#if !CONFIG_LV_MAP -#if !CONFIG_PVQ - assert(tok == tok_end); -#endif // !CONFIG_PVQ -#endif // !CONFIG_LV_MAP - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif +static void write_film_grain_params(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + aom_film_grain_t *pars = &cm->film_grain_params; - assert(tile_size > 0); + cm->cur_frame->film_grain_params = *pars; - curr_tg_data_size += tile_size + 4; - buf->size = tile_size; + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; - if (!is_last_tile) { - *max_tile_size = AOMMAX(*max_tile_size, tile_size); - // size of this tile - mem_put_le32(buf->data, tile_size); - } + aom_wb_write_literal(wb, pars->random_seed, 16); - total_size += tile_size; + 
pars->random_seed += 3245; // For film grain test vectors purposes + if (!pars->random_seed) // Random seed should not be zero + pars->random_seed += 1735; + if (cm->frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + else + pars->update_parameters = 1; + if (!pars->update_parameters) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int ref_frame, ref_idx, buf_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cpi, ref_frame); + assert(ref_idx != INVALID_IDX); + buf_idx = cm->ref_frame_map[ref_idx]; + if (frame_bufs[buf_idx].film_grain_params_present && + memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) { + break; } } - // Write the final tile group size - if (n_log2_tiles) { - aom_wb_overwrite_literal( - &tg_params_wb, (tile_cols * tile_rows) - tile_count, n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); - } - // Remux if possible. 
TODO (Thomas Davies): do this for more than one tile - // group - if (have_tiles && tg_count == 1) { - int data_size = - total_size - (uncompressed_hdr_size + compressed_hdr_size); - data_size = - remux_tiles(cm, dst + uncompressed_hdr_size + compressed_hdr_size, - data_size, *max_tile_size, *max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); - total_size = data_size + uncompressed_hdr_size + compressed_hdr_size; - aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2); + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling functions parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params.monochrome) + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + else + pars->chroma_scaling_from_luma = 0; // for monochrome override to 0 + + if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || + ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); } -#if CONFIG_EXT_TILE + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } } -#endif // CONFIG_EXT_TILE - return (uint32_t)total_size; -} -#endif -static void write_render_size(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { - const int scaling_active = !av1_resize_unscaled(cm); - aom_wb_write_bit(wb, 
scaling_active); - if (scaling_active) { - aom_wb_write_literal(wb, cm->render_width - 1, 16); - aom_wb_write_literal(wb, cm->render_height - 1, 16); + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponsing scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 8 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); } -} -#if CONFIG_FRAME_SUPERRES -static void write_superres_scale(const AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb) { - // First bit is whether to to scale or not - if (cm->superres_scale_denominator == SCALE_NUMERATOR) { - aom_wb_write_bit(wb, 0); // no scaling - } else { - aom_wb_write_bit(wb, 1); // scaling, write scale factor - aom_wb_write_literal( - wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN, - SUPERRES_SCALE_BITS); + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); } + + aom_wb_write_bit(wb, pars->overlap_flag); + + 
aom_wb_write_bit(wb, pars->clip_to_restricted_range); } -#endif // CONFIG_FRAME_SUPERRES -static void write_frame_size(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { -#if CONFIG_FRAME_SUPERRES - aom_wb_write_literal(wb, cm->superres_upscaled_width - 1, 16); - aom_wb_write_literal(wb, cm->superres_upscaled_height - 1, 16); - write_superres_scale(cm, wb); -#else - aom_wb_write_literal(wb, cm->width - 1, 16); - aom_wb_write_literal(wb, cm->height - 1, 16); -#endif // CONFIG_FRAME_SUPERRES - write_render_size(cm, wb); +static void write_sb_size(SequenceHeader *seq_params, + struct aom_write_bit_buffer *wb) { + (void)seq_params; + (void)wb; + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } -static void write_frame_size_with_refs(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb) { +void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; - int found = 0; + SequenceHeader *seq_params = &cm->seq_params; - MV_REFERENCE_FRAME ref_frame; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); + int max_frame_width = cpi->oxcf.forced_max_frame_width + ? cpi->oxcf.forced_max_frame_width + : cpi->oxcf.width; + int max_frame_height = cpi->oxcf.forced_max_frame_height + ? cpi->oxcf.forced_max_frame_height + : cpi->oxcf.height; + const int num_bits_width = + (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1; + const int num_bits_height = + (max_frame_height > 1) ? 
get_msb(max_frame_height - 1) + 1 : 1; + assert(num_bits_width <= 16); + assert(num_bits_height <= 16); + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + aom_wb_write_literal(wb, num_bits_width - 1, 4); + aom_wb_write_literal(wb, num_bits_height - 1, 4); + aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width); + aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height); - if (cfg != NULL) { -#if CONFIG_FRAME_SUPERRES - found = cm->superres_upscaled_width == cfg->y_crop_width && - cm->superres_upscaled_height == cfg->y_crop_height; -#else - found = - cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; -#endif - found &= cm->render_width == cfg->render_width && - cm->render_height == cfg->render_height; - } - aom_wb_write_bit(wb, found); - if (found) { -#if CONFIG_FRAME_SUPERRES - write_superres_scale(cm, wb); -#endif // CONFIG_FRAME_SUPERRES - break; + /* Placeholder for actually writing to the bitstream */ + if (!seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = + cm->large_scale_tile ? 0 : cm->error_resilient_mode; + seq_params->frame_id_length = FRAME_ID_LENGTH; + seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. 
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); } } - if (!found) write_frame_size(cm, wb); -} + write_sb_size(seq_params, wb); -static void write_profile(BITSTREAM_PROFILE profile, - struct aom_write_bit_buffer *wb) { - switch (profile) { - case PROFILE_0: aom_wb_write_literal(wb, 0, 2); break; - case PROFILE_1: aom_wb_write_literal(wb, 2, 2); break; - case PROFILE_2: aom_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: aom_wb_write_literal(wb, 6, 3); break; - default: assert(0); - } -} + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); -static void write_bitdepth_colorspace_sampling( - AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - if (cm->profile >= PROFILE_2) { - assert(cm->bit_depth > AOM_BITS_8); - aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1); - } -#if CONFIG_COLORSPACE_HEADERS - aom_wb_write_literal(wb, cm->color_space, 5); - aom_wb_write_literal(wb, cm->transfer_function, 5); -#else - aom_wb_write_literal(wb, cm->color_space, 3); -#endif - if (cm->color_space != AOM_CS_SRGB) { - // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] - aom_wb_write_bit(wb, cm->color_range); - if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); - aom_wb_write_bit(wb, cm->subsampling_x); - aom_wb_write_bit(wb, cm->subsampling_y); - aom_wb_write_bit(wb, 0); // unused + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->enable_order_hint); + + if (seq_params->enable_order_hint) { + aom_wb_write_bit(wb, seq_params->enable_jnt_comp); + aom_wb_write_bit(wb, seq_params->enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); } else { - assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); } -#if CONFIG_COLORSPACE_HEADERS - if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { - aom_wb_write_literal(wb, cm->chroma_sample_position, 2); + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); } -#endif - } else { - assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); - aom_wb_write_bit(wb, 0); // unused + if (seq_params->enable_order_hint) + aom_wb_write_literal(wb, seq_params->order_hint_bits_minus_1, 3); } -} -#if CONFIG_REFERENCE_BUFFER -void write_sequence_header(AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb) { - SequenceHeader *seq_params = &cm->seq_params; - /* Placeholder for actually writing to the bitstream */ - seq_params->frame_id_numbers_present_flag = -#if CONFIG_EXT_TILE - 
cm->large_scale_tile ? 0 : -#endif // CONFIG_EXT_TILE - FRAME_ID_NUMBERS_PRESENT_FLAG; - seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; - seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; - - aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); - if (seq_params->frame_id_numbers_present_flag) { - aom_wb_write_literal(wb, seq_params->frame_id_length_minus7, 4); - aom_wb_write_literal(wb, seq_params->delta_frame_id_length_minus2, 4); - } + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); } -#endif // CONFIG_REFERENCE_BUFFER -static void write_sb_size(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { - (void)cm; - (void)wb; - assert(cm->mib_size == mi_size_wide[cm->sb_size]); - assert(cm->mib_size == 1 << cm->mib_size_log2); -#if CONFIG_EXT_PARTITION - assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64); - aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 
1 : 0); -#else - assert(cm->sb_size == BLOCK_64X64); -#endif // CONFIG_EXT_PARTITION -} - -static void write_compound_tools(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { - (void)cm; - (void)wb; -#if CONFIG_INTERINTRA - if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) { - aom_wb_write_bit(wb, cm->allow_interintra_compound); - } else { - assert(cm->allow_interintra_compound == 0); - } -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm)) { -#else // !CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) { -#endif // CONFIG_COMPOUND_SINGLEREF - aom_wb_write_bit(wb, cm->allow_masked_compound); - } else { - assert(cm->allow_masked_compound == 0); - } -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -} - -#if CONFIG_GLOBAL_MOTION static void write_global_motion_params(const WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_write_bit_buffer *wb, int allow_hp) { - TransformationType type = params->wmtype; - int trans_bits; - int trans_prec_diff; + const TransformationType type = params->wmtype; aom_wb_write_bit(wb, type != IDENTITY); if (type != IDENTITY) { -#if GLOBAL_TRANS_TYPES > 4 - aom_wb_write_literal(wb, type - 1, GLOBAL_TYPE_BITS); -#else aom_wb_write_bit(wb, type == ROTZOOM); if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); -#endif // GLOBAL_TRANS_TYPES > 4 - } - - switch (type) { - case HOMOGRAPHY: - case HORTRAPEZOID: - case VERTRAPEZOID: - if (type != HORTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), - (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); - if (type != VERTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), - (params->wmmat[7] >> 
GM_ROW3HOMO_PREC_DIFF)); - // fallthrough intended - case AFFINE: - case ROTZOOM: - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS), - (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); - if (type != VERTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), - (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); - if (type >= AFFINE) { - if (type != HORTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), - (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS), - (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS)); - } - // fallthrough intended - case TRANSLATION: - trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp - : GM_ABS_TRANS_BITS; - trans_prec_diff = (type == TRANSLATION) - ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp - : GM_TRANS_PREC_DIFF; - aom_wb_write_signed_primitive_refsubexpfin( - wb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[0] >> trans_prec_diff), - (params->wmmat[0] >> trans_prec_diff)); - aom_wb_write_signed_primitive_refsubexpfin( - wb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[1] >> trans_prec_diff), - (params->wmmat[1] >> trans_prec_diff)); - break; - case IDENTITY: break; - default: assert(0); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); } } @@ -4513,8 +2824,8 @@ static void write_global_motion(AV1_COMP *cpi, int frame; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const WarpedMotionParams *ref_params = - cm->error_resilient_mode ? &default_warp_params - : &cm->prev_frame->global_motion[frame]; + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; write_global_motion_params(&cm->global_motion[frame], ref_params, wb, cm->allow_high_precision_mv); // TODO(sarahparker, debargha): The logic in the commented out code below @@ -4541,820 +2852,452 @@ static void write_global_motion(AV1_COMP *cpi, */ } } -#endif -#if !CONFIG_OBU -static void write_uncompressed_header_frame(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb) { +static void check_frame_refs_short_signaling(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - - aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2); - - write_profile(cm->profile, wb); - -#if CONFIG_EXT_TILE - aom_wb_write_literal(wb, cm->large_scale_tile, 1); -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_REFS - // NOTE: By default all coded frames to be used as a reference - cm->is_reference_frame = 1; + if (!cm->frame_refs_short_signaling) return; - if (cm->show_existing_frame) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a 
reconstructed frame", - frame_to_show); + // Check whether all references are distinct frames. + int buf_markers[FRAME_BUFFERS] = { 0 }; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + if (buf_idx != INVALID_IDX) { + assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); + buf_markers[buf_idx] = 1; } - ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - - aom_wb_write_bit(wb, 1); // show_existing_frame - aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); - -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; - aom_wb_write_literal(wb, display_frame_id, frame_id_len); - /* Add a zero byte to prevent emulation of superframe marker */ - /* Same logic as when when terminating the entropy coder */ - /* Consider to have this logic only one place */ - aom_wb_write_literal(wb, 0, 8); - } -#endif // CONFIG_REFERENCE_BUFFER - - return; - } else { -#endif // CONFIG_EXT_REFS - aom_wb_write_bit(wb, 0); // show_existing_frame -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS - - aom_wb_write_bit(wb, cm->frame_type); - aom_wb_write_bit(wb, cm->show_frame); - if (cm->frame_type != KEY_FRAME) - if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only); - aom_wb_write_bit(wb, cm->error_resilient_mode); - if (frame_is_intra_only(cm)) { -#if CONFIG_REFERENCE_BUFFER - write_sequence_header(cm, wb); -#endif // CONFIG_REFERENCE_BUFFER + int num_refs = 0; + for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) { + num_refs += buf_markers[buf_idx]; } -#if CONFIG_REFERENCE_BUFFER - cm->invalid_delta_frame_id_minus1 = 0; - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); - } -#endif // 
CONFIG_REFERENCE_BUFFER - if (cm->frame_type == KEY_FRAME) { - write_bitdepth_colorspace_sampling(cm, wb); - write_frame_size(cm, wb); - write_sb_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if CONFIG_AMVR - if (cm->allow_screen_content_tools) { - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, 1); - } else { - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0); - } - } -#endif - } else { - if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - if (cm->intra_only) { - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } else { - aom_wb_write_bit(wb, - cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } - } -#endif -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); -#endif // CONFIG_EXT_REFS - - if (cm->intra_only) { - write_bitdepth_colorspace_sampling(cm, wb); - -#if CONFIG_EXT_REFS - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - write_frame_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - } else { - MV_REFERENCE_FRAME ref_frame; -#if CONFIG_EXT_REFS - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, 
get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); -#if !CONFIG_FRAME_SIGN_BIAS - aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); -#endif // !CONFIG_FRAME_SIGN_BIAS -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); - } -#endif // CONFIG_REFERENCE_BUFFER - } - -#if CONFIG_FRAME_SIGN_BIAS -#define FRAME_SIGN_BIAS_DEBUG 0 -#if FRAME_SIGN_BIAS_DEBUG - { - printf("\n\nENCODER: Frame=%d, show_frame=%d:", cm->current_video_frame, - cm->show_frame); - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - printf(" sign_bias[%d]=%d", ref_frame, - cm->ref_frame_sign_bias[ref_frame]); - } - printf("\n"); - } -#endif // FRAME_SIGN_BIAS_DEBUG -#undef FRAME_SIGN_BIAS_DEBUG -#endif // CONFIG_FRAME_SIGN_BIAS - -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - write_frame_size_with_refs(cpi, wb); -#endif - -#if CONFIG_AMVR - if (cm->seq_mv_precision_level == 2) { - 
aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0); - } -#endif - aom_wb_write_bit(wb, cm->allow_high_precision_mv); - - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); - } -#endif - } + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // It indicates that there exist more than one reference frame pointing to + // the same reference buffer, i.e. two or more references are duplicate. + cm->frame_refs_short_signaling = 0; + return; } -#if CONFIG_FRAME_MARKER - if (cm->show_frame == 0) { - int arf_offset = AOMMIN( - (MAX_GF_INTERVAL - 1), - cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS - int brf_offset = - cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; + // Check whether the encoder side ref frame choices are aligned with that to + // be derived at the decoder side. + RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME]; - arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif - aom_wb_write_literal(wb, arf_offset, 4); - } -#endif + // Backup the frame refs info + memcpy(frame_refs_copy, cm->frame_refs, + INTER_REFS_PER_FRAME * sizeof(RefBuffer)); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - cm->refresh_mask = - cm->frame_type == KEY_FRAME ? 
0xFF : get_refresh_mask(cpi); - } -#endif // CONFIG_REFERENCE_BUFFER + const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); - if (!cm->error_resilient_mode) { - aom_wb_write_bit( - wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD); - } -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); -#endif - encode_loopfilter(cm, wb); - encode_quantization(cm, wb); - encode_segmentation(cm, xd, wb); - { - int i; - struct segmentation *const seg = &cm->seg; - int segment_quantizer_active = 0; - for (i = 0; i < MAX_SEGMENTS; i++) { - if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { - segment_quantizer_active = 1; - } - } + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, lst_map_idx, gld_map_idx); - if (cm->delta_q_present_flag) - assert(segment_quantizer_active == 0 && cm->base_qindex > 0); - if (segment_quantizer_active == 0 && cm->base_qindex > 0) { - aom_wb_write_bit(wb, cm->delta_q_present_flag); - if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); - xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - assert(seg->abs_delta == SEGMENT_DELTADATA); - aom_wb_write_bit(wb, cm->delta_lf_present_flag); - if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); - xd->prev_delta_lf_from_base = 0; -#if CONFIG_LOOPFILTER_LEVEL - aom_wb_write_bit(wb, cm->delta_lf_multi); - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - } -#endif // CONFIG_EXT_DELTA_Q - } + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. 
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions. + if (cm->frame_refs[ref_idx].idx != frame_refs_copy[ref_idx].idx) { + cm->frame_refs_short_signaling = 0; + break; } } -#if CONFIG_CDEF - if (!cm->all_lossless) { - encode_cdef(cm, wb); - } -#endif -#if CONFIG_LOOP_RESTORATION - encode_restoration_mode(cm, wb); -#endif // CONFIG_LOOP_RESTORATION - write_tx_mode(cm, &cm->tx_mode, wb); - - if (cpi->allow_comp_inter_inter) { - const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; -#if !CONFIG_REF_ADAPT - const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; -#endif // !CONFIG_REF_ADAPT - aom_wb_write_bit(wb, use_hybrid_pred); -#if !CONFIG_REF_ADAPT - if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); -#endif // !CONFIG_REF_ADAPT +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_video_frame); + printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. " + "dec_ref(map_idx=%d, buf_idx=%d)=%d\n", + get_ref_frame_map_idx(cpi, ref_frame), + get_ref_frame_buf_idx(cpi, ref_frame), ref_frame, + cm->frame_refs[ref_frame - LAST_FRAME].map_idx, + cm->frame_refs[ref_frame - LAST_FRAME].idx, ref_frame); } - write_compound_tools(cm, wb); - -#if CONFIG_EXT_TX - aom_wb_write_bit(wb, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - -#if CONFIG_ADAPT_SCAN - aom_wb_write_bit(wb, cm->use_adapt_scan); -#endif - -#if CONFIG_GLOBAL_MOTION - if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); -#endif // CONFIG_GLOBAL_MOTION +#endif // 0 - write_tile_info(cm, wb); + // Restore the frame refs info if frame_refs_short_signaling is off. 
+ if (!cm->frame_refs_short_signaling) + memcpy(cm->frame_refs, frame_refs_copy, + INTER_REFS_PER_FRAME * sizeof(RefBuffer)); } -#else // New function based on HLS R18 static void write_uncompressed_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; -#if CONFIG_EXT_TILE - aom_wb_write_literal(wb, cm->large_scale_tile, 1); -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_REFS // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; + cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type; - if (cm->show_existing_frame) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; + if (cm->seq_params.still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(cm->frame_type == KEY_FRAME); + } + if (!cm->seq_params.reduced_still_picture_hdr) { + if (cm->show_existing_frame) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a reconstructed frame", - frame_to_show); + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a reconstructed frame", + frame_to_show); + } + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (cm->seq_params.frame_id_numbers_present_flag) { + int 
frame_id_len = cm->seq_params.frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + + if (cm->reset_decoder_state && + frame_bufs[frame_to_show].frame_type != KEY_FRAME) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "show_existing_frame to reset state on KEY_FRAME only"); + } + + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame } - ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - aom_wb_write_bit(wb, 1); // show_existing_frame - aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + aom_wb_write_literal(wb, cm->frame_type, 2); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; - aom_wb_write_literal(wb, display_frame_id, frame_id_len); - /* Add a zero byte to prevent emulation of superframe marker */ - /* Same logic as when when terminating the entropy coder */ - /* Consider to have this logic only one place */ - aom_wb_write_literal(wb, 0, 8); + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); } -#endif // CONFIG_REFERENCE_BUFFER + if (frame_is_sframe(cm)) { + assert(cm->error_resilient_mode); + } else if (!(cm->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, cm->error_resilient_mode); + } + } + aom_wb_write_bit(wb, cm->disable_cdf_update); - return; + if (cm->seq_params.force_screen_content_tools == 2) { + aom_wb_write_bit(wb, cm->allow_screen_content_tools); } else { -#endif // CONFIG_EXT_REFS - aom_wb_write_bit(wb, 0); // show_existing_frame -#if CONFIG_EXT_REFS + assert(cm->allow_screen_content_tools == + 
cm->seq_params.force_screen_content_tools); } -#endif // CONFIG_EXT_REFS - cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type; - aom_wb_write_literal(wb, cm->frame_type, 2); + if (cm->allow_screen_content_tools) { + if (cm->seq_params.force_integer_mv == 2) { + aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv); + } else { + assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv); + } + } else { + assert(cm->cur_frame_force_integer_mv == 0); + } - if (cm->intra_only) cm->frame_type = INTRA_ONLY_FRAME; + cm->invalid_delta_frame_id_minus_1 = 0; + int frame_size_override_flag = 0; + cm->frame_refs_short_signaling = 0; - aom_wb_write_bit(wb, cm->show_frame); - aom_wb_write_bit(wb, cm->error_resilient_mode); + if (cm->seq_params.reduced_still_picture_hdr) { + assert(cm->width == cm->seq_params.max_frame_width && + cm->height == cm->seq_params.max_frame_height); + } else { + if (cm->seq_params.frame_id_numbers_present_flag) { + int frame_id_len = cm->seq_params.frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } -#if CONFIG_REFERENCE_BUFFER - cm->invalid_delta_frame_id_minus1 = 0; - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + if (cm->width > cm->seq_params.max_frame_width || + cm->height > cm->seq_params.max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) ? 
1 + : (cm->width != cm->seq_params.max_frame_width || + cm->height != cm->seq_params.max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (cm->seq_params.enable_order_hint) + aom_wb_write_literal(wb, cm->frame_offset, + cm->seq_params.order_hint_bits_minus_1 + 1); + + if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (cm->seq_params.decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cm->buffer_removal_delay_present); + if (cm->buffer_removal_delay_present) { + for (int op_num = 0; + op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) { + if (cm->op_params[op_num].decoder_model_param_present_flag) { + if (((cm->seq_params.operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (cm->seq_params.operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1) || + cm->seq_params.operating_point_idc[op_num] == 0) { + aom_wb_write_literal( + wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay, + cm->buffer_model.buffer_removal_delay_length); + cm->op_frame_timing[op_num].buffer_removal_delay++; + } + } + } + } } -#endif // CONFIG_REFERENCE_BUFFER + cpi->refresh_frame_mask = get_refresh_mask(cpi); if (cm->frame_type == KEY_FRAME) { - write_frame_size(cm, wb); - write_sb_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if CONFIG_AMVR - if (cm->allow_screen_content_tools) { - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, 1); + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); + } else { + assert(cpi->refresh_frame_mask == 
0xFF); + } + } else { + if (cm->frame_type == INTRA_ONLY_FRAME) { + assert(cpi->refresh_frame_mask != 0xFF); + int updated_fb = -1; + for (int i = 0; i < REF_FRAMES; i++) { + // If more than one frame is refreshed, it doesn't matter which one + // we pick, so pick the first. + if (cpi->refresh_frame_mask & (1 << i)) { + updated_fb = i; + break; + } + } + assert(updated_fb >= 0); + cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); + } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) { + if (cm->frame_type == INTER_FRAME) { + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); } else { - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0); + assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF); + } + int updated_fb = -1; + for (int i = 0; i < REF_FRAMES; i++) { + // If more than one frame is refreshed, it doesn't matter which one + // we pick, so pick the first. 
+ if (cpi->refresh_frame_mask & (1 << i)) { + updated_fb = i; + break; + } } - } -#endif - } else if (cm->frame_type == INTRA_ONLY_FRAME) { - if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - if (cm->intra_only) { - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); + // large scale tile sometimes won't refresh any fbs + if (updated_fb >= 0) { + cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; } - } -#endif -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); -#endif // CONFIG_EXT_REFS - if (cm->intra_only) { -#if CONFIG_EXT_REFS - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - write_frame_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - } - } else if (cm->frame_type == INTER_FRAME) { - MV_REFERENCE_FRAME ref_frame; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); + if (!cpi->refresh_frame_mask) { + // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame + // will not be used as a reference + cm->is_reference_frame = 0; + } } -#endif + } -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: 
"cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); -#if !CONFIG_FRAME_SIGN_BIAS - aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); -#endif // !CONFIG_FRAME_SIGN_BIAS -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); + if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Get buffer index + const int buf_idx = cm->ref_frame_map[ref_idx]; + assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); + + // Write order hint to bit stream + aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset, + cm->seq_params.order_hint_bits_minus_1 + 1); } -#endif // CONFIG_REFERENCE_BUFFER } + } -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - write_frame_size_with_refs(cpi, wb); -#endif + if (cm->frame_type == KEY_FRAME) { + write_frame_size(cm, 
frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !cm->allow_intrabc); + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, cm->allow_intrabc); + // all eight fbs are refreshed, pick one that will live long enough + cm->fb_of_context_type[REGULAR_FRAME] = 0; + } else { + if (cm->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !cm->allow_intrabc); + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, cm->allow_intrabc); + } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; -#if CONFIG_AMVR - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0); - } -#endif - aom_wb_write_bit(wb, cm->allow_high_precision_mv); + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically. +#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); - } -#endif - } else if (cm->frame_type == S_FRAME) { - MV_REFERENCE_FRAME ref_frame; - -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } -#endif + if (cm->frame_refs_short_signaling) { + // NOTE(zoeliu@google.com): + // An example solution for encoder-side implementation on frame refs + // short signaling, which is only turned on when the encoder side + // decision on ref frames is identical to 
that at the decoder side. + check_frame_refs_short_signaling(cpi); + } -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); - assert(cm->ref_frame_sign_bias[ref_frame] == 0); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); + if (cm->seq_params.enable_order_hint) + aom_wb_write_bit(wb, cm->frame_refs_short_signaling); + + if (cm->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); + + const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + if (!cm->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); + if (cm->seq_params.frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cpi, ref_frame); + int frame_id_len = cm->seq_params.frame_id_length; + int diff_len = 
cm->seq_params.delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) + cm->invalid_delta_frame_id_minus_1 = 1; + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } } -#endif // CONFIG_REFERENCE_BUFFER - } - -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - write_frame_size_with_refs(cpi, wb); -#endif - aom_wb_write_bit(wb, cm->allow_high_precision_mv); + if (!cm->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cpi, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); + if (cm->cur_frame_force_integer_mv) { + cm->allow_high_precision_mv = 0; + } else { + aom_wb_write_bit(wb, cm->allow_high_precision_mv); + } + fix_interp_filter(cm, cpi->td.counts); + write_frame_interp_filter(cm->interp_filter, wb); + aom_wb_write_bit(wb, cm->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, cm->allow_ref_frame_mvs); + } else { + assert(cm->allow_ref_frame_mvs == 0); + } } -#endif - } - -#if CONFIG_MFMV - if (cm->show_frame == 0) { - int arf_offset = AOMMIN( - (MAX_GF_INTERVAL - 1), - cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS - int brf_offset = - cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; - - arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif - aom_wb_write_literal(wb, arf_offset, 4); } -#endif -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - 
cm->refresh_mask = - cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi); - } -#endif // CONFIG_REFERENCE_BUFFER + const int might_bwd_adapt = + !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update); + if (cm->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; - if (!cm->error_resilient_mode) { + if (might_bwd_adapt) { aom_wb_write_bit( - wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD); + wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); } -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); -#endif - encode_loopfilter(cm, wb); + + write_tile_info(cm, saved_wb, wb); encode_quantization(cm, wb); encode_segmentation(cm, xd, wb); - { - int i; - struct segmentation *const seg = &cm->seg; - int segment_quantizer_active = 0; - for (i = 0; i < MAX_SEGMENTS; i++) { - if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { - segment_quantizer_active = 1; - } - } - if (cm->delta_q_present_flag) - assert(segment_quantizer_active == 0 && cm->base_qindex > 0); - if (segment_quantizer_active == 0 && cm->base_qindex > 0) { - aom_wb_write_bit(wb, cm->delta_q_present_flag); - if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); - xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - assert(seg->abs_delta == SEGMENT_DELTADATA); + if (cm->delta_q_present_flag) assert(cm->base_qindex > 0); + if (cm->base_qindex > 0) { + aom_wb_write_bit(wb, cm->delta_q_present_flag); + if (cm->delta_q_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); + xd->current_qindex = cm->base_qindex; + if (cm->allow_intrabc) + assert(cm->delta_lf_present_flag == 0); + else aom_wb_write_bit(wb, cm->delta_lf_present_flag); - if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; 
++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - xd->prev_delta_lf_from_base = 0; - } -#endif // CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); + aom_wb_write_bit(wb, cm->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } } -#if CONFIG_CDEF - if (!cm->all_lossless) { - encode_cdef(cm, wb); + + if (cm->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!cm->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); } -#endif -#if CONFIG_LOOP_RESTORATION - encode_restoration_mode(cm, wb); -#endif // CONFIG_LOOP_RESTORATION + write_tx_mode(cm, &cm->tx_mode, wb); if (cpi->allow_comp_inter_inter) { const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; -#if !CONFIG_REF_ADAPT - const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; -#endif // !CONFIG_REF_ADAPT aom_wb_write_bit(wb, use_hybrid_pred); -#if !CONFIG_REF_ADAPT - if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); -#endif // !CONFIG_REF_ADAPT } - write_compound_tools(cm, wb); - -#if CONFIG_EXT_TX - aom_wb_write_bit(wb, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - -#if CONFIG_GLOBAL_MOTION - if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); -#endif // CONFIG_GLOBAL_MOTION - - write_tile_info(cm, wb); -} -#endif // CONFIG_OBU - -static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { - AV1_COMMON *const cm = &cpi->common; -#if CONFIG_SUPERTX - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; -#endif // CONFIG_SUPERTX - FRAME_CONTEXT *const fc = cm->fc; - aom_writer *header_bc; - int i; -#if !CONFIG_NEW_MULTISYMBOL - FRAME_COUNTS *counts = cpi->td.counts; - int j; -#endif - - const int probwt = cm->num_tg; - (void)probwt; - (void)i; - (void)fc; - - aom_writer real_header_bc; - header_bc = &real_header_bc; -#if CONFIG_ANS - header_bc->size = 1 << 
cpi->common.ans_window_size_log2; -#endif - aom_start_encode(header_bc, data); - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (cm->tx_mode == TX_MODE_SELECT) - av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob, - cm->counts.quarter_tx_size, probwt); -#endif -#if CONFIG_LV_MAP - av1_write_txb_probs(cpi, header_bc); -#endif // CONFIG_LV_MAP -#if CONFIG_VAR_TX && !CONFIG_NEW_MULTISYMBOL - if (cm->tx_mode == TX_MODE_SELECT) - update_txfm_partition_probs(cm, header_bc, counts, probwt); -#endif - -#if !CONFIG_NEW_MULTISYMBOL - update_skip_probs(cm, header_bc, counts); -#endif - - if (!frame_is_intra_only(cm)) { -#if !CONFIG_NEW_MULTISYMBOL - update_inter_mode_probs(cm, header_bc, counts); -#endif -#if CONFIG_INTERINTRA - if (cm->reference_mode != COMPOUND_REFERENCE && - cm->allow_interintra_compound) { -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { - if (is_interintra_allowed_bsize_group(i)) { - av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i], - cm->counts.interintra[i], probwt); - } - } -#endif -#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL -#if CONFIG_EXT_PARTITION_TYPES - int block_sizes_to_update = BLOCK_SIZES_ALL; -#else - int block_sizes_to_update = BLOCK_SIZES; -#endif - for (i = 0; i < block_sizes_to_update; i++) { - if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) - av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i], - cm->counts.wedge_interintra[i], probwt); - } -#endif // CONFIG_WEDGE && CONFIG_NEW_MULTISYMBOL - } -#endif // CONFIG_INTERINTRA + if (cm->is_skip_mode_allowed) aom_wb_write_bit(wb, cm->skip_mode_flag); -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i], - counts->intra_inter[i], probwt); -#endif + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, cm->allow_warped_motion); + else + assert(!cm->allow_warped_motion); -#if !CONFIG_NEW_MULTISYMBOL 
- if (cpi->allow_comp_inter_inter) { - const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; - if (use_hybrid_pred) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i], - counts->comp_inter[i], probwt); - } + aom_wb_write_bit(wb, cm->reduced_tx_set_used); - if (cm->reference_mode != COMPOUND_REFERENCE) { - for (i = 0; i < REF_CONTEXTS; i++) { - for (j = 0; j < (SINGLE_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j], - counts->single_ref[i][j], probwt); - } - } - } + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); - if (cm->reference_mode != SINGLE_REFERENCE) { -#if CONFIG_EXT_COMP_REFS - for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_type_prob[i], - counts->comp_ref_type[i], probwt); - - for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++) - for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++) - av1_cond_prob_diff_update(header_bc, &fc->uni_comp_ref_prob[i][j], - counts->uni_comp_ref[i][j], probwt); -#endif // CONFIG_EXT_COMP_REFS - - for (i = 0; i < REF_CONTEXTS; i++) { -#if CONFIG_EXT_REFS - for (j = 0; j < (FWD_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], - counts->comp_ref[i][j], probwt); - } - for (j = 0; j < (BWD_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j], - counts->comp_bwdref[i][j], probwt); - } -#else - for (j = 0; j < (COMP_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], - counts->comp_ref[i][j], probwt); - } -#endif // CONFIG_EXT_REFS - } + if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { + int flip_back_update_parameters_flag = 0; + if (cm->frame_type != INTER_FRAME && + cm->film_grain_params.update_parameters == 0) { + cm->film_grain_params.update_parameters = 1; + flip_back_update_parameters_flag = 1; } -#endif // CONFIG_NEW_MULTISYMBOL - -#if 
CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_inter_mode_prob[i], - counts->comp_inter_mode[i], probwt); -#endif // CONFIG_COMPOUND_SINGLEREF + write_film_grain_params(cpi, wb); -#if !CONFIG_NEW_MULTISYMBOL - av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv); -#endif -#if CONFIG_SUPERTX - if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc); -#endif // CONFIG_SUPERTX + if (flip_back_update_parameters_flag) + cm->film_grain_params.update_parameters = 0; } - aom_stop_encode(header_bc); - assert(header_bc->pos <= 0xffff); - return header_bc->pos; + + if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb); } -#if !CONFIG_OBU || CONFIG_EXT_TILE static int choose_size_bytes(uint32_t size, int spare_msbs) { // Choose the number of bytes required to represent size, without // using the 'spare_msbs' number of most significant bits. @@ -5394,116 +3337,112 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, int tsb; int tcsb; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { // The top bit in the tile size field indicates tile copy mode, so we // have 1 less bit to code the tile size tsb = choose_size_bytes(max_tile_size, 1); tcsb = choose_size_bytes(max_tile_col_size, 0); } else { -#endif // CONFIG_EXT_TILE tsb = choose_size_bytes(max_tile_size, 0); tcsb = 4; // This is ignored (void)max_tile_col_size; -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE assert(tsb > 0); assert(tcsb > 0); *tile_size_bytes = tsb; *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; - if (tsb == 4 && tcsb == 4) { - return data_size; - } else { - uint32_t wpos = 0; - uint32_t rpos = 0; - -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - int tile_row; - int tile_col; - - for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { - // All but the last column has a column header - if (tile_col < cm->tile_cols - 1) { - uint32_t 
tile_col_size = mem_get_le32(dst + rpos); - rpos += 4; - - // Adjust the tile column size by the number of bytes removed - // from the tile size fields. - tile_col_size -= (4 - tsb) * cm->tile_rows; - - mem_put_varsize(dst + wpos, tcsb, tile_col_size); - wpos += tcsb; - } + uint32_t wpos = 0; + uint32_t rpos = 0; - for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { - // All, including the last row has a header - uint32_t tile_header = mem_get_le32(dst + rpos); - rpos += 4; - - // If this is a copy tile, we need to shift the MSB to the - // top bit of the new width, and there is no data to copy. - if (tile_header >> 31 != 0) { - if (tsb < 4) tile_header >>= 32 - 8 * tsb; - mem_put_varsize(dst + wpos, tsb, tile_header); - wpos += tsb; - } else { - mem_put_varsize(dst + wpos, tsb, tile_header); - wpos += tsb; + if (cm->large_scale_tile) { + int tile_row; + int tile_col; - memmove(dst + wpos, dst + rpos, tile_header); - rpos += tile_header; - wpos += tile_header; - } - } + for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + // All but the last column has a column header + if (tile_col < cm->tile_cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * cm->tile_rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; } - } else { -#endif // CONFIG_EXT_TILE - const int n_tiles = cm->tile_cols * cm->tile_rows; - int n; - for (n = 0; n < n_tiles; n++) { - int tile_size; + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + // All, including the last row has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; - if (n == n_tiles - 1) { - tile_size = data_size - rpos; + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. 
+ if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; } else { - tile_size = mem_get_le32(dst + rpos); - rpos += 4; - mem_put_varsize(dst + wpos, tsb, tile_size); + mem_put_varsize(dst + wpos, tsb, tile_header); wpos += tsb; - } - - memmove(dst + wpos, dst + rpos, tile_size); - rpos += tile_size; - wpos += tile_size; + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } } -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE assert(rpos > wpos); assert(rpos == data_size); return wpos; } + const int n_tiles = cm->tile_cols * cm->tile_rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; } -#endif -#if CONFIG_OBU -static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst) { +uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - aom_wb_write_literal(&wb, (int)obu_type, 5); - aom_wb_write_literal(&wb, 0, 2); + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); aom_wb_write_literal(&wb, obu_extension ? 
1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field + aom_wb_write_literal(&wb, 0, 1); // reserved + if (obu_extension) { aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); } @@ -5512,87 +3451,156 @@ static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, return size; } -static uint32_t write_temporal_delimiter_obu() { return 0; } +int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size, + uint8_t *dest) { + const uint32_t obu_size = obu_payload_size; + const uint32_t offset = obu_header_size; + size_t coded_obu_size = 0; -static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - struct aom_write_bit_buffer wb = { dst, 0 }; - uint32_t size = 0; + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } - write_profile(cm->profile, &wb); + return AOM_CODEC_OK; +} - aom_wb_write_literal(&wb, 0, 4); +static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const uint32_t move_dst_offset = + (uint32_t)length_field_size + obu_header_size; + const uint32_t move_src_offset = obu_header_size; + const uint32_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} - seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG; - aom_wb_write_literal(&wb, seq_params->frame_id_numbers_present_flag, 1); - if (seq_params->frame_id_numbers_present_flag) { - seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; - seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; - aom_wb_write_literal(&wb, seq_params->frame_id_length_minus7, 4); - aom_wb_write_literal(&wb, seq_params->delta_frame_id_length_minus2, 4); +static 
void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); } +} - // color_config - write_bitdepth_colorspace_sampling(cm, &wb); - - size = aom_wb_bytes_written(&wb); - return size; +static void write_bitstream_level(BitstreamLevel bl, + struct aom_write_bit_buffer *wb) { + uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl); + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } -static uint32_t write_frame_header_obu(AV1_COMP *cpi, uint8_t *const dst) { +static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { AV1_COMMON *const cm = &cpi->common; struct aom_write_bit_buffer wb = { dst, 0 }; - uint32_t total_size = 0; - uint32_t compressed_hdr_size, uncompressed_hdr_size; + uint32_t size = 0; + + write_profile(cm->profile, &wb); - write_uncompressed_header_obu(cpi, &wb); + // Still picture or not + aom_wb_write_bit(&wb, cm->seq_params.still_picture); + assert(IMPLIES(!cm->seq_params.still_picture, + !cm->seq_params.reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr); + + if (cm->seq_params.reduced_still_picture_hdr) { + assert(cm->timing_info_present == 0); + assert(cm->seq_params.decoder_model_info_present_flag == 0); + assert(cm->seq_params.display_model_info_present_flag == 0); + write_bitstream_level(cm->seq_params.level[0], &wb); + } else { + aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag - if (cm->show_existing_frame) { - total_size = aom_wb_bytes_written(&wb); - return total_size; + if (cm->timing_info_present) { + // timing_info + write_timing_info_header(cm, &wb); + aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag); + if (cm->seq_params.decoder_model_info_present_flag) { + 
write_decoder_model_info(cm, &wb); + } + } + aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag); + aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(cm->seq_params.level[i], &wb); + if (cm->seq_params.level[i].major > 3) + aom_wb_write_bit(&wb, cm->seq_params.tier[i]); + if (cm->seq_params.decoder_model_info_present_flag) { + aom_wb_write_bit(&wb, + cm->op_params[i].decoder_model_param_present_flag); + if (cm->op_params[i].decoder_model_param_present_flag) + write_dec_model_op_parameters(cm, &wb, i); + } + if (cm->seq_params.display_model_info_present_flag) { + aom_wb_write_bit(&wb, + cm->op_params[i].display_model_param_present_flag); + if (cm->op_params[i].display_model_param_present_flag) { + assert(cm->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1, + 4); + } + } + } } + write_sequence_header(cpi, &wb); - // write the tile length code (Always 4 bytes for now) - aom_wb_write_literal(&wb, 3, 2); + write_color_config(cm, &wb); - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = 0; - } else { - // placeholder for the compressed header length - struct aom_write_bit_buffer compr_hdr_len_wb = wb; - aom_wb_write_literal(&wb, 0, 16); + aom_wb_write_bit(&wb, cm->film_grain_params_present); - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = - write_compressed_header(cpi, dst + uncompressed_hdr_size); - aom_wb_overwrite_literal(&compr_hdr_len_wb, (int)(compressed_hdr_size), 16); - } + add_trailing_bits(&wb); - total_size = uncompressed_hdr_size + compressed_hdr_size; - return total_size; + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t 
write_frame_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); } static uint32_t write_tile_group_header(uint8_t *const dst, int startTile, - int endTile, int tiles_log2) { + int endTile, int tiles_log2, + int tile_start_and_end_present_flag) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - aom_wb_write_literal(&wb, startTile, tiles_log2); - aom_wb_write_literal(&wb, endTile, tiles_log2); + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, startTile, tiles_log2); + aom_wb_write_literal(&wb, endTile, tiles_log2); + } size = aom_wb_bytes_written(&wb); return size; } +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, - unsigned int *max_tile_size, - unsigned int *max_tile_col_size, - uint8_t *const frame_header_obu_location, - uint32_t frame_header_obu_size, - int insert_frame_header_obu_flag) { - const AV1_COMMON *const cm = &cpi->common; + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info) { + AV1_COMMON *const cm = &cpi->common; aom_writer mode_bc; int tile_row, tile_col; TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; @@ -5601,29 +3609,53 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; unsigned int tile_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; // Fixed size 
tile groups for the moment const int num_tg_hdrs = cm->num_tg; const int tg_size = -#if CONFIG_EXT_TILE (cm->large_scale_tile) ? 1 - : -#endif // CONFIG_EXT_TILE - (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; int curr_tg_data_size = 0; uint8_t *data = dst; int new_tg = 1; -#if CONFIG_EXT_TILE const int have_tiles = tile_cols * tile_rows > 1; -#endif + int first_tg = 1; - *max_tile_size = 0; - *max_tile_col_size = 0; + cm->largest_tile_id = 0; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data); + data += tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, saved_wb, data, 0); + data += frame_header_size; + total_size += frame_header_size; + +#define EXT_TILE_DEBUG 0 +#if EXT_TILE_DEBUG + { + char fn[20] = "./fh"; + fn[4] = cm->current_video_frame / 100 + '0'; + fn[5] = (cm->current_video_frame % 100) / 10 + '0'; + fn[6] = (cm->current_video_frame % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(data - frame_header_size, + frame_header_size, fn); + } +#endif // EXT_TILE_DEBUG +#undef EXT_TILE_DEBUG + + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile_info; const int is_last_col = (tile_col == tile_cols - 1); @@ -5643,7 +3675,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; av1_tile_set_row(&tile_info, cm, tile_row); - buf->data = dst + total_size; + buf->data = dst + total_size + tg_hdr_size; // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, // even for the last one, unless no tiling is used at all. 
@@ -5651,29 +3683,25 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Initialise tile context from the frame context this_tile->tctx = *cm->fc; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif + mode_bc.allow_update_cdf = !cm->large_scale_tile; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); assert(tok == tok_end); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif buf->size = tile_size; // Record the maximum tile size we see, so we can compact headers later. - *max_tile_size = AOMMAX(*max_tile_size, tile_size); + if (tile_size > max_tile_size) { + max_tile_size = tile_size; + cm->largest_tile_id = tile_cols * tile_row + tile_col; + } if (have_tiles) { // tile header: size of this tile, or copy offset - uint32_t tile_header = tile_size; + uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; const int tile_copy_mode = ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1 @@ -5683,12 +3711,12 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Very low chances to have copy tiles on the key frames, so don't // search on key frames to reduce unnecessary search. 
if (cm->frame_type != KEY_FRAME && tile_copy_mode) { - const int idendical_tile_offset = + const int identical_tile_offset = find_identical_tile(tile_row, tile_col, tile_buffers); - if (idendical_tile_offset > 0) { + if (identical_tile_offset > 0) { tile_size = 0; - tile_header = idendical_tile_offset | 0x80; + tile_header = identical_tile_offset | 0x80; tile_header <<= 24; } } @@ -5701,263 +3729,287 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, if (!is_last_col) { uint32_t col_size = total_size - col_offset - 4; - mem_put_le32(dst + col_offset, col_size); + mem_put_le32(dst + col_offset + tg_hdr_size, col_size); - // If it is not final packing, record the maximum tile column size we - // see, otherwise, check if the tile size is out of the range. - *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + // Record the maximum tile column size we see. + max_tile_col_size = AOMMAX(max_tile_col_size, col_size); } } - } else { -#endif // CONFIG_EXT_TILE - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileInfo tile_info; - const int is_last_row = (tile_row == tile_rows - 1); - av1_tile_set_row(&tile_info, cm, tile_row); + if (have_tiles) { + total_size = remux_tiles(cm, data, total_size - frame_header_size, + max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size += frame_header_size; + } + + // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write + // current tile group size before tile data(include tile column header). + // Tile group size doesn't include the bytes storing tg size. 
+ total_size += tg_hdr_size; + const uint32_t obu_payload_size = total_size - tg_hdr_size; + const size_t length_field_size = + obu_memmove(tg_hdr_size, obu_payload_size, dst); + if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) { + assert(0); + } + total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int is_last_col = (tile_col == tile_cols - 1); - const int is_last_tile = is_last_col && is_last_row; - int is_last_tile_in_tg = 0; - - if (new_tg) { - if (insert_frame_header_obu_flag && tile_idx) { - // insert a copy of frame header OBU (including 4-byte size), - // except before the first tile group - data = dst + total_size; - memmove(data, frame_header_obu_location, frame_header_obu_size); - total_size += frame_header_obu_size; - } - data = dst + total_size; - // A new tile group begins at this tile. Write the obu header and - // tile group header - curr_tg_data_size = write_obu_header(OBU_TILE_GROUP, 0, data + 4); - if (n_log2_tiles) - curr_tg_data_size += write_tile_group_header( - data + curr_tg_data_size + 4, tile_idx, - AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), - n_log2_tiles); - total_size += curr_tg_data_size + 4; - new_tg = 0; - tile_count = 0; - } - tile_count++; - av1_tile_set_col(&tile_info, cm, tile_col); + // Now fill in the gaps in the uncompressed header. 
+ if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); - if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { - is_last_tile_in_tg = 1; - new_tg = 1; - } else { - is_last_tile_in_tg = 0; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } + return total_size; + } + + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst + total_size; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + int is_last_tile_in_tg = 0; + + if (new_tg) { + data = dst + total_size; + + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = + (num_tg_hdrs == 1) ? 
OBU_FRAME : OBU_TILE_GROUP; + curr_tg_data_size = + write_obu_header(obu_type, obu_extension_header, data); + obu_header_size = curr_tg_data_size; + + if (num_tg_hdrs == 1) { + curr_tg_data_size += write_frame_header_obu( + cpi, saved_wb, data + curr_tg_data_size, 0); } + curr_tg_data_size += write_tile_group_header( + data + curr_tg_data_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), + n_log2_tiles, cm->num_tg > 1); + total_size += curr_tg_data_size; + tile_data_start += curr_tg_data_size; + new_tg = 0; + tile_count = 0; + } + tile_count++; + av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif - buf->data = dst + total_size; + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + is_last_tile_in_tg = 1; + new_tg = 1; + } else { + is_last_tile_in_tg = 0; + } - // The last tile of the tile group does not have a header. - if (!is_last_tile_in_tg) total_size += 4; + buf->data = dst + total_size; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif // CONFIG_ANS - aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); -#if !CONFIG_LV_MAP -#if !CONFIG_PVQ - assert(tok == tok_end); -#endif // !CONFIG_PVQ -#endif // !CONFIG_LV_MAP - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif - assert(tile_size > 0); + // The last tile of the tile group does not have a header. + if (!is_last_tile_in_tg) total_size += 4; - curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); - buf->size = tile_size; + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->disable_cdf_update; + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + + aom_start_encode(&mode_bc, dst + total_size); + write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4)); + buf->size = tile_size; + if (tile_size > max_tile_size) { + cm->largest_tile_id = tile_cols * tile_row + tile_col; + max_tile_size = tile_size; + } - if (!is_last_tile) { - *max_tile_size = AOMMAX(*max_tile_size, tile_size); + if (!is_last_tile_in_tg) { + // size of this tile + mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } else { + // write current tile group size + const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size; + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + assert(0); } - if (!is_last_tile_in_tg) { - // size of this tile - mem_put_le32(buf->data, tile_size); - } else { - // write current tile group size - mem_put_le32(data, curr_tg_data_size); + curr_tg_data_size += (int)length_field_size; + total_size += (uint32_t)length_field_size; + tile_data_start += length_field_size; + if (num_tg_hdrs == 1) { + // if this tg is combined with the frame header then update saved + // frame header base offset accroding to length field size + saved_wb->bit_buffer += length_field_size; } - total_size += tile_size; + if (!first_tg && cm->error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. 
+ memmove(data + fh_info->total_length, data, curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(data, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resiliant mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + cm->largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header, + &data[fh_info->obu_header_byte_offset]); + + data += fh_info->total_length; + + curr_tg_data_size += (int)(fh_info->total_length); + total_size += (uint32_t)(fh_info->total_length); + } + first_tg = 0; } + + total_size += tile_size; } -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE - return (uint32_t)total_size; -} -#endif + if (have_tiles) { + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. The encoder currently sets it to the largest tile + // (but is up to the encoder) + aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id, + cm->log2_tile_cols + cm->log2_tile_rows); + // If more than one tile group. tile_size_bytes takes the default value 4 + // and does not need to be set. For a single tile group it is set in the + // section below. + if (num_tg_hdrs == 1) { + int tile_size_bytes = 4, unused; + const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); + const uint32_t tile_data_size = total_size - tile_data_offset; + + total_size = + remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &unused); + total_size += tile_data_offset; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); -void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + + // Update the OBU length if remux_tiles() reduced the size. 
+ uint64_t payload_size; + size_t length_field_size; + int res = + aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size, + &payload_size, &length_field_size); + assert(res == 0); + (void)res; + + const uint64_t new_payload_size = + total_size - obu_header_size - length_field_size; + if (new_payload_size != payload_size) { + size_t new_length_field_size; + res = aom_uleb_encode(new_payload_size, length_field_size, + dst + obu_header_size, &new_length_field_size); + assert(res == 0); + if (new_length_field_size < length_field_size) { + const size_t src_offset = obu_header_size + length_field_size; + const size_t dst_offset = obu_header_size + new_length_field_size; + memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); + total_size -= (int)(length_field_size - new_length_field_size); + } + } + } + } + return total_size; +} + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { uint8_t *data = dst; uint32_t data_size; -#if CONFIG_EXT_TILE - AV1_COMMON *const cm = &cpi->common; - uint32_t compressed_hdr_size = 0; - uint32_t uncompressed_hdr_size; - struct aom_write_bit_buffer saved_wb; - struct aom_write_bit_buffer wb = { data, 0 }; - const int have_tiles = cm->tile_cols * cm->tile_rows > 1; - int tile_size_bytes; - int tile_col_size_bytes; -#endif // CONFIG_EXT_TILE - unsigned int max_tile_size; - unsigned int max_tile_col_size; -#if CONFIG_OBU -#if !CONFIG_EXT_TILE AV1_COMMON *const cm = &cpi->common; -#endif - uint32_t obu_size; - uint8_t *frame_header_location; - uint32_t frame_header_size; -#endif + uint32_t obu_header_size = 0; + uint32_t obu_payload_size = 0; + FrameHeaderInfo fh_info = { NULL, 0, 0 }; + const uint8_t obu_extension_header = + cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; #if CONFIG_BITSTREAM_DEBUG bitstream_queue_reset_write(); #endif -#if CONFIG_OBU - // write temporal delimiter obu, preceded by 4-byte size - obu_size = write_obu_header(OBU_TD, 0, data + 4); - obu_size += 
write_temporal_delimiter_obu(/*data + 4 + obu_size*/); - mem_put_le32(data, obu_size); - data += obu_size + 4; + // The TD is now written outside the frame encode loop // write sequence header obu if KEY_FRAME, preceded by 4-byte size if (cm->frame_type == KEY_FRAME) { - obu_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data + 4); - obu_size += write_sequence_header_obu(cpi, data + 4 + obu_size); - mem_put_le32(data, obu_size); - data += obu_size + 4; - } + obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data); - // write frame header obu, preceded by 4-byte size - frame_header_location = data + 4; - obu_size = write_obu_header(OBU_FRAME_HEADER, 0, frame_header_location); - frame_header_size = write_frame_header_obu(cpi, data + 4 + obu_size); - obu_size += frame_header_size; - mem_put_le32(data, obu_size); - data += obu_size + 4; + obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } - if (cm->show_existing_frame) { - data_size = 0; - } else { - // Each tile group obu will be preceded by 4-byte size of the tile group - // obu - data_size = - write_tiles_in_tg_obus(cpi, data, &max_tile_size, &max_tile_col_size, - frame_header_location - 4, obu_size + 4, - 1 /* cm->error_resilient_mode */); + data += obu_header_size + obu_payload_size + length_field_size; } -#endif - -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - // Write the uncompressed header - write_uncompressed_header_frame(cpi, &wb); - -#if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - *size = aom_wb_bytes_written(&wb); - return; - } -#endif // CONFIG_EXT_REFS - - // We do not know these in advance. Output placeholder bit. 
- saved_wb = wb; - // Write tile size magnitudes - if (have_tiles) { - // Note that the last item in the uncompressed header is the data - // describing tile configuration. - // Number of bytes in tile column size - 1 - aom_wb_write_literal(&wb, 0, 2); + const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame); + struct aom_write_bit_buffer saved_wb; + if (write_frame_header) { + // Write Frame Header OBU. + fh_info.frame_header = data; + obu_header_size = + write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data); + obu_payload_size = + write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); - // Number of bytes in tile size - 1 - aom_wb_write_literal(&wb, 0, 2); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; } - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb); - aom_clear_system_state(); - compressed_hdr_size = 0; - } else { - // Size of compressed header - aom_wb_write_literal(&wb, 0, 16); - uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb); - aom_clear_system_state(); - // Write the compressed header - compressed_hdr_size = - write_compressed_header(cpi, data + uncompressed_hdr_size); - } - data += uncompressed_hdr_size + compressed_hdr_size; + fh_info.obu_header_byte_offset = 0; + fh_info.total_length = + obu_header_size + obu_payload_size + length_field_size; + data += fh_info.total_length; - // Write the encoded tile data - data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); - } else { -#endif // CONFIG_EXT_TILE -#if !CONFIG_OBU - data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); -#endif -#if CONFIG_EXT_TILE + // Since length_field_size is determined adaptively after frame header + // encoding, saved_wb must be adjusted accordingly. 
+ saved_wb.bit_buffer += length_field_size; } -#endif // CONFIG_EXT_TILE -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - if (have_tiles) { - data_size = - remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); - } - - data += data_size; - - // Now fill in the gaps in the uncompressed header. - if (have_tiles) { - assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2); - assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2); - } - // TODO(jbb): Figure out what to do if compressed_hdr_size > 16 bits. - assert(compressed_hdr_size <= 0xffff); - aom_wb_write_literal(&saved_wb, compressed_hdr_size, 16); + if (cm->show_existing_frame) { + data_size = 0; } else { -#endif // CONFIG_EXT_TILE - data += data_size; -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE -#if CONFIG_ANS && ANS_REVERSE - // Avoid aliasing the superframe index - *data++ = 0; -#endif + // Each tile group obu will be preceded by 4-byte size of the tile group + // obu + data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb, + obu_extension_header, &fh_info); + } + data += data_size; *size = data - dst; + return AOM_CODEC_OK; } diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h index 76eb85116..2047b6833 100644 --- a/third_party/aom/av1/encoder/bitstream.h +++ b/third_party/aom/av1/encoder/bitstream.h @@ -20,34 +20,24 @@ extern "C" { struct aom_write_bit_buffer; -#if CONFIG_REFERENCE_BUFFER -void write_sequence_header(AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb); -#endif +void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb); + +uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst); -void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); +int write_uleb_obu_size(uint32_t obu_header_size, 
uint32_t obu_payload_size, + uint8_t *dest); -void av1_encode_token_init(void); +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { -#if CONFIG_EXT_REFS // Do not swap gf and arf indices for internal overlay frames return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf; -#else - return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref; -#endif // CONFIG_EXT_REFS } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, -#if CONFIG_SUPERTX - const int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif + int blk_row, int blk_col, int plane, TX_SIZE tx_size, aom_writer *w); #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 8b6627825..13fc11c31 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -14,9 +14,6 @@ #include "av1/common/entropymv.h" #include "av1/common/entropy.h" -#if CONFIG_PVQ -#include "av1/encoder/encint.h" -#endif #include "av1/common/mvref_common.h" #include "av1/encoder/hash.h" #if CONFIG_DIST_8X8 @@ -27,12 +24,6 @@ extern "C" { #endif -#if CONFIG_PVQ -// Maximum possible # of tx blocks in luma plane, which is currently 256, -// since there can be 16x16 of 4x4 tx. 
-#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0) -#endif - typedef struct { unsigned int sse; int sum; @@ -41,52 +32,38 @@ typedef struct { typedef struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]); -#if CONFIG_PVQ - DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]); -#endif tran_low_t *qcoeff; tran_low_t *coeff; uint16_t *eobs; -#if CONFIG_LV_MAP uint8_t *txb_entropy_ctx; -#endif struct buf_2d src; // Quantizer setings - const int16_t *quant_fp; - const int16_t *round_fp; - const int16_t *quant; - const int16_t *quant_shift; - const int16_t *zbin; - const int16_t *round; -#if CONFIG_NEW_QUANT - const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES]; -#endif // CONFIG_NEW_QUANT + // These are used/accessed only in the quantization process + // RDO does not / must not depend on any of these values + // All values below share the coefficient scale/shift used in TX + const int16_t *quant_fp_QTX; + const int16_t *round_fp_QTX; + const int16_t *quant_QTX; + const int16_t *quant_shift_QTX; + const int16_t *zbin_QTX; + const int16_t *round_QTX; + const int16_t *dequant_QTX; } MACROBLOCK_PLANE; -typedef int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] - [TAIL_TOKENS]; - -#if CONFIG_LV_MAP typedef struct { int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; - int nz_map_cost[SIG_COEF_CONTEXTS][2]; - int eob_cost[EOB_COEF_CONTEXTS][2]; + int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; + int base_cost[SIG_COEF_CONTEXTS][4]; + int eob_extra_cost[EOB_COEF_CONTEXTS][2]; int dc_sign_cost[DC_SIGN_CONTEXTS][2]; - int base_cost[NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS][2]; -#if BR_NODE int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1]; - int br_cost[BASE_RANGE_SETS][LEVEL_CONTEXTS][2]; -#else // BR_NODE - int lps_cost[LEVEL_CONTEXTS][2]; -#endif // BR_NODE -#if CONFIG_CTX1D - int eob_mode_cost[TX_CLASSES][2]; - int empty_line_cost[TX_CLASSES][EMPTY_LINE_CONTEXTS][2]; - int hv_eob_cost[TX_CLASSES][HV_EOB_CONTEXTS][2]; -#endif 
} LV_MAP_COEFF_COST; +typedef struct { + int eob_cost[2][11]; +} LV_MAP_EOB_COST; + typedef struct { tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]; uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; @@ -95,20 +72,17 @@ typedef struct { int dc_sign_ctx[MAX_MB_PLANE] [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; } CB_COEFF_BUFFER; -#endif typedef struct { - int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int16_t mode_context[MODE_CTX_REF_FRAMES]; -#if CONFIG_LV_MAP // TODO(angiebird): Reduce the buffer size according to sb_type tran_low_t *tcoeff[MAX_MB_PLANE]; uint16_t *eobs[MAX_MB_PLANE]; uint8_t *txb_skip_ctx[MAX_MB_PLANE]; int *dc_sign_ctx[MAX_MB_PLANE]; -#endif uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + int_mv global_mvs[REF_FRAMES]; int16_t compound_mode_context[MODE_CTX_REF_FRAMES]; } MB_MODE_INFO_EXT; @@ -120,39 +94,119 @@ typedef struct { } MvLimits; typedef struct { - uint8_t best_palette_color_map[MAX_SB_SQUARE]; - float kmeans_data_buf[2 * MAX_SB_SQUARE]; + uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; + int kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; } PALETTE_BUFFER; typedef struct { - TX_TYPE tx_type; TX_SIZE tx_size; -#if CONFIG_VAR_TX - TX_SIZE min_tx_size; - TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE]; - uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL - TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; -#endif // CONFIG_TXK_SEL + TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; RD_STATS rd_stats; uint32_t hash_value; -} TX_RD_INFO; +} MB_RD_INFO; #define RD_RECORD_BUFFER_LEN 8 typedef struct { - TX_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. + MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. 
int index_start; int num; - CRC_CALCULATOR crc_calculator; // Hash function. -} TX_RD_RECORD; + CRC32C crc_calculator; // Hash function. +} MB_RD_RECORD; + +typedef struct { + int64_t dist; + int64_t sse; + int rate; + uint16_t eob; + TX_TYPE tx_type; + uint16_t entropy_context; + uint8_t txb_entropy_ctx; + uint8_t valid; + uint8_t fast; // This is not being used now. +} TXB_RD_INFO; + +#define TX_SIZE_RD_RECORD_BUFFER_LEN 256 +typedef struct { + uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN]; + TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN]; + int index_start; + int num; +} TXB_RD_RECORD; + +typedef struct tx_size_rd_info_node { + TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES. + struct tx_size_rd_info_node *children[4]; +} TXB_RD_INFO_NODE; + +// Region size for mode decision sampling in the first pass of partition +// search(two_pass_partition_search speed feature), in units of mi size(4). +// Used by the mode_pruning_based_on_two_pass_partition_search speed feature. +#define FIRST_PARTITION_PASS_SAMPLE_REGION 8 +#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3 +#define FIRST_PARTITION_PASS_STATS_TABLES \ + (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \ + (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) +#define FIRST_PARTITION_PASS_STATS_STRIDE \ + (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) + +static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) { + const int row = + (mi_row & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2; + const int col = + (mi_col & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2; + return (row << FIRST_PARTITION_PASS_STATS_STRIDE) + col; +} + +typedef struct { + uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0]. + uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1]. + int sample_counts; // Number of samples collected. 
+} FIRST_PARTITION_PASS_STATS; + +#define MAX_INTERP_FILTER_STATS 64 +typedef struct { + InterpFilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; +} INTERPOLATION_FILTER_STATS; typedef struct macroblock MACROBLOCK; struct macroblock { struct macroblock_plane plane[MAX_MB_PLANE]; - // Save the transform RD search info. - TX_RD_RECORD tx_rd_record; + // Determine if one would go with reduced complexity transform block + // search model to select prediction modes, or full complexity model + // to select transform kernel. + int rd_model; + + // Indicate if the encoder is running in the first pass partition search. + // In that case, apply certain speed features therein to reduce the overhead + // cost in the first pass search. + int cb_partition_scan; + + FIRST_PARTITION_PASS_STATS + first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES]; + + // [comp_idx][saved stat_idx] + INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS]; + int interp_filter_stats_idx[2]; + + // Activate constrained coding block partition search range. + int use_cb_search_range; + + // Inter macroblock RD search info. + MB_RD_RECORD mb_rd_record; + + // Inter transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)]; + TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)]; + TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)]; + TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)]; + + // Intra transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_intra; MACROBLOCKD e_mbd; MB_MODE_INFO_EXT *mbmi_ext; @@ -173,34 +227,29 @@ struct macroblock { int *m_search_count_ptr; int *ex_search_count_ptr; -#if CONFIG_VAR_TX unsigned int txb_split_count; -#endif // These are set to their default values at the beginning, and then adjusted // further in the encoding process. 
BLOCK_SIZE min_partition_size; BLOCK_SIZE max_partition_size; - int mv_best_ref_index[TOTAL_REFS_PER_FRAME]; - unsigned int max_mv_context[TOTAL_REFS_PER_FRAME]; + unsigned int max_mv_context[REF_FRAMES]; unsigned int source_variance; - unsigned int pred_sse[TOTAL_REFS_PER_FRAME]; - int pred_mv_sad[TOTAL_REFS_PER_FRAME]; + unsigned int pred_sse[REF_FRAMES]; + int pred_mv_sad[REF_FRAMES]; int *nmvjointcost; - int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; - int *nmvcost[NMV_CONTEXTS][2]; - int *nmvcost_hp[NMV_CONTEXTS][2]; - int **mv_cost_stack[NMV_CONTEXTS]; + int nmv_vec_cost[MV_JOINTS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int **mv_cost_stack; int **mvcost; -#if CONFIG_MOTION_VAR int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; -#endif // CONFIG_MOTION_VAR PALETTE_BUFFER *palette_buffer; @@ -208,108 +257,80 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; -#if CONFIG_VAR_TX - uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE]; int skip; - -#if CONFIG_CB4X4 int skip_chroma_rd; -#endif + int skip_cost[SKIP_CONTEXTS][2]; + + int skip_mode; // 0: off; 1: on + int skip_mode_cost[SKIP_CONTEXTS][2]; + + int compound_idx; -#if CONFIG_LV_MAP LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; + LV_MAP_EOB_COST eob_costs[7][2]; uint16_t cb_offset; -#endif - - av1_coeff_cost token_head_costs[TX_SIZES]; - av1_coeff_cost token_tail_costs[TX_SIZES]; // mode costs + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; - int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2]; + int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; + int 
comp_inter_cost[COMP_INTER_CONTEXTS][2]; + int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; + int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(COMP_REFERENCE_TYPES)]; + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or + // GOLDEN_FRAME) in bidir-comp mode. + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or + // BWDREF_FRAME) in bidir-comp mode. + int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; - int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES]; -#if CONFIG_COMPOUND_SINGLEREF - int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS] - [INTER_SINGLEREF_COMP_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA + int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION int motion_mode_cost1[BLOCK_SIZES_ALL][2]; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - int motion_mode_cost2[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES]; -#endif -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT - int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES]; -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES]; + int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int 
filter_intra_cost[BLOCK_SIZES_ALL][2]; + int filter_intra_mode_cost[FILTER_INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; -#if CONFIG_EXT_PARTITION_TYPES - int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] - [EXT_PARTITION_TYPES]; -#else - int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] - [PARTITION_TYPES]; -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_MRC_TX - int mrc_mask_inter_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [PALETTE_COLORS]; - int mrc_mask_intra_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [PALETTE_COLORS]; -#endif // CONFIG_MRC_TX - int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; - int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; + int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; -#if CONFIG_CFL + int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; // The rate associated with each alpha codeword int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; -#endif // CONFIG_CFL int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; -#if CONFIG_EXT_TX -#if CONFIG_LGT_FROM_PRED - int intra_lgt_cost[LGT_SIZES][INTRA_MODES][2]; - int inter_lgt_cost[LGT_SIZES][2]; -#endif + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [TX_TYPES]; -#else - int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; - int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES]; -#endif // CONFIG_EXT_TX -#if 
CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_LOOP_RESTORATION + int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_INTRABC + int wiener_restore_cost[2]; + int sgrproj_restore_cost[2]; int intrabc_cost[2]; -#endif // CONFIG_INTRABC - - int optimize; // Used to store sub partition's choices. - MV pred_mv[TOTAL_REFS_PER_FRAME]; + MV pred_mv[REF_FRAMES]; // Store the best motion vector during motion search int_mv best_mv; @@ -320,38 +341,65 @@ struct macroblock { int use_default_intra_tx_type; // use default transform and skip transform type search for inter modes int use_default_inter_tx_type; -#if CONFIG_PVQ - int rate; - // 1 if neither AC nor DC is coded. Only used during RDO. - int pvq_skip[MAX_MB_PLANE]; - PVQ_QUEUE *pvq_q; - - // Storage for PVQ tx block encodings in a superblock. - // There can be max 16x16 of 4x4 blocks (and YUV) encode by PVQ - // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from: - // 1) Since PVQ is applied to each trasnform-ed block - // 2) 4x4 is the smallest tx size in AV1 - // 3) AV1 allows using smaller tx size than block (i.e. 
partition) size - // TODO(yushin) : The memory usage could be improved a lot, since this has - // storage for 10 bands and 128 coefficients for every 4x4 block, - PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE]; - daala_enc_ctx daala_enc; - int pvq_speed; - int pvq_coded; // Indicates whether pvq_info needs be stored to tokenize -#endif #if CONFIG_DIST_8X8 int using_dist_8x8; aom_tune_metric tune_metric; -#if CONFIG_CB4X4 -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]); -#else - DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]); -#endif -#endif // CONFIG_CB4X4 + DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]); #endif // CONFIG_DIST_8X8 + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + // Bit flags for pruning tx type search, tx split, etc. + int tx_search_prune[EXT_TX_SET_TYPES]; + int must_find_valid_partition; + int tx_split_prune_flag; // Flag to skip tx split RD search. }; +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->sb_type) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + 
return depth; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c index 113ceb29d..66dedd9ed 100644 --- a/third_party/aom/av1/encoder/blockiness.c +++ b/third_party/aom/av1/encoder/blockiness.c @@ -9,9 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "av1/common/common.h" #include "av1/common/filter.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c index 4bbf0e5fb..d6e556b93 100644 --- a/third_party/aom/av1/encoder/context_tree.c +++ b/third_party/aom/av1/encoder/context_tree.c @@ -13,32 +13,18 @@ #include "av1/encoder/encoder.h" static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { -#if CONFIG_CB4X4 - BLOCK_4X4, -#endif - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, -#if CONFIG_EXT_PARTITION - BLOCK_128X128, -#endif // CONFIG_EXT_PARTITION + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, }; static void alloc_mode_context(AV1_COMMON *cm, int num_pix, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif PICK_MODE_CONTEXT *ctx) { + const int num_planes = av1_num_planes(cm); int i; const int num_blk = num_pix / 16; ctx->num_4x4_blk = num_blk; -#if CONFIG_EXT_PARTITION_TYPES - ctx->partition = partition; -#endif - - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_VAR_TX - CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t))); -#endif + CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t))); + for (i = 0; i < num_planes; ++i) { CHECK_MEM_ERROR(cm, ctx->coeff[i], aom_memalign(32, num_pix * sizeof(*ctx->coeff[i]))); CHECK_MEM_ERROR(cm, ctx->qcoeff[i], @@ -47,148 +33,94 @@ static void 
alloc_mode_context(AV1_COMMON *cm, int num_pix, aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i]))); CHECK_MEM_ERROR(cm, ctx->eobs[i], aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); -#if CONFIG_LV_MAP CHECK_MEM_ERROR( cm, ctx->txb_entropy_ctx[i], aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); -#endif - -#if CONFIG_PVQ - CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i], - aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i]))); -#endif } - for (i = 0; i < 2; ++i) { - CHECK_MEM_ERROR( - cm, ctx->color_index_map[i], - aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + if (num_pix <= MAX_PALETTE_SQUARE) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } } -#if CONFIG_MRC_TX - CHECK_MEM_ERROR(cm, ctx->mrc_mask, - aom_memalign(32, num_pix * sizeof(*ctx->mrc_mask))); -#endif // CONFIG_MRC_TX } -static void free_mode_context(PICK_MODE_CONTEXT *ctx) { +static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) { int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_VAR_TX - aom_free(ctx->blk_skip[i]); - ctx->blk_skip[i] = 0; -#endif + aom_free(ctx->blk_skip); + ctx->blk_skip = 0; + for (i = 0; i < num_planes; ++i) { aom_free(ctx->coeff[i]); ctx->coeff[i] = 0; aom_free(ctx->qcoeff[i]); ctx->qcoeff[i] = 0; aom_free(ctx->dqcoeff[i]); ctx->dqcoeff[i] = 0; -#if CONFIG_PVQ - aom_free(ctx->pvq_ref_coeff[i]); - ctx->pvq_ref_coeff[i] = 0; -#endif aom_free(ctx->eobs[i]); ctx->eobs[i] = 0; -#if CONFIG_LV_MAP aom_free(ctx->txb_entropy_ctx[i]); ctx->txb_entropy_ctx[i] = 0; -#endif } for (i = 0; i < 2; ++i) { aom_free(ctx->color_index_map[i]); ctx->color_index_map[i] = 0; } -#if CONFIG_MRC_TX - aom_free(ctx->mrc_mask); - ctx->mrc_mask = 0; -#endif // CONFIG_MRC_TX } -static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix) { -#if CONFIG_EXT_PARTITION_TYPES - alloc_mode_context(cm, num_pix, PARTITION_NONE, &tree->none); 
- alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ, &tree->horizontal[0]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[0]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->horizontal[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[1]); - - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_A, &tree->horizontala[2]); - alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_B, &tree->horizontalb[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[1]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[2]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_A, &tree->verticala[2]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_B, &tree->verticalb[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[1]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[2]); - for (int i = 0; i < 4; ++i) { - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, - &tree->horizontal4[i]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, &tree->vertical4[i]); - } -#if CONFIG_SUPERTX - alloc_mode_context(cm, num_pix, PARTITION_HORZ, &tree->horizontal_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT, &tree->vertical_supertx); - alloc_mode_context(cm, num_pix, PARTITION_SPLIT, &tree->split_supertx); - alloc_mode_context(cm, num_pix, PARTITION_HORZ_A, &tree->horizontala_supertx); - alloc_mode_context(cm, num_pix, PARTITION_HORZ_B, &tree->horizontalb_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT_A, &tree->verticala_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT_B, 
&tree->verticalb_supertx); -#endif // CONFIG_SUPERTX -#else +static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix, + int is_leaf) { alloc_mode_context(cm, num_pix, &tree->none); + + if (is_leaf) return; + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]); alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]); -#if CONFIG_SUPERTX - alloc_mode_context(cm, num_pix, &tree->horizontal_supertx); - alloc_mode_context(cm, num_pix, &tree->vertical_supertx); - alloc_mode_context(cm, num_pix, &tree->split_supertx); -#endif - if (num_pix > 16) { - alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]); - alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]); - } else { - memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1])); - memset(&tree->vertical[1], 0, sizeof(tree->vertical[1])); + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]); + + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]); + alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]); + + alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]); + alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]); + alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]); + + alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]); + + for (int i = 0; i < 4; ++i) { + alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]); + alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]); } -#endif // CONFIG_EXT_PARTITION_TYPES } -static void free_tree_contexts(PC_TREE *tree) { -#if CONFIG_EXT_PARTITION_TYPES +static void 
free_tree_contexts(PC_TREE *tree, const int num_planes) { int i; for (i = 0; i < 3; i++) { - free_mode_context(&tree->horizontala[i]); - free_mode_context(&tree->horizontalb[i]); - free_mode_context(&tree->verticala[i]); - free_mode_context(&tree->verticalb[i]); + free_mode_context(&tree->horizontala[i], num_planes); + free_mode_context(&tree->horizontalb[i], num_planes); + free_mode_context(&tree->verticala[i], num_planes); + free_mode_context(&tree->verticalb[i], num_planes); } for (i = 0; i < 4; ++i) { - free_mode_context(&tree->horizontal4[i]); - free_mode_context(&tree->vertical4[i]); + free_mode_context(&tree->horizontal4[i], num_planes); + free_mode_context(&tree->vertical4[i], num_planes); } -#endif // CONFIG_EXT_PARTITION_TYPES - free_mode_context(&tree->none); - free_mode_context(&tree->horizontal[0]); - free_mode_context(&tree->horizontal[1]); - free_mode_context(&tree->vertical[0]); - free_mode_context(&tree->vertical[1]); -#if CONFIG_SUPERTX - free_mode_context(&tree->horizontal_supertx); - free_mode_context(&tree->vertical_supertx); - free_mode_context(&tree->split_supertx); -#if CONFIG_EXT_PARTITION_TYPES - free_mode_context(&tree->horizontala_supertx); - free_mode_context(&tree->horizontalb_supertx); - free_mode_context(&tree->verticala_supertx); - free_mode_context(&tree->verticalb_supertx); -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_SUPERTX + free_mode_context(&tree->none, num_planes); + free_mode_context(&tree->horizontal[0], num_planes); + free_mode_context(&tree->horizontal[1], num_planes); + free_mode_context(&tree->vertical[0], num_planes); + free_mode_context(&tree->vertical[1], num_planes); } // This function sets up a tree of contexts such that at each square @@ -197,65 +129,25 @@ static void free_tree_contexts(PC_TREE *tree) { // represents the state of our search. 
void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { int i, j; -#if CONFIG_CB4X4 -#if CONFIG_EXT_PARTITION const int tree_nodes_inc = 1024; -#else - const int tree_nodes_inc = 256; -#endif // CONFIG_EXT_PARTITION const int leaf_factor = 4; -#else - const int tree_nodes_inc = 0; - const int leaf_factor = 1; -#endif -#if CONFIG_EXT_PARTITION const int leaf_nodes = 256 * leaf_factor; const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; -#else - const int leaf_nodes = 64 * leaf_factor; - const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; -#endif // CONFIG_EXT_PARTITION int pc_tree_index = 0; PC_TREE *this_pc; int square_index = 1; int nodes; -#if !CONFIG_CB4X4 - aom_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, - aom_calloc(leaf_nodes, sizeof(*td->leaf_tree))); - PICK_MODE_CONTEXT *this_leaf = &td->leaf_tree[0]; -#endif aom_free(td->pc_tree); CHECK_MEM_ERROR(cm, td->pc_tree, aom_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; -#if !CONFIG_CB4X4 - // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same - // context so we only need to allocate 1 for each 8x8 block. - for (i = 0; i < leaf_nodes; ++i) { -#if CONFIG_EXT_PARTITION_TYPES - alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]); -#else - alloc_mode_context(cm, 16, &td->leaf_tree[i]); -#endif - } -#endif - // Sets up all the leaf nodes in the tree. 
for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; tree->block_size = square[0]; -#if CONFIG_CB4X4 - alloc_tree_contexts(cm, tree, 16); -#else - alloc_tree_contexts(cm, tree, 4); -#endif -#if !CONFIG_CB4X4 - tree->leaf_split[0] = this_leaf++; - for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; -#endif + alloc_tree_contexts(cm, tree, 16, 1); } // Each node has 4 leaf nodes, fill each block_size level of the tree @@ -263,11 +155,7 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { for (i = 0; i < nodes; ++i) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; -#if CONFIG_CB4X4 - alloc_tree_contexts(cm, tree, 16 << (2 * square_index)); -#else - alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); -#endif + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0); tree->block_size = square[square_index]; for (j = 0; j < 4; j++) tree->split[j] = this_pc++; ++pc_tree_index; @@ -286,35 +174,41 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { } } -void av1_free_pc_tree(ThreadData *td) { -#if CONFIG_CB4X4 -#if CONFIG_EXT_PARTITION +void av1_free_pc_tree(ThreadData *td, const int num_planes) { const int tree_nodes_inc = 1024; -#else - const int tree_nodes_inc = 256; -#endif // CONFIG_EXT_PARTITION -#else - const int tree_nodes_inc = 0; -#endif -#if CONFIG_EXT_PARTITION const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; -#else - const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; -#endif // CONFIG_EXT_PARTITION int i; - for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + for (i = 0; i < tree_nodes; ++i) + free_tree_contexts(&td->pc_tree[i], num_planes); aom_free(td->pc_tree); td->pc_tree = NULL; -#if !CONFIG_CB4X4 - const int leaf_factor = 1; -#if CONFIG_EXT_PARTITION - const int leaf_nodes = 256 * leaf_factor; -#else - const int leaf_nodes = 64 * leaf_factor; -#endif // 
CONFIG_EXT_PARTITION - for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]); - aom_free(td->leaf_tree); - td->leaf_tree = NULL; -#endif +} + +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx) { + dst_ctx->mic = src_ctx->mic; + dst_ctx->mbmi_ext = src_ctx->mbmi_ext; + + dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; + dst_ctx->skip = src_ctx->skip; + dst_ctx->skippable = src_ctx->skippable; + dst_ctx->best_mode_index = src_ctx->best_mode_index; + + memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, + sizeof(uint8_t) * src_ctx->num_4x4_blk); + + dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff; + dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff; + dst_ctx->single_pred_diff = src_ctx->single_pred_diff; + + dst_ctx->rate = src_ctx->rate; + dst_ctx->dist = src_ctx->dist; + dst_ctx->rdcost = src_ctx->rdcost; + dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; + + memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES); + dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter; + + dst_ctx->partition = src_ctx->partition; } diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index 38052ba27..c05f48a7a 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -23,28 +23,29 @@ struct AV1_COMP; struct AV1Common; struct ThreadData; +typedef enum { + // Search all the partition types in this plane. + SEARCH_FULL_PLANE = 0, + // Only search none_partition coding block. + NONE_PARTITION_PLANE = 1, + // Search all the partition types in this plane except split. + SEARCH_SAME_PLANE = 2, + // Skip search partition on this plane. Go split directly. 
+ SPLIT_PLANE = 3, +} CB_TREE_SEARCH; + // Structure to hold snapshot of coding context during the mode picking process typedef struct { - MODE_INFO mic; + MB_MODE_INFO mic; MB_MODE_INFO_EXT mbmi_ext; uint8_t *color_index_map[2]; -#if CONFIG_MRC_TX - uint8_t *mrc_mask; -#endif // CONFIG_MRC_TX -#if CONFIG_VAR_TX - uint8_t *blk_skip[MAX_MB_PLANE]; -#endif + uint8_t *blk_skip; tran_low_t *coeff[MAX_MB_PLANE]; tran_low_t *qcoeff[MAX_MB_PLANE]; tran_low_t *dqcoeff[MAX_MB_PLANE]; -#if CONFIG_PVQ - tran_low_t *pvq_ref_coeff[MAX_MB_PLANE]; -#endif uint16_t *eobs[MAX_MB_PLANE]; -#if CONFIG_LV_MAP uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; -#endif int num_4x4_blk; int skip; @@ -60,16 +61,27 @@ typedef struct { // scope of refactoring. int rate; int64_t dist; + int64_t rdcost; + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has + // been made. // motion vector cache for adaptive motion search control in partition // search loop - MV pred_mv[TOTAL_REFS_PER_FRAME]; + MV pred_mv[REF_FRAMES]; InterpFilter pred_interp_filter; -#if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition; -#endif } PICK_MODE_CONTEXT; +typedef struct { + int valid; + int split; + int skip; + int64_t rdcost; + int sub_block_split[4]; + int sub_block_skip[4]; + int64_t sub_block_rdcost[4]; +} PC_TREE_STATS; + typedef struct PC_TREE { int index; PARTITION_TYPE partitioning; @@ -77,34 +89,21 @@ typedef struct PC_TREE { PICK_MODE_CONTEXT none; PICK_MODE_CONTEXT horizontal[2]; PICK_MODE_CONTEXT vertical[2]; -#if CONFIG_EXT_PARTITION_TYPES PICK_MODE_CONTEXT horizontala[3]; PICK_MODE_CONTEXT horizontalb[3]; PICK_MODE_CONTEXT verticala[3]; PICK_MODE_CONTEXT verticalb[3]; PICK_MODE_CONTEXT horizontal4[4]; PICK_MODE_CONTEXT vertical4[4]; -#endif - // TODO(jingning): remove leaf_split[] when cb4x4 experiment flag is removed. 
- union { - struct PC_TREE *split[4]; - PICK_MODE_CONTEXT *leaf_split[4]; - }; -#if CONFIG_SUPERTX - PICK_MODE_CONTEXT horizontal_supertx; - PICK_MODE_CONTEXT vertical_supertx; - PICK_MODE_CONTEXT split_supertx; -#if CONFIG_EXT_PARTITION_TYPES - PICK_MODE_CONTEXT horizontala_supertx; - PICK_MODE_CONTEXT horizontalb_supertx; - PICK_MODE_CONTEXT verticala_supertx; - PICK_MODE_CONTEXT verticalb_supertx; -#endif -#endif + CB_TREE_SEARCH cb_search_range; + struct PC_TREE *split[4]; + PC_TREE_STATS pc_tree_stats; } PC_TREE; void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td); -void av1_free_pc_tree(struct ThreadData *td); +void av1_free_pc_tree(struct ThreadData *td, const int num_planes); +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c index 3827b65fa..29e934deb 100644 --- a/third_party/aom/av1/encoder/corner_match.c +++ b/third_party/aom/av1/encoder/corner_match.c @@ -13,7 +13,8 @@ #include #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/encoder/corner_match.h" #define SEARCH_SZ 9 diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c index e33df53e4..323e2aed5 100644 --- a/third_party/aom/av1/encoder/cost.c +++ b/third_party/aom/av1/encoder/cost.c @@ -13,65 +13,26 @@ #include "av1/encoder/cost.h" #include "av1/common/entropy.h" -/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)) - Begins with a bogus entry for simpler addressing. 
*/ -const uint16_t av1_prob_cost[256] = { - 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260, - 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718, - 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409, - 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192, - 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024, - 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887, - 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772, - 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673, - 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585, - 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506, - 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, - 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, - 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, - 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256, - 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205, - 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157, - 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112, - 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, - 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, - 26, 23, 20, 18, 15, 12, 9, 6, 3 +// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255. 
+const uint16_t av1_prob_cost[128] = { + 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, + 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361, + 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294, + 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232, + 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175, + 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, + 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, + 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26, + 23, 20, 18, 15, 12, 9, 6, 3, }; -static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i, - int c) { - const aom_prob prob = probs[i / 2]; - int b; - - assert(prob != 0); - for (b = 0; b <= 1; ++b) { - const int cc = c + av1_cost_bit(prob, b); - const aom_tree_index ii = tree[i + b]; - - if (ii <= 0) - costs[-ii] = cc; - else - cost(costs, tree, probs, ii, cc); - } -} - -void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) { - cost(costs, tree, probs, 0, 0); -} - -void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) { - assert(tree[0] <= 0 && tree[1] > 0); - - costs[-tree[0]] = av1_cost_bit(probs[0], 0); - cost(costs, tree, probs, 2, 0); -} - void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map) { int i; aom_cdf_prob prev_cdf = 0; for (i = 0;; ++i) { - const aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + p15 = (p15 < EC_MIN_PROB) ? 
EC_MIN_PROB : p15; prev_cdf = AOM_ICDF(cdf[i]); if (inv_map) diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h index e60632005..5de7765c5 100644 --- a/third_party/aom/av1/encoder/cost.h +++ b/third_party/aom/av1/encoder/cost.h @@ -19,17 +19,11 @@ extern "C" { #endif -extern const uint16_t av1_prob_cost[256]; +extern const uint16_t av1_prob_cost[128]; // The factor to scale from cost in bits to cost in av1_prob_cost units. #define AV1_PROB_COST_SHIFT 9 -#define av1_cost_zero(prob) (av1_prob_cost[prob]) - -#define av1_cost_one(prob) av1_cost_zero(256 - (prob)) - -#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob)) - // Cost of coding an n bit literal, using 128 (i.e. 50%) probability // for each bit. #define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) @@ -38,31 +32,11 @@ extern const uint16_t av1_prob_cost[256]; static INLINE int av1_cost_symbol(aom_cdf_prob p15) { assert(0 < p15 && p15 < CDF_PROB_TOP); const int shift = CDF_PROB_BITS - 1 - get_msb(p15); - return av1_cost_zero(get_prob(p15 << shift, CDF_PROB_TOP)) + - av1_cost_literal(shift); -} - -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - aom_prob p) { - return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p); -} - -static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits, - int len) { - int cost = 0; - aom_tree_index i = 0; - - do { - const int bit = (bits >> --len) & 1; - cost += av1_cost_bit(probs[i >> 1], bit); - i = tree[i + bit]; - } while (len); - - return cost; + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); } -void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree); -void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree); void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map); diff --git 
a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c deleted file mode 100644 index c60e2d3d7..000000000 --- a/third_party/aom/av1/encoder/daala_compat_enc.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "encint.h" - -void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) { -#if !CONFIG_ANS - od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(&rbuf->adapt, enc->state.adapt, 1); -} - -void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) { -#if !CONFIG_ANS - od_ec_enc_rollback(&enc->w.ec, &rbuf->ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(enc->state.adapt, &rbuf->adapt, 1); -} diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c deleted file mode 100644 index a04d46b72..000000000 --- a/third_party/aom/av1/encoder/dct.c +++ /dev/null @@ -1,2797 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/fwd_txfm.h" -#include "aom_ports/mem.h" -#include "av1/common/blockd.h" -#include "av1/common/av1_fwd_txfm1d.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" -#include "av1/common/idct.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 -#include "av1/common/daala_tx.h" -#endif - -static INLINE void range_check(const tran_low_t *input, const int size, - const int bit) { -#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING -// TODO(angiebird): the range_check is not used because the bit range -// in fdct# is not correct. Since we are going to merge in a new version -// of fdct# from nextgenv2, we won't fix the incorrect bit range now. - int i; - for (i = 0; i < size; ++i) { - assert(abs(input[i]) < (1 << bit)); - } -#else - (void)input; - (void)size; - (void)bit; -#endif -} - -static void fdct4(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[4]; - - // stage 0 - range_check(input, 4, 14); - - // stage 1 - output[0] = input[0] + input[3]; - output[1] = input[1] + input[2]; - output[2] = input[1] - input[2]; - output[3] = input[0] - input[3]; - - range_check(output, 4, 15); - - // stage 2 - temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; - step[0] = (tran_low_t)fdct_round_shift(temp); - temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; - step[1] = (tran_low_t)fdct_round_shift(temp); - temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; - step[2] = (tran_low_t)fdct_round_shift(temp); - temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; - step[3] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 4, 16); - - // stage 3 - output[0] = step[0]; - 
output[1] = step[2]; - output[2] = step[1]; - output[3] = step[3]; - - range_check(output, 4, 16); -} - -static void fdct8(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[8]; - - // stage 0 - range_check(input, 8, 13); - - // stage 1 - output[0] = input[0] + input[7]; - output[1] = input[1] + input[6]; - output[2] = input[2] + input[5]; - output[3] = input[3] + input[4]; - output[4] = input[3] - input[4]; - output[5] = input[2] - input[5]; - output[6] = input[1] - input[6]; - output[7] = input[0] - input[7]; - - range_check(output, 8, 14); - - // stage 2 - step[0] = output[0] + output[3]; - step[1] = output[1] + output[2]; - step[2] = output[1] - output[2]; - step[3] = output[0] - output[3]; - step[4] = output[4]; - temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_16_64 + output[5] * cospi_16_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - step[7] = output[7]; - - range_check(step, 8, 15); - - // stage 3 - temp = step[0] * cospi_16_64 + step[1] * cospi_16_64; - output[0] = (tran_low_t)fdct_round_shift(temp); - temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64; - output[1] = (tran_low_t)fdct_round_shift(temp); - temp = step[2] * cospi_24_64 + step[3] * cospi_8_64; - output[2] = (tran_low_t)fdct_round_shift(temp); - temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64; - output[3] = (tran_low_t)fdct_round_shift(temp); - output[4] = step[4] + step[5]; - output[5] = step[4] - step[5]; - output[6] = step[7] - step[6]; - output[7] = step[7] + step[6]; - - range_check(output, 8, 16); - - // stage 4 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - temp = output[4] * cospi_28_64 + output[7] * cospi_4_64; - step[4] = (tran_low_t)fdct_round_shift(temp); - temp = output[5] * cospi_12_64 + output[6] * cospi_20_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_12_64 + 
output[5] * -cospi_20_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64; - step[7] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 8, 16); - - // stage 5 - output[0] = step[0]; - output[1] = step[4]; - output[2] = step[2]; - output[3] = step[6]; - output[4] = step[1]; - output[5] = step[5]; - output[6] = step[3]; - output[7] = step[7]; - - range_check(output, 8, 16); -} - -static void fdct16(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[16]; - - // stage 0 - range_check(input, 16, 13); - - // stage 1 - output[0] = input[0] + input[15]; - output[1] = input[1] + input[14]; - output[2] = input[2] + input[13]; - output[3] = input[3] + input[12]; - output[4] = input[4] + input[11]; - output[5] = input[5] + input[10]; - output[6] = input[6] + input[9]; - output[7] = input[7] + input[8]; - output[8] = input[7] - input[8]; - output[9] = input[6] - input[9]; - output[10] = input[5] - input[10]; - output[11] = input[4] - input[11]; - output[12] = input[3] - input[12]; - output[13] = input[2] - input[13]; - output[14] = input[1] - input[14]; - output[15] = input[0] - input[15]; - - range_check(output, 16, 14); - - // stage 2 - step[0] = output[0] + output[7]; - step[1] = output[1] + output[6]; - step[2] = output[2] + output[5]; - step[3] = output[3] + output[4]; - step[4] = output[3] - output[4]; - step[5] = output[2] - output[5]; - step[6] = output[1] - output[6]; - step[7] = output[0] - output[7]; - step[8] = output[8]; - step[9] = output[9]; - temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64; - step[11] = (tran_low_t)fdct_round_shift(temp); - temp = output[12] * cospi_16_64 + output[11] * cospi_16_64; - step[12] = (tran_low_t)fdct_round_shift(temp); - temp = output[13] * cospi_16_64 + output[10] * cospi_16_64; - step[13] = 
(tran_low_t)fdct_round_shift(temp); - step[14] = output[14]; - step[15] = output[15]; - - range_check(step, 16, 15); - - // stage 3 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = step[1] - step[2]; - output[3] = step[0] - step[3]; - output[4] = step[4]; - temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64; - output[5] = (tran_low_t)fdct_round_shift(temp); - temp = step[6] * cospi_16_64 + step[5] * cospi_16_64; - output[6] = (tran_low_t)fdct_round_shift(temp); - output[7] = step[7]; - output[8] = step[8] + step[11]; - output[9] = step[9] + step[10]; - output[10] = step[9] - step[10]; - output[11] = step[8] - step[11]; - output[12] = step[15] - step[12]; - output[13] = step[14] - step[13]; - output[14] = step[14] + step[13]; - output[15] = step[15] + step[12]; - - range_check(output, 16, 16); - - // stage 4 - temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; - step[0] = (tran_low_t)fdct_round_shift(temp); - temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; - step[1] = (tran_low_t)fdct_round_shift(temp); - temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; - step[2] = (tran_low_t)fdct_round_shift(temp); - temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; - step[3] = (tran_low_t)fdct_round_shift(temp); - step[4] = output[4] + output[5]; - step[5] = output[4] - output[5]; - step[6] = output[7] - output[6]; - step[7] = output[7] + output[6]; - step[8] = output[8]; - temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64; - step[9] = (tran_low_t)fdct_round_shift(temp); - temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - step[11] = output[11]; - step[12] = output[12]; - temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64; - step[13] = (tran_low_t)fdct_round_shift(temp); - temp = output[14] * cospi_8_64 + output[9] * cospi_24_64; - step[14] = (tran_low_t)fdct_round_shift(temp); - step[15] = output[15]; - - range_check(step, 16, 16); - - 
// stage 5 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - temp = step[4] * cospi_28_64 + step[7] * cospi_4_64; - output[4] = (tran_low_t)fdct_round_shift(temp); - temp = step[5] * cospi_12_64 + step[6] * cospi_20_64; - output[5] = (tran_low_t)fdct_round_shift(temp); - temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64; - output[6] = (tran_low_t)fdct_round_shift(temp); - temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64; - output[7] = (tran_low_t)fdct_round_shift(temp); - output[8] = step[8] + step[9]; - output[9] = step[8] - step[9]; - output[10] = step[11] - step[10]; - output[11] = step[11] + step[10]; - output[12] = step[12] + step[13]; - output[13] = step[12] - step[13]; - output[14] = step[15] - step[14]; - output[15] = step[15] + step[14]; - - range_check(output, 16, 16); - - // stage 6 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - temp = output[8] * cospi_30_64 + output[15] * cospi_2_64; - step[8] = (tran_low_t)fdct_round_shift(temp); - temp = output[9] * cospi_14_64 + output[14] * cospi_18_64; - step[9] = (tran_low_t)fdct_round_shift(temp); - temp = output[10] * cospi_22_64 + output[13] * cospi_10_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - temp = output[11] * cospi_6_64 + output[12] * cospi_26_64; - step[11] = (tran_low_t)fdct_round_shift(temp); - temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64; - step[12] = (tran_low_t)fdct_round_shift(temp); - temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64; - step[13] = (tran_low_t)fdct_round_shift(temp); - temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64; - step[14] = (tran_low_t)fdct_round_shift(temp); - temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64; - step[15] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 16, 16); - - // stage 7 - output[0] = step[0]; - 
output[1] = step[8]; - output[2] = step[4]; - output[3] = step[12]; - output[4] = step[2]; - output[5] = step[10]; - output[6] = step[6]; - output[7] = step[14]; - output[8] = step[1]; - output[9] = step[9]; - output[10] = step[5]; - output[11] = step[13]; - output[12] = step[3]; - output[13] = step[11]; - output[14] = step[7]; - output[15] = step[15]; - - range_check(output, 16, 16); -} - -static void fdct32(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[32]; - - // stage 0 - range_check(input, 32, 14); - - // stage 1 - output[0] = input[0] + input[31]; - output[1] = input[1] + input[30]; - output[2] = input[2] + input[29]; - output[3] = input[3] + input[28]; - output[4] = input[4] + input[27]; - output[5] = input[5] + input[26]; - output[6] = input[6] + input[25]; - output[7] = input[7] + input[24]; - output[8] = input[8] + input[23]; - output[9] = input[9] + input[22]; - output[10] = input[10] + input[21]; - output[11] = input[11] + input[20]; - output[12] = input[12] + input[19]; - output[13] = input[13] + input[18]; - output[14] = input[14] + input[17]; - output[15] = input[15] + input[16]; - output[16] = input[15] - input[16]; - output[17] = input[14] - input[17]; - output[18] = input[13] - input[18]; - output[19] = input[12] - input[19]; - output[20] = input[11] - input[20]; - output[21] = input[10] - input[21]; - output[22] = input[9] - input[22]; - output[23] = input[8] - input[23]; - output[24] = input[7] - input[24]; - output[25] = input[6] - input[25]; - output[26] = input[5] - input[26]; - output[27] = input[4] - input[27]; - output[28] = input[3] - input[28]; - output[29] = input[2] - input[29]; - output[30] = input[1] - input[30]; - output[31] = input[0] - input[31]; - - range_check(output, 32, 15); - - // stage 2 - step[0] = output[0] + output[15]; - step[1] = output[1] + output[14]; - step[2] = output[2] + output[13]; - step[3] = output[3] + output[12]; - step[4] = output[4] + output[11]; - step[5] = output[5] 
+ output[10]; - step[6] = output[6] + output[9]; - step[7] = output[7] + output[8]; - step[8] = output[7] - output[8]; - step[9] = output[6] - output[9]; - step[10] = output[5] - output[10]; - step[11] = output[4] - output[11]; - step[12] = output[3] - output[12]; - step[13] = output[2] - output[13]; - step[14] = output[1] - output[14]; - step[15] = output[0] - output[15]; - step[16] = output[16]; - step[17] = output[17]; - step[18] = output[18]; - step[19] = output[19]; - temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64; - step[20] = (tran_low_t)fdct_round_shift(temp); - temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64; - step[21] = (tran_low_t)fdct_round_shift(temp); - temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64; - step[22] = (tran_low_t)fdct_round_shift(temp); - temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64; - step[23] = (tran_low_t)fdct_round_shift(temp); - temp = output[24] * cospi_16_64 + output[23] * cospi_16_64; - step[24] = (tran_low_t)fdct_round_shift(temp); - temp = output[25] * cospi_16_64 + output[22] * cospi_16_64; - step[25] = (tran_low_t)fdct_round_shift(temp); - temp = output[26] * cospi_16_64 + output[21] * cospi_16_64; - step[26] = (tran_low_t)fdct_round_shift(temp); - temp = output[27] * cospi_16_64 + output[20] * cospi_16_64; - step[27] = (tran_low_t)fdct_round_shift(temp); - step[28] = output[28]; - step[29] = output[29]; - step[30] = output[30]; - step[31] = output[31]; - - range_check(step, 32, 16); - - // stage 3 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - output[8] = step[8]; - output[9] = step[9]; - temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64; - output[10] = (tran_low_t)fdct_round_shift(temp); - temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64; - 
output[11] = (tran_low_t)fdct_round_shift(temp); - temp = step[12] * cospi_16_64 + step[11] * cospi_16_64; - output[12] = (tran_low_t)fdct_round_shift(temp); - temp = step[13] * cospi_16_64 + step[10] * cospi_16_64; - output[13] = (tran_low_t)fdct_round_shift(temp); - output[14] = step[14]; - output[15] = step[15]; - output[16] = step[16] + step[23]; - output[17] = step[17] + step[22]; - output[18] = step[18] + step[21]; - output[19] = step[19] + step[20]; - output[20] = step[19] - step[20]; - output[21] = step[18] - step[21]; - output[22] = step[17] - step[22]; - output[23] = step[16] - step[23]; - output[24] = step[31] - step[24]; - output[25] = step[30] - step[25]; - output[26] = step[29] - step[26]; - output[27] = step[28] - step[27]; - output[28] = step[28] + step[27]; - output[29] = step[29] + step[26]; - output[30] = step[30] + step[25]; - output[31] = step[31] + step[24]; - - range_check(output, 32, 17); - - // stage 4 - step[0] = output[0] + output[3]; - step[1] = output[1] + output[2]; - step[2] = output[1] - output[2]; - step[3] = output[0] - output[3]; - step[4] = output[4]; - temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_16_64 + output[5] * cospi_16_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - step[7] = output[7]; - step[8] = output[8] + output[11]; - step[9] = output[9] + output[10]; - step[10] = output[9] - output[10]; - step[11] = output[8] - output[11]; - step[12] = output[15] - output[12]; - step[13] = output[14] - output[13]; - step[14] = output[14] + output[13]; - step[15] = output[15] + output[12]; - step[16] = output[16]; - step[17] = output[17]; - temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64; - step[18] = (tran_low_t)fdct_round_shift(temp); - temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64; - step[19] = (tran_low_t)fdct_round_shift(temp); - temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64; - step[20] = 
(tran_low_t)fdct_round_shift(temp); - temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64; - step[21] = (tran_low_t)fdct_round_shift(temp); - step[22] = output[22]; - step[23] = output[23]; - step[24] = output[24]; - step[25] = output[25]; - temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64; - step[26] = (tran_low_t)fdct_round_shift(temp); - temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64; - step[27] = (tran_low_t)fdct_round_shift(temp); - temp = output[28] * cospi_8_64 + output[19] * cospi_24_64; - step[28] = (tran_low_t)fdct_round_shift(temp); - temp = output[29] * cospi_8_64 + output[18] * cospi_24_64; - step[29] = (tran_low_t)fdct_round_shift(temp); - step[30] = output[30]; - step[31] = output[31]; - - range_check(step, 32, 18); - - // stage 5 - temp = step[0] * cospi_16_64 + step[1] * cospi_16_64; - output[0] = (tran_low_t)fdct_round_shift(temp); - temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64; - output[1] = (tran_low_t)fdct_round_shift(temp); - temp = step[2] * cospi_24_64 + step[3] * cospi_8_64; - output[2] = (tran_low_t)fdct_round_shift(temp); - temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64; - output[3] = (tran_low_t)fdct_round_shift(temp); - output[4] = step[4] + step[5]; - output[5] = step[4] - step[5]; - output[6] = step[7] - step[6]; - output[7] = step[7] + step[6]; - output[8] = step[8]; - temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64; - output[9] = (tran_low_t)fdct_round_shift(temp); - temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64; - output[10] = (tran_low_t)fdct_round_shift(temp); - output[11] = step[11]; - output[12] = step[12]; - temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64; - output[13] = (tran_low_t)fdct_round_shift(temp); - temp = step[14] * cospi_8_64 + step[9] * cospi_24_64; - output[14] = (tran_low_t)fdct_round_shift(temp); - output[15] = step[15]; - output[16] = step[16] + step[19]; - output[17] = step[17] + step[18]; - output[18] = step[17] - step[18]; - output[19] = 
step[16] - step[19]; - output[20] = step[23] - step[20]; - output[21] = step[22] - step[21]; - output[22] = step[22] + step[21]; - output[23] = step[23] + step[20]; - output[24] = step[24] + step[27]; - output[25] = step[25] + step[26]; - output[26] = step[25] - step[26]; - output[27] = step[24] - step[27]; - output[28] = step[31] - step[28]; - output[29] = step[30] - step[29]; - output[30] = step[30] + step[29]; - output[31] = step[31] + step[28]; - - range_check(output, 32, 18); - - // stage 6 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - temp = output[4] * cospi_28_64 + output[7] * cospi_4_64; - step[4] = (tran_low_t)fdct_round_shift(temp); - temp = output[5] * cospi_12_64 + output[6] * cospi_20_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64; - step[7] = (tran_low_t)fdct_round_shift(temp); - step[8] = output[8] + output[9]; - step[9] = output[8] - output[9]; - step[10] = output[11] - output[10]; - step[11] = output[11] + output[10]; - step[12] = output[12] + output[13]; - step[13] = output[12] - output[13]; - step[14] = output[15] - output[14]; - step[15] = output[15] + output[14]; - step[16] = output[16]; - temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64; - step[17] = (tran_low_t)fdct_round_shift(temp); - temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64; - step[18] = (tran_low_t)fdct_round_shift(temp); - step[19] = output[19]; - step[20] = output[20]; - temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64; - step[21] = (tran_low_t)fdct_round_shift(temp); - temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64; - step[22] = (tran_low_t)fdct_round_shift(temp); - step[23] = output[23]; - step[24] = output[24]; - temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64; - step[25] = 
(tran_low_t)fdct_round_shift(temp); - temp = output[26] * cospi_20_64 + output[21] * cospi_12_64; - step[26] = (tran_low_t)fdct_round_shift(temp); - step[27] = output[27]; - step[28] = output[28]; - temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64; - step[29] = (tran_low_t)fdct_round_shift(temp); - temp = output[30] * cospi_4_64 + output[17] * cospi_28_64; - step[30] = (tran_low_t)fdct_round_shift(temp); - step[31] = output[31]; - - range_check(step, 32, 18); - - // stage 7 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - output[4] = step[4]; - output[5] = step[5]; - output[6] = step[6]; - output[7] = step[7]; - temp = step[8] * cospi_30_64 + step[15] * cospi_2_64; - output[8] = (tran_low_t)fdct_round_shift(temp); - temp = step[9] * cospi_14_64 + step[14] * cospi_18_64; - output[9] = (tran_low_t)fdct_round_shift(temp); - temp = step[10] * cospi_22_64 + step[13] * cospi_10_64; - output[10] = (tran_low_t)fdct_round_shift(temp); - temp = step[11] * cospi_6_64 + step[12] * cospi_26_64; - output[11] = (tran_low_t)fdct_round_shift(temp); - temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64; - output[12] = (tran_low_t)fdct_round_shift(temp); - temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64; - output[13] = (tran_low_t)fdct_round_shift(temp); - temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64; - output[14] = (tran_low_t)fdct_round_shift(temp); - temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64; - output[15] = (tran_low_t)fdct_round_shift(temp); - output[16] = step[16] + step[17]; - output[17] = step[16] - step[17]; - output[18] = step[19] - step[18]; - output[19] = step[19] + step[18]; - output[20] = step[20] + step[21]; - output[21] = step[20] - step[21]; - output[22] = step[23] - step[22]; - output[23] = step[23] + step[22]; - output[24] = step[24] + step[25]; - output[25] = step[24] - step[25]; - output[26] = step[27] - step[26]; - output[27] = step[27] + step[26]; - output[28] = step[28] + 
step[29]; - output[29] = step[28] - step[29]; - output[30] = step[31] - step[30]; - output[31] = step[31] + step[30]; - - range_check(output, 32, 18); - - // stage 8 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - step[8] = output[8]; - step[9] = output[9]; - step[10] = output[10]; - step[11] = output[11]; - step[12] = output[12]; - step[13] = output[13]; - step[14] = output[14]; - step[15] = output[15]; - temp = output[16] * cospi_31_64 + output[31] * cospi_1_64; - step[16] = (tran_low_t)fdct_round_shift(temp); - temp = output[17] * cospi_15_64 + output[30] * cospi_17_64; - step[17] = (tran_low_t)fdct_round_shift(temp); - temp = output[18] * cospi_23_64 + output[29] * cospi_9_64; - step[18] = (tran_low_t)fdct_round_shift(temp); - temp = output[19] * cospi_7_64 + output[28] * cospi_25_64; - step[19] = (tran_low_t)fdct_round_shift(temp); - temp = output[20] * cospi_27_64 + output[27] * cospi_5_64; - step[20] = (tran_low_t)fdct_round_shift(temp); - temp = output[21] * cospi_11_64 + output[26] * cospi_21_64; - step[21] = (tran_low_t)fdct_round_shift(temp); - temp = output[22] * cospi_19_64 + output[25] * cospi_13_64; - step[22] = (tran_low_t)fdct_round_shift(temp); - temp = output[23] * cospi_3_64 + output[24] * cospi_29_64; - step[23] = (tran_low_t)fdct_round_shift(temp); - temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64; - step[24] = (tran_low_t)fdct_round_shift(temp); - temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64; - step[25] = (tran_low_t)fdct_round_shift(temp); - temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64; - step[26] = (tran_low_t)fdct_round_shift(temp); - temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64; - step[27] = (tran_low_t)fdct_round_shift(temp); - temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64; - step[28] = (tran_low_t)fdct_round_shift(temp); - temp = 
output[29] * cospi_23_64 + output[18] * -cospi_9_64; - step[29] = (tran_low_t)fdct_round_shift(temp); - temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64; - step[30] = (tran_low_t)fdct_round_shift(temp); - temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64; - step[31] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 32, 18); - - // stage 9 - output[0] = step[0]; - output[1] = step[16]; - output[2] = step[8]; - output[3] = step[24]; - output[4] = step[4]; - output[5] = step[20]; - output[6] = step[12]; - output[7] = step[28]; - output[8] = step[2]; - output[9] = step[18]; - output[10] = step[10]; - output[11] = step[26]; - output[12] = step[6]; - output[13] = step[22]; - output[14] = step[14]; - output[15] = step[30]; - output[16] = step[1]; - output[17] = step[17]; - output[18] = step[9]; - output[19] = step[25]; - output[20] = step[5]; - output[21] = step[21]; - output[22] = step[13]; - output[23] = step[29]; - output[24] = step[3]; - output[25] = step[19]; - output[26] = step[11]; - output[27] = step[27]; - output[28] = step[7]; - output[29] = step[23]; - output[30] = step[15]; - output[31] = step[31]; - - range_check(output, 32, 18); -} - -#ifndef AV1_DCT_GTEST -static void fadst4(const tran_low_t *input, tran_low_t *output) { - tran_high_t x0, x1, x2, x3; - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - x0 = input[0]; - x1 = input[1]; - x2 = input[2]; - x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_4_9 * x0; - s2 = sinpi_2_9 * x1; - s3 = sinpi_1_9 * x1; - s4 = sinpi_3_9 * x2; - s5 = sinpi_4_9 * x3; - s6 = sinpi_2_9 * x3; - s7 = x0 + x1 - x3; - - x0 = s0 + s2 + s5; - x1 = sinpi_3_9 * s7; - x2 = s1 - s3 + s6; - x3 = s4; - - s0 = x0 + x3; - s1 = x1; - s2 = x2 - x3; - s3 = x2 - x0 + x3; - - // 1-D transform scaling factor is sqrt(2). 
- output[0] = (tran_low_t)fdct_round_shift(s0); - output[1] = (tran_low_t)fdct_round_shift(s1); - output[2] = (tran_low_t)fdct_round_shift(s2); - output[3] = (tran_low_t)fdct_round_shift(s3); -} - -static void fadst8(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_high_t x0 = input[7]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; - tran_high_t x7 = input[6]; - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = fdct_round_shift(s0 - s4); - x5 = fdct_round_shift(s1 - s5); - x6 = fdct_round_shift(s2 - s6); - x7 = fdct_round_shift(s3 - s7); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = fdct_round_shift(s0 + s2); - x1 = fdct_round_shift(s1 + s3); - x2 = fdct_round_shift(s0 - s2); - x3 = fdct_round_shift(s1 - s3); - x4 = fdct_round_shift(s4 + s6); - x5 = fdct_round_shift(s5 + s7); - x6 = fdct_round_shift(s4 - s6); - x7 = fdct_round_shift(s5 - s7); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = fdct_round_shift(s2); - x3 = fdct_round_shift(s3); - x6 = fdct_round_shift(s6); - x7 = fdct_round_shift(s7); - - output[0] = (tran_low_t)x0; - output[1] = (tran_low_t)-x4; - output[2] = (tran_low_t)x6; - output[3] = (tran_low_t)-x2; 
- output[4] = (tran_low_t)x3; - output[5] = (tran_low_t)-x7; - output[6] = (tran_low_t)x5; - output[7] = (tran_low_t)-x1; -} - -static void fadst16(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = s0 + s8; - x1 = s1 + s9; - x2 = s2 + s10; - x3 = s3 + s11; - x4 = s4 + s12; - x5 = s5 + s13; - x6 = s6 + s14; - x7 = s7 + s15; - - x8 = fdct_round_shift(s0 - s8); - x9 = fdct_round_shift(s1 - s9); - x10 = fdct_round_shift(s2 - s10); - x11 = fdct_round_shift(s3 - s11); - x12 = fdct_round_shift(s4 - s12); - x13 = fdct_round_shift(s5 - s13); - x14 = fdct_round_shift(s6 - s14); - x15 = fdct_round_shift(s7 - s15); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = 
x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = fdct_round_shift(s0 - s4); - x5 = fdct_round_shift(s1 - s5); - x6 = fdct_round_shift(s2 - s6); - x7 = fdct_round_shift(s3 - s7); - - x8 = s8 + s12; - x9 = s9 + s13; - x10 = s10 + s14; - x11 = s11 + s15; - x12 = fdct_round_shift(s8 - s12); - x13 = fdct_round_shift(s9 - s13); - x14 = fdct_round_shift(s10 - s14); - x15 = fdct_round_shift(s11 - s15); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = fdct_round_shift(s0 + s2); - x1 = fdct_round_shift(s1 + s3); - x2 = fdct_round_shift(s0 - s2); - x3 = fdct_round_shift(s1 - s3); - - x4 = fdct_round_shift(s4 + s6); - x5 = fdct_round_shift(s5 + s7); - x6 = fdct_round_shift(s4 - s6); - x7 = fdct_round_shift(s5 - s7); - - x8 = fdct_round_shift(s8 + s10); - x9 = fdct_round_shift(s9 + s11); - x10 = fdct_round_shift(s8 - s10); - x11 = fdct_round_shift(s9 - s11); - - x12 = fdct_round_shift(s12 + s14); - x13 = fdct_round_shift(s13 + s15); - x14 = fdct_round_shift(s12 - s14); - x15 = fdct_round_shift(s13 - s15); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - 
s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = fdct_round_shift(s2); - x3 = fdct_round_shift(s3); - x6 = fdct_round_shift(s6); - x7 = fdct_round_shift(s7); - x10 = fdct_round_shift(s10); - x11 = fdct_round_shift(s11); - x14 = fdct_round_shift(s14); - x15 = fdct_round_shift(s15); - - output[0] = (tran_low_t)x0; - output[1] = (tran_low_t)-x8; - output[2] = (tran_low_t)x12; - output[3] = (tran_low_t)-x4; - output[4] = (tran_low_t)x6; - output[5] = (tran_low_t)x14; - output[6] = (tran_low_t)x10; - output[7] = (tran_low_t)x2; - output[8] = (tran_low_t)x3; - output[9] = (tran_low_t)x11; - output[10] = (tran_low_t)x15; - output[11] = (tran_low_t)x7; - output[12] = (tran_low_t)x5; - output[13] = (tran_low_t)-x13; - output[14] = (tran_low_t)x9; - output[15] = (tran_low_t)-x1; -} - -// For use in lieu of ADST -static void fhalfright32(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[16]; - for (i = 0; i < 16; ++i) { - output[16 + i] = input[i] * 4; - } - // Multiply input by sqrt(2) - for (i = 0; i < 16; ++i) { - inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2); - } - fdct16(inputhalf, output); - // Note overall scaling factor is 4 times orthogonal -} - -#if CONFIG_MRC_TX -static void get_masked_residual32(const int16_t **input, int *input_stride, - const uint8_t *pred, int pred_stride, - int16_t *masked_input, - TxfmParam *txfm_param) { - int n_masked_vals = 0; - uint8_t *mrc_mask; - uint8_t mask_tmp[32 * 32]; - if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) || - (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) { - mrc_mask = txfm_param->mask; - n_masked_vals = get_mrc_diff_mask(*input, *input_stride, mrc_mask, 32, 32, - 32, txfm_param->is_inter); - } else { - mrc_mask = mask_tmp; - n_masked_vals = get_mrc_pred_mask(pred, pred_stride, mrc_mask, 32, 32, 32, - txfm_param->is_inter); - } - - // Do not use MRC_DCT if mask 
is invalid. DCT_DCT will be used instead. - if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) { - *txfm_param->valid_mask = 0; - return; - } - int32_t sum = 0; - int16_t avg; - // Get the masked average of the prediction - for (int i = 0; i < 32; ++i) { - for (int j = 0; j < 32; ++j) { - sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j]; - } - } - avg = sum / n_masked_vals; - // Replace all of the unmasked pixels in the prediction with the average - // of the masked pixels - for (int i = 0; i < 32; ++i) { - for (int j = 0; j < 32; ++j) - masked_input[i * 32 + j] = - (mrc_mask[i * 32 + j]) ? (*input)[i * (*input_stride) + j] : avg; - } - *input = masked_input; - *input_stride = 32; - *txfm_param->valid_mask = 1; -} -#endif // CONFIG_MRC_TX - -#if CONFIG_LGT || CONFIG_LGT_FROM_PRED -static void flgt4(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) { - if (!lgtmtx) assert(0); -#if CONFIG_LGT_FROM_PRED - // For DCT/ADST, use butterfly implementations - if (lgtmtx[0] == DCT4) { - fdct4(input, output); - return; - } else if (lgtmtx[0] == ADST4) { - fadst4(input, output); - return; - } -#endif // CONFIG_LGT_FROM_PRED - - // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,4 - tran_high_t s[4] = { 0 }; - for (int i = 0; i < 4; ++i) - for (int j = 0; j < 4; ++j) s[j] += lgtmtx[j * 4 + i] * input[i]; - - for (int i = 0; i < 4; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]); -} - -static void flgt8(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) { - if (!lgtmtx) assert(0); -#if CONFIG_LGT_FROM_PRED - // For DCT/ADST, use butterfly implementations - if (lgtmtx[0] == DCT8) { - fdct8(input, output); - return; - } else if (lgtmtx[0] == ADST8) { - fadst8(input, output); - return; - } -#endif // CONFIG_LGT_FROM_PRED - - // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,8 - tran_high_t s[8] = { 0 }; - for (int i = 0; i < 8; ++i) - for (int j = 0; j < 8; ++j) s[j] += lgtmtx[j * 8 + i] * 
input[i]; - - for (int i = 0; i < 8; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]); -} -#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED - -#if CONFIG_LGT_FROM_PRED -static void flgt16up(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx) { - if (lgtmtx[0] == DCT16) { - fdct16(input, output); - return; - } else if (lgtmtx[0] == ADST16) { - fadst16(input, output); - return; - } else if (lgtmtx[0] == DCT32) { - fdct32(input, output); - return; - } else if (lgtmtx[0] == ADST32) { - fhalfright32(input, output); - return; - } else { - assert(0); - } -} - -typedef void (*FlgtFunc)(const tran_low_t *input, tran_low_t *output, - const tran_high_t *lgtmtx); - -static FlgtFunc flgt_func[4] = { flgt4, flgt8, flgt16up, flgt16up }; - -typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col, - const tran_high_t *lgtmtx[], int ntx); - -static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred, - get_lgt16up_from_pred, - get_lgt16up_from_pred }; - -// this inline function corresponds to the up scaling before the first -// transform in the av1_fht* functions -static INLINE tran_low_t fwd_upscale_wrt_txsize(const tran_high_t val, - const TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: return (tran_low_t)val << 4; - case TX_8X8: - case TX_4X16: - case TX_16X4: - case TX_8X32: - case TX_32X8: return (tran_low_t)val << 2; - case TX_4X8: - case TX_8X4: - case TX_8X16: - case TX_16X8: return (tran_low_t)fdct_round_shift(val * 4 * Sqrt2); - default: assert(0); break; - } - return 0; -} - -// This inline function corresponds to the bit shift after the second -// transform in the av1_fht* functions -static INLINE tran_low_t fwd_downscale_wrt_txsize(const tran_low_t val, - const TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: return (val + 1) >> 2; - case TX_4X8: - case TX_8X4: - case TX_8X8: - case TX_4X16: - case TX_16X4: return (val + (val < 0)) >> 1; - case TX_8X16: - case TX_16X8: return val; - case TX_8X32: - case 
TX_32X8: return ROUND_POWER_OF_TWO_SIGNED(val, 2); - default: assert(0); break; - } - return 0; -} - -void flgt2d_from_pred_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_SIZE tx_size = txfm_param->tx_size; - const int w = tx_size_wide[tx_size]; - const int h = tx_size_high[tx_size]; - const int wlog2 = tx_size_wide_log2[tx_size]; - const int hlog2 = tx_size_high_log2[tx_size]; - assert(w <= 8 || h <= 8); - - int i, j; - tran_low_t out[256]; // max size: 8x32 and 32x8 - tran_low_t temp_in[32], temp_out[32]; - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w); - get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h); - - // For forward transforms, to be consistent with av1_fht functions, we apply - // short transform first and long transform second. - if (w < h) { - // Row transforms - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) - temp_in[j] = fwd_upscale_wrt_txsize(input[i * stride + j], tx_size); - flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]); - // right shift of 2 bits here in fht8x16 and fht16x8 - for (j = 0; j < w; ++j) - out[j * h + i] = (tx_size == TX_16X8 || tx_size == TX_8X16) - ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2) - : temp_out[j]; - } - // Column transforms - for (i = 0; i < w; ++i) { - for (j = 0; j < h; ++j) temp_in[j] = out[j + i * h]; - flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]); - for (j = 0; j < h; ++j) - output[j * w + i] = fwd_downscale_wrt_txsize(temp_out[j], tx_size); - } - } else { - // Column transforms - for (i = 0; i < w; ++i) { - for (j = 0; j < h; ++j) - temp_in[j] = fwd_upscale_wrt_txsize(input[j * stride + i], tx_size); - flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]); - // fht8x16 and fht16x8 have right shift of 2 bits here - for (j = 0; j < h; ++j) - out[j * w + i] = (tx_size == TX_16X8 || tx_size == TX_8X16) - ? 
ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2) - : temp_out[j]; - } - // Row transforms - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) temp_in[j] = out[j + i * w]; - flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]); - for (j = 0; j < w; ++j) - output[j + i * w] = fwd_downscale_wrt_txsize(temp_out[j], tx_size); - } - } -} -#endif // CONFIG_LGT_FROM_PRED - -#if CONFIG_EXT_TX -// TODO(sarahparker) these functions will be removed once the highbitdepth -// codepath works properly for rectangular transforms. They have almost -// identical versions in av1_fwd_txfm1d.c, but those are currently only -// being used for square transforms. -static void fidtx4(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 4; ++i) { - output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2); - } -} - -static void fidtx8(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 8; ++i) { - output[i] = input[i] * 2; - } -} - -static void fidtx16(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 16; ++i) { - output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2); - } -} - -static void fidtx32(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 32; ++i) { - output[i] = input[i] * 4; - } -} - -static void copy_block(const int16_t *src, int src_stride, int l, int w, - int16_t *dest, int dest_stride) { - int i; - for (i = 0; i < l; ++i) { - memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t)); - } -} - -static void fliplr(int16_t *dest, int stride, int l, int w) { - int i, j; - for (i = 0; i < l; ++i) { - for (j = 0; j < w / 2; ++j) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[i * stride + w - 1 - j]; - dest[i * stride + w - 1 - j] = tmp; - } - } -} - -static void flipud(int16_t *dest, int stride, int l, int w) { - int i, j; - for (j = 0; j < w; ++j) { - for (i = 0; i < l / 2; ++i) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride 
+ j] = dest[(l - 1 - i) * stride + j]; - dest[(l - 1 - i) * stride + j] = tmp; - } - } -} - -static void fliplrud(int16_t *dest, int stride, int l, int w) { - int i, j; - for (i = 0; i < l / 2; ++i) { - for (j = 0; j < w; ++j) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j]; - dest[(l - 1 - i) * stride + w - 1 - j] = tmp; - } - } -} - -static void copy_fliplr(const int16_t *src, int src_stride, int l, int w, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, w, dest, dest_stride); - fliplr(dest, dest_stride, l, w); -} - -static void copy_flipud(const int16_t *src, int src_stride, int l, int w, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, w, dest, dest_stride); - flipud(dest, dest_stride, l, w); -} - -static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, w, dest, dest_stride); - fliplrud(dest, dest_stride, l, w); -} - -static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w, - int16_t *buff, TX_TYPE tx_type) { - switch (tx_type) { -#if CONFIG_MRC_TX - case MRC_DCT: -#endif // CONFIG_MRC_TX - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case IDTX: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: break; - case FLIPADST_DCT: - case FLIPADST_ADST: - case V_FLIPADST: - copy_flipud(*src, *src_stride, l, w, buff, w); - *src = buff; - *src_stride = w; - break; - case DCT_FLIPADST: - case ADST_FLIPADST: - case H_FLIPADST: - copy_fliplr(*src, *src_stride, l, w, buff, w); - *src = buff; - *src_stride = w; - break; - case FLIPADST_FLIPADST: - copy_fliplrud(*src, *src_stride, l, w, buff, w); - *src = buff; - *src_stride = w; - break; - default: assert(0); break; - } -} -#endif // CONFIG_EXT_TX - -void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = 
txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif -#if !CONFIG_DAALA_DCT4 - if (tx_type == DCT_DCT) { - aom_fdct4x4_c(input, output, stride); - return; - } -#endif - { - static const transform_2d FHT[] = { -#if CONFIG_DAALA_DCT4 - { daala_fdct4, daala_fdct4 }, // DCT_DCT - { daala_fdst4, daala_fdct4 }, // ADST_DCT - { daala_fdct4, daala_fdst4 }, // DCT_ADST - { daala_fdst4, daala_fdst4 }, // ADST_ADST -#if CONFIG_EXT_TX - { daala_fdst4, daala_fdct4 }, // FLIPADST_DCT - { daala_fdct4, daala_fdst4 }, // DCT_FLIPADST - { daala_fdst4, daala_fdst4 }, // FLIPADST_FLIPADST - { daala_fdst4, daala_fdst4 }, // ADST_FLIPADST - { daala_fdst4, daala_fdst4 }, // FLIPADST_ADST - { daala_idtx4, daala_idtx4 }, // IDTX - { daala_fdct4, daala_idtx4 }, // V_DCT - { daala_idtx4, daala_fdct4 }, // H_DCT - { daala_fdst4, daala_idtx4 }, // V_ADST - { daala_idtx4, daala_fdst4 }, // H_ADST - { daala_fdst4, daala_idtx4 }, // V_FLIPADST - { daala_idtx4, daala_fdst4 }, // H_FLIPADST -#endif -#else - { fdct4, fdct4 }, // DCT_DCT - { fadst4, fdct4 }, // ADST_DCT - { fdct4, fadst4 }, // DCT_ADST - { fadst4, fadst4 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst4, fdct4 }, // FLIPADST_DCT - { fdct4, fadst4 }, // DCT_FLIPADST - { fadst4, fadst4 }, // FLIPADST_FLIPADST - { fadst4, fadst4 }, // ADST_FLIPADST - { fadst4, fadst4 }, // FLIPADST_ADST - { fidtx4, fidtx4 }, // IDTX - { fdct4, fidtx4 }, // V_DCT - { fidtx4, fdct4 }, // H_DCT - { fadst4, fidtx4 }, // V_ADST - { fidtx4, fadst4 }, // H_ADST - { fadst4, fidtx4 }, // V_FLIPADST - { fidtx4, fadst4 }, // H_FLIPADST -#endif -#endif - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[4 * 4]; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - -#if CONFIG_EXT_TX - int16_t flipped_input[4 * 4]; - maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - // Choose LGT adaptive to the 
prediction. We may apply different LGTs for - // different rows/columns, indicated by the pointers to 2D arrays - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // Columns - for (i = 0; i < 4; ++i) { - /* A C99-safe upshift by 4 for both Daala and VPx TX. */ - for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16; -#if !CONFIG_DAALA_DCT4 - if (i == 0 && temp_in[0]) temp_in[0] += 1; -#endif -#if CONFIG_LGT - if (use_lgt_col) - flgt4(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4]; -#if CONFIG_LGT - if (use_lgt_row) - flgt4(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); -#if CONFIG_DAALA_DCT4 - /* Daala TX has orthonormal scaling; shift down by only 1 to achieve - the usual VPx coefficient left-shift of 3. 
*/ - for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1; -#else - for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2; -#endif - } - } -} - -void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct8, fdct4 }, // DCT_DCT - { fadst8, fdct4 }, // ADST_DCT - { fdct8, fadst4 }, // DCT_ADST - { fadst8, fadst4 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst8, fdct4 }, // FLIPADST_DCT - { fdct8, fadst4 }, // DCT_FLIPADST - { fadst8, fadst4 }, // FLIPADST_FLIPADST - { fadst8, fadst4 }, // ADST_FLIPADST - { fadst8, fadst4 }, // FLIPADST_ADST - { fidtx8, fidtx4 }, // IDTX - { fdct8, fidtx4 }, // V_DCT - { fidtx8, fdct4 }, // H_DCT - { fadst8, fidtx4 }, // V_ADST - { fidtx8, fadst4 }, // H_ADST - { fadst8, fidtx4 }, // V_FLIPADST - { fidtx8, fadst4 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 4; - const int n2 = 8; - tran_low_t out[8 * 4]; - tran_low_t temp_in[8], temp_out[8]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[8 * 4]; - maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // Rows - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); -#if CONFIG_LGT - if (use_lgt_row) - flgt4(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) 
temp_in[j] = out[j + i * n2]; -#if CONFIG_LGT - if (use_lgt_col) - flgt8(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct4, fdct8 }, // DCT_DCT - { fadst4, fdct8 }, // ADST_DCT - { fdct4, fadst8 }, // DCT_ADST - { fadst4, fadst8 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst4, fdct8 }, // FLIPADST_DCT - { fdct4, fadst8 }, // DCT_FLIPADST - { fadst4, fadst8 }, // FLIPADST_FLIPADST - { fadst4, fadst8 }, // ADST_FLIPADST - { fadst4, fadst8 }, // FLIPADST_ADST - { fidtx4, fidtx8 }, // IDTX - { fdct4, fidtx8 }, // V_DCT - { fidtx4, fdct8 }, // H_DCT - { fadst4, fidtx8 }, // V_ADST - { fidtx4, fadst8 }, // H_ADST - { fadst4, fidtx8 }, // V_FLIPADST - { fidtx4, fadst8 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 4; - const int n2 = 8; - tran_low_t out[8 * 4]; - tran_low_t temp_in[8], temp_out[8]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[8 * 4]; - maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // Columns - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); -#if CONFIG_LGT - if (use_lgt_col) - flgt4(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - 
ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; -#if CONFIG_LGT - if (use_lgt_row) - flgt8(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct16, fdct4 }, // DCT_DCT - { fadst16, fdct4 }, // ADST_DCT - { fdct16, fadst4 }, // DCT_ADST - { fadst16, fadst4 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst16, fdct4 }, // FLIPADST_DCT - { fdct16, fadst4 }, // DCT_FLIPADST - { fadst16, fadst4 }, // FLIPADST_FLIPADST - { fadst16, fadst4 }, // ADST_FLIPADST - { fadst16, fadst4 }, // FLIPADST_ADST - { fidtx16, fidtx4 }, // IDTX - { fdct16, fidtx4 }, // V_DCT - { fidtx16, fdct4 }, // H_DCT - { fadst16, fidtx4 }, // V_ADST - { fidtx16, fadst4 }, // H_ADST - { fadst16, fidtx4 }, // V_FLIPADST - { fidtx16, fadst4 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 4; - const int n4 = 16; - tran_low_t out[16 * 4]; - tran_low_t temp_in[16], temp_out[16]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[16 * 4]; - maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row); -#endif - - // Rows - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; -#if CONFIG_LGT - if (use_lgt_row) - flgt4(temp_in, temp_out, 
lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; - ht.cols(temp_in, temp_out); - for (j = 0; j < n4; ++j) - output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct4, fdct16 }, // DCT_DCT - { fadst4, fdct16 }, // ADST_DCT - { fdct4, fadst16 }, // DCT_ADST - { fadst4, fadst16 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst4, fdct16 }, // FLIPADST_DCT - { fdct4, fadst16 }, // DCT_FLIPADST - { fadst4, fadst16 }, // FLIPADST_FLIPADST - { fadst4, fadst16 }, // ADST_FLIPADST - { fadst4, fadst16 }, // FLIPADST_ADST - { fidtx4, fidtx16 }, // IDTX - { fdct4, fidtx16 }, // V_DCT - { fidtx4, fdct16 }, // H_DCT - { fadst4, fidtx16 }, // V_ADST - { fidtx4, fadst16 }, // H_ADST - { fadst4, fidtx16 }, // V_FLIPADST - { fidtx4, fadst16 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 4; - const int n4 = 16; - tran_low_t out[16 * 4]; - tran_low_t temp_in[16], temp_out[16]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[16 * 4]; - maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col); -#endif - - // Columns - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; -#if CONFIG_LGT - if (use_lgt_col) - flgt4(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, 
temp_out); - for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n4; ++j) - output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct16, fdct8 }, // DCT_DCT - { fadst16, fdct8 }, // ADST_DCT - { fdct16, fadst8 }, // DCT_ADST - { fadst16, fadst8 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst16, fdct8 }, // FLIPADST_DCT - { fdct16, fadst8 }, // DCT_FLIPADST - { fadst16, fadst8 }, // FLIPADST_FLIPADST - { fadst16, fadst8 }, // ADST_FLIPADST - { fadst16, fadst8 }, // FLIPADST_ADST - { fidtx16, fidtx8 }, // IDTX - { fdct16, fidtx8 }, // V_DCT - { fidtx16, fdct8 }, // H_DCT - { fadst16, fidtx8 }, // V_ADST - { fidtx16, fadst8 }, // H_ADST - { fadst16, fidtx8 }, // V_FLIPADST - { fidtx16, fadst8 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 8; - const int n2 = 16; - tran_low_t out[16 * 8]; - tran_low_t temp_in[16], temp_out[16]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[16 * 8]; - maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // Rows - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); -#if CONFIG_LGT - if (use_lgt_row) - flgt8(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); - for 
(j = 0; j < n; ++j) - out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.cols(temp_in, temp_out); - for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j]; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct8, fdct16 }, // DCT_DCT - { fadst8, fdct16 }, // ADST_DCT - { fdct8, fadst16 }, // DCT_ADST - { fadst8, fadst16 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst8, fdct16 }, // FLIPADST_DCT - { fdct8, fadst16 }, // DCT_FLIPADST - { fadst8, fadst16 }, // FLIPADST_FLIPADST - { fadst8, fadst16 }, // ADST_FLIPADST - { fadst8, fadst16 }, // FLIPADST_ADST - { fidtx8, fidtx16 }, // IDTX - { fdct8, fidtx16 }, // V_DCT - { fidtx8, fdct16 }, // H_DCT - { fadst8, fidtx16 }, // V_ADST - { fidtx8, fadst16 }, // H_ADST - { fadst8, fidtx16 }, // V_FLIPADST - { fidtx8, fadst16 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 8; - const int n2 = 16; - tran_low_t out[16 * 8]; - tran_low_t temp_in[16], temp_out[16]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[16 * 8]; - maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); -#endif - - // Columns - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); -#if CONFIG_LGT - if (use_lgt_col) - flgt8(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, temp_out); - for (j = 0; j 
< n; ++j) - out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j]; - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct32, fdct8 }, // DCT_DCT - { fhalfright32, fdct8 }, // ADST_DCT - { fdct32, fadst8 }, // DCT_ADST - { fhalfright32, fadst8 }, // ADST_ADST -#if CONFIG_EXT_TX - { fhalfright32, fdct8 }, // FLIPADST_DCT - { fdct32, fadst8 }, // DCT_FLIPADST - { fhalfright32, fadst8 }, // FLIPADST_FLIPADST - { fhalfright32, fadst8 }, // ADST_FLIPADST - { fhalfright32, fadst8 }, // FLIPADST_ADST - { fidtx32, fidtx8 }, // IDTX - { fdct32, fidtx8 }, // V_DCT - { fidtx32, fdct8 }, // H_DCT - { fhalfright32, fidtx8 }, // V_ADST - { fidtx32, fadst8 }, // H_ADST - { fhalfright32, fidtx8 }, // V_FLIPADST - { fidtx32, fadst8 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 8; - const int n4 = 32; - tran_low_t out[32 * 8]; - tran_low_t temp_in[32], temp_out[32]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 8]; - maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_row[1]; - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // Rows - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; -#if CONFIG_LGT - if (use_lgt_row) - flgt8(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) out[j * 
n4 + i] = temp_out[j]; - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; - ht.cols(temp_in, temp_out); - for (j = 0; j < n4; ++j) - output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct8, fdct32 }, // DCT_DCT - { fadst8, fdct32 }, // ADST_DCT - { fdct8, fhalfright32 }, // DCT_ADST - { fadst8, fhalfright32 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst8, fdct32 }, // FLIPADST_DCT - { fdct8, fhalfright32 }, // DCT_FLIPADST - { fadst8, fhalfright32 }, // FLIPADST_FLIPADST - { fadst8, fhalfright32 }, // ADST_FLIPADST - { fadst8, fhalfright32 }, // FLIPADST_ADST - { fidtx8, fidtx32 }, // IDTX - { fdct8, fidtx32 }, // V_DCT - { fidtx8, fdct32 }, // H_DCT - { fadst8, fidtx32 }, // V_ADST - { fidtx8, fhalfright32 }, // H_ADST - { fadst8, fidtx32 }, // V_FLIPADST - { fidtx8, fhalfright32 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 8; - const int n4 = 32; - tran_low_t out[32 * 8]; - tran_low_t temp_in[32], temp_out[32]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 8]; - maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); -#endif - - // Columns - for (i = 0; i < n4; ++i) { - for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; -#if CONFIG_LGT - if (use_lgt_col) - flgt8(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) out[j * n4 + i] = 
temp_out[j]; - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n4; ++j) - output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - // Note: overall scale factor of transform is 8 times unitary -} - -void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct32, fdct16 }, // DCT_DCT - { fhalfright32, fdct16 }, // ADST_DCT - { fdct32, fadst16 }, // DCT_ADST - { fhalfright32, fadst16 }, // ADST_ADST -#if CONFIG_EXT_TX - { fhalfright32, fdct16 }, // FLIPADST_DCT - { fdct32, fadst16 }, // DCT_FLIPADST - { fhalfright32, fadst16 }, // FLIPADST_FLIPADST - { fhalfright32, fadst16 }, // ADST_FLIPADST - { fhalfright32, fadst16 }, // FLIPADST_ADST - { fidtx32, fidtx16 }, // IDTX - { fdct32, fidtx16 }, // V_DCT - { fidtx32, fdct16 }, // H_DCT - { fhalfright32, fidtx16 }, // V_ADST - { fidtx32, fadst16 }, // H_ADST - { fhalfright32, fidtx16 }, // V_FLIPADST - { fidtx32, fadst16 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 16; - const int n2 = 32; - tran_low_t out[32 * 16]; - tran_low_t temp_in[32], temp_out[32]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 16]; - maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); -#endif - - // Rows - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.cols(temp_in, 
temp_out); - for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j]; - } - // Note: overall scale factor of transform is 4 times unitary -} - -void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct16, fdct32 }, // DCT_DCT - { fadst16, fdct32 }, // ADST_DCT - { fdct16, fhalfright32 }, // DCT_ADST - { fadst16, fhalfright32 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst16, fdct32 }, // FLIPADST_DCT - { fdct16, fhalfright32 }, // DCT_FLIPADST - { fadst16, fhalfright32 }, // FLIPADST_FLIPADST - { fadst16, fhalfright32 }, // ADST_FLIPADST - { fadst16, fhalfright32 }, // FLIPADST_ADST - { fidtx16, fidtx32 }, // IDTX - { fdct16, fidtx32 }, // V_DCT - { fidtx16, fdct32 }, // H_DCT - { fadst16, fidtx32 }, // V_ADST - { fidtx16, fhalfright32 }, // H_ADST - { fadst16, fidtx32 }, // V_FLIPADST - { fidtx16, fhalfright32 }, // H_FLIPADST -#endif - }; - const transform_2d ht = FHT[tx_type]; - const int n = 16; - const int n2 = 32; - tran_low_t out[32 * 16]; - tran_low_t temp_in[32], temp_out[32]; - int i, j; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 16]; - maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); -#endif - - // Columns - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); - ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j]; - } - // Note: overall scale factor of transform is 4 times unitary -} - -void 
av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif -#if !CONFIG_DAALA_DCT8 - if (tx_type == DCT_DCT) { - aom_fdct8x8_c(input, output, stride); - return; - } -#endif - { - static const transform_2d FHT[] = { -#if CONFIG_DAALA_DCT8 - { daala_fdct8, daala_fdct8 }, // DCT_DCT - { daala_fdst8, daala_fdct8 }, // ADST_DCT - { daala_fdct8, daala_fdst8 }, // DCT_ADST - { daala_fdst8, daala_fdst8 }, // ADST_ADST -#if CONFIG_EXT_TX - { daala_fdst8, daala_fdct8 }, // FLIPADST_DCT - { daala_fdct8, daala_fdst8 }, // DCT_FLIPADST - { daala_fdst8, daala_fdst8 }, // FLIPADST_FLIPADST - { daala_fdst8, daala_fdst8 }, // ADST_FLIPADST - { daala_fdst8, daala_fdst8 }, // FLIPADST_ADST - { daala_idtx8, daala_idtx8 }, // IDTX - { daala_fdct8, daala_idtx8 }, // V_DCT - { daala_idtx8, daala_fdct8 }, // H_DCT - { daala_fdst8, daala_idtx8 }, // V_ADST - { daala_idtx8, daala_fdst8 }, // H_ADST - { daala_fdst8, daala_idtx8 }, // V_FLIPADST - { daala_idtx8, daala_fdst8 }, // H_FLIPADST -#endif -#else - { fdct8, fdct8 }, // DCT_DCT - { fadst8, fdct8 }, // ADST_DCT - { fdct8, fadst8 }, // DCT_ADST - { fadst8, fadst8 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst8, fdct8 }, // FLIPADST_DCT - { fdct8, fadst8 }, // DCT_FLIPADST - { fadst8, fadst8 }, // FLIPADST_FLIPADST - { fadst8, fadst8 }, // ADST_FLIPADST - { fadst8, fadst8 }, // FLIPADST_ADST - { fidtx8, fidtx8 }, // IDTX - { fdct8, fidtx8 }, // V_DCT - { fidtx8, fdct8 }, // H_DCT - { fadst8, fidtx8 }, // V_ADST - { fidtx8, fadst8 }, // H_ADST - { fadst8, fidtx8 }, // V_FLIPADST - { fidtx8, fadst8 }, // H_FLIPADST -#endif -#endif - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[64]; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - -#if CONFIG_EXT_TX - int16_t flipped_input[8 * 8]; 
- maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type); -#endif - -#if CONFIG_LGT - const tran_high_t *lgtmtx_col[1]; - const tran_high_t *lgtmtx_row[1]; - int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col); - int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row); -#endif - - // Columns - for (i = 0; i < 8; ++i) { -#if CONFIG_DAALA_DCT8 - for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16; -#else - for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4; -#endif -#if CONFIG_LGT - if (use_lgt_col) - flgt8(temp_in, temp_out, lgtmtx_col[0]); - else -#endif - ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8]; -#if CONFIG_LGT - if (use_lgt_row) - flgt8(temp_in, temp_out, lgtmtx_row[0]); - else -#endif - ht.rows(temp_in, temp_out); -#if CONFIG_DAALA_DCT8 - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; -#else - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; -#endif - } - } -} - -/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per - pixel. 
*/ -void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { - int i; - tran_high_t a1, b1, c1, d1, e1; - const int16_t *ip_pass0 = input; - const tran_low_t *ip = NULL; - tran_low_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ip_pass0[0 * stride]; - b1 = ip_pass0[1 * stride]; - c1 = ip_pass0[2 * stride]; - d1 = ip_pass0[3 * stride]; - - a1 += b1; - d1 = d1 - c1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= c1; - d1 += b1; - op[0] = (tran_low_t)a1; - op[4] = (tran_low_t)c1; - op[8] = (tran_low_t)d1; - op[12] = (tran_low_t)b1; - - ip_pass0++; - op++; - } - ip = output; - op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0]; - b1 = ip[1]; - c1 = ip[2]; - d1 = ip[3]; - - a1 += b1; - d1 -= c1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= c1; - d1 += b1; - op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); - op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); - op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); - op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); - - ip += 4; - op += 4; - } -} - -void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { -#if CONFIG_DAALA_DCT16 - { daala_fdct16, daala_fdct16 }, // DCT_DCT - { daala_fdst16, daala_fdct16 }, // ADST_DCT - { daala_fdct16, daala_fdst16 }, // DCT_ADST - { daala_fdst16, daala_fdst16 }, // ADST_ADST -#if CONFIG_EXT_TX - { daala_fdst16, daala_fdct16 }, // FLIPADST_DCT - { daala_fdct16, daala_fdst16 }, // DCT_FLIPADST - { daala_fdst16, daala_fdst16 }, // FLIPADST_FLIPADST - { daala_fdst16, daala_fdst16 }, // ADST_FLIPADST - { daala_fdst16, daala_fdst16 }, // FLIPADST_ADST - { daala_idtx16, daala_idtx16 }, // IDTX - { daala_fdct16, daala_idtx16 }, // V_DCT - { daala_idtx16, 
daala_fdct16 }, // H_DCT - { daala_fdst16, daala_idtx16 }, // V_ADST - { daala_idtx16, daala_fdst16 }, // H_ADST - { daala_fdst16, daala_idtx16 }, // V_FLIPADST - { daala_idtx16, daala_fdst16 }, // H_FLIPADST -#endif -#else - { fdct16, fdct16 }, // DCT_DCT - { fadst16, fdct16 }, // ADST_DCT - { fdct16, fadst16 }, // DCT_ADST - { fadst16, fadst16 }, // ADST_ADST -#if CONFIG_EXT_TX - { fadst16, fdct16 }, // FLIPADST_DCT - { fdct16, fadst16 }, // DCT_FLIPADST - { fadst16, fadst16 }, // FLIPADST_FLIPADST - { fadst16, fadst16 }, // ADST_FLIPADST - { fadst16, fadst16 }, // FLIPADST_ADST - { fidtx16, fidtx16 }, // IDTX - { fdct16, fidtx16 }, // V_DCT - { fidtx16, fdct16 }, // H_DCT - { fadst16, fidtx16 }, // V_ADST - { fidtx16, fadst16 }, // H_ADST - { fadst16, fidtx16 }, // V_FLIPADST - { fidtx16, fadst16 }, // H_FLIPADST -#endif -#endif - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[256]; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - -#if CONFIG_EXT_TX - int16_t flipped_input[16 * 16]; - maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type); -#endif - - // Columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) { -#if CONFIG_DAALA_DCT16 - temp_in[j] = input[j * stride + i] * 16; -#else - temp_in[j] = input[j * stride + i] * 4; -#endif - } - ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) { -#if CONFIG_DAALA_DCT16 - out[j * 16 + i] = temp_out[j]; -#else - out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; -#endif - } - } - - // Rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 16; ++j) { -#if CONFIG_DAALA_DCT16 - output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1; -#else - output[j + i * 16] = temp_out[j]; -#endif - } - } -} - -void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, - int stride) { - av1_fwht4x4_c(input, output, stride); -} - -void av1_fht32x32_c(const int16_t *input, tran_low_t 
*output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { -#if CONFIG_DAALA_DCT32 - { daala_fdct32, daala_fdct32 }, // DCT_DCT -#if CONFIG_EXT_TX - { daala_fdst32, daala_fdct32 }, // ADST_DCT - { daala_fdct32, daala_fdst32 }, // DCT_ADST - { daala_fdst32, daala_fdst32 }, // ADST_ADST - { daala_fdst32, daala_fdct32 }, // FLIPADST_DCT - { daala_fdct32, daala_fdst32 }, // DCT_FLIPADST - { daala_fdst32, daala_fdst32 }, // FLIPADST_FLIPADST - { daala_fdst32, daala_fdst32 }, // ADST_FLIPADST - { daala_fdst32, daala_fdst32 }, // FLIPADST_ADST - { daala_idtx32, daala_idtx32 }, // IDTX - { daala_fdct32, daala_idtx32 }, // V_DCT - { daala_idtx32, daala_fdct32 }, // H_DCT - { daala_fdst32, daala_idtx32 }, // V_ADST - { daala_idtx32, daala_fdst32 }, // H_ADST - { daala_fdst32, daala_idtx32 }, // V_FLIPADST - { daala_idtx32, daala_fdst32 }, // H_FLIPADST -#endif -#else - { fdct32, fdct32 }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright32, fdct32 }, // ADST_DCT - { fdct32, fhalfright32 }, // DCT_ADST - { fhalfright32, fhalfright32 }, // ADST_ADST - { fhalfright32, fdct32 }, // FLIPADST_DCT - { fdct32, fhalfright32 }, // DCT_FLIPADST - { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST - { fhalfright32, fhalfright32 }, // ADST_FLIPADST - { fhalfright32, fhalfright32 }, // FLIPADST_ADST - { fidtx32, fidtx32 }, // IDTX - { fdct32, fidtx32 }, // V_DCT - { fidtx32, fdct32 }, // H_DCT - { fhalfright32, fidtx32 }, // V_ADST - { fidtx32, fhalfright32 }, // H_ADST - { fhalfright32, fidtx32 }, // V_FLIPADST - { fidtx32, fhalfright32 }, // H_FLIPADST -#endif -#endif -#if CONFIG_MRC_TX - { fdct32, fdct32 }, // MRC_TX -#endif // CONFIG_MRC_TX - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[1024]; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 32]; - maybe_flip_input(&input, &stride, 32, 32, 
flipped_input, tx_type); -#endif - -#if CONFIG_MRC_TX - if (tx_type == MRC_DCT) { - int16_t masked_input[32 * 32]; - get_masked_residual32(&input, &stride, txfm_param->dst, txfm_param->stride, - masked_input, txfm_param); - } -#endif // CONFIG_MRC_TX - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) { -#if CONFIG_DAALA_DCT32 - temp_in[j] = input[j * stride + i] * 16; -#else - temp_in[j] = input[j * stride + i] * 4; -#endif - } - ht.cols(temp_in, temp_out); - for (j = 0; j < 32; ++j) { -#if CONFIG_DAALA_DCT32 - out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); -#else - out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); -#endif - } - } - - // Rows - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - output[j + i * 32] = temp_out[j]; - } - } -} - -#if CONFIG_TX64X64 -#if !CONFIG_DAALA_DCT64 -#if CONFIG_EXT_TX -static void fidtx64(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 64; ++i) - output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2); -} - -// For use in lieu of ADST -static void fhalfright64(const tran_low_t *input, tran_low_t *output) { - int i; - tran_low_t inputhalf[32]; - for (i = 0; i < 32; ++i) { - output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2); - } - // Multiply input by sqrt(2) - for (i = 0; i < 32; ++i) { - inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2); - } - fdct32(inputhalf, output); - // Note overall scaling factor is 2 times unitary -} -#endif // CONFIG_EXT_TX - -static void fdct64_col(const tran_low_t *input, tran_low_t *output) { - int32_t in[64], out[64]; - int i; - for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64); - for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; -} - -static void fdct64_row(const tran_low_t *input, tran_low_t *output) { - 
int32_t in[64], out[64]; - int i; - for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64); - for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; -} -#endif - -void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { -#if CONFIG_DAALA_DCT64 - { daala_fdct64, daala_fdct64 }, // DCT_DCT -#if CONFIG_EXT_TX - { daala_fdst64, daala_fdct64 }, // ADST_DCT - { daala_fdct64, daala_fdst64 }, // DCT_ADST - { daala_fdst64, daala_fdst64 }, // ADST_ADST - { daala_fdst64, daala_fdct64 }, // FLIPADST_DCT - { daala_fdct64, daala_fdst64 }, // DCT_FLIPADST - { daala_fdst64, daala_fdst64 }, // FLIPADST_FLIPADST - { daala_fdst64, daala_fdst64 }, // ADST_FLIPADST - { daala_fdst64, daala_fdst64 }, // FLIPADST_ADST - { daala_idtx64, daala_idtx64 }, // IDTX - { daala_fdct64, daala_idtx64 }, // V_DCT - { daala_idtx64, daala_fdct64 }, // H_DCT - { daala_fdst64, daala_idtx64 }, // V_ADST - { daala_idtx64, daala_fdst64 }, // H_ADST - { daala_fdst64, daala_idtx64 }, // V_FLIPADST - { daala_idtx64, daala_fdst64 }, // H_FLIPADST -#endif // CONFIG_EXT_TX -#else - { fdct64_col, fdct64_row }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright64, fdct64_row }, // ADST_DCT - { fdct64_col, fhalfright64 }, // DCT_ADST - { fhalfright64, fhalfright64 }, // ADST_ADST - { fhalfright64, fdct64_row }, // FLIPADST_DCT - { fdct64_col, fhalfright64 }, // DCT_FLIPADST - { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST - { fhalfright64, fhalfright64 }, // ADST_FLIPADST - { fhalfright64, fhalfright64 }, // FLIPADST_ADST - { fidtx64, fidtx64 }, // IDTX - { fdct64_col, fidtx64 }, // V_DCT - { fidtx64, fdct64_row }, // H_DCT - { fhalfright64, fidtx64 
}, // V_ADST - { fidtx64, fhalfright64 }, // H_ADST - { fhalfright64, fidtx64 }, // V_FLIPADST - { fidtx64, fhalfright64 }, // H_FLIPADST -#endif // CONFIG_EXT_TX -#endif // CONFIG_DAALA_DCT64 - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[4096]; - int i, j; - tran_low_t temp_in[64], temp_out[64]; -#if CONFIG_EXT_TX - int16_t flipped_input[64 * 64]; - maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type); -#endif - - // Columns - for (i = 0; i < 64; ++i) { -#if CONFIG_DAALA_DCT64 - for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16; - ht.cols(temp_in, temp_out); - for (j = 0; j < 64; ++j) - out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3; - -#else - for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i]; - ht.cols(temp_in, temp_out); - for (j = 0; j < 64; ++j) - out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; -#endif - } - - // Rows - for (i = 0; i < 64; ++i) { - for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 64; ++j) -#if CONFIG_DAALA_DCT64 - output[j + i * 64] = temp_out[j]; -#else - output[j + i * 64] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); -#endif - } -} - -void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct32, fdct64_row }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright32, fdct64_row }, // ADST_DCT - { fdct32, fhalfright64 }, // DCT_ADST - { fhalfright32, fhalfright64 }, // ADST_ADST - { fhalfright32, fdct64_row }, // FLIPADST_DCT - { fdct32, fhalfright64 }, // DCT_FLIPADST - { fhalfright32, fhalfright64 }, // FLIPADST_FLIPADST - { fhalfright32, fhalfright64 }, // ADST_FLIPADST - { fhalfright32, fhalfright64 }, 
// FLIPADST_ADST - { fidtx32, fidtx64 }, // IDTX - { fdct32, fidtx64 }, // V_DCT - { fidtx32, fdct64_row }, // H_DCT - { fhalfright32, fidtx64 }, // V_ADST - { fidtx32, fhalfright64 }, // H_ADST - { fhalfright32, fidtx64 }, // V_FLIPADST - { fidtx32, fhalfright64 }, // H_FLIPADST -#endif // CONFIG_EXT_TX - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[2048]; - int i, j; - tran_low_t temp_in[64], temp_out[64]; - const int n = 32; - const int n2 = 64; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 64]; - maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); -#endif - - // Columns - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2); - ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[j + i * n2] = - (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } -} - -void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct64_row, fdct32 }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright64, fdct32 }, // ADST_DCT - { fdct64_row, fhalfright32 }, // DCT_ADST - { fhalfright64, fhalfright32 }, // ADST_ADST - { fhalfright64, fdct32 }, // FLIPADST_DCT - { fdct64_row, fhalfright32 }, // DCT_FLIPADST - { fhalfright64, fhalfright32 }, // FLIPADST_FLIPADST - { fhalfright64, fhalfright32 }, // ADST_FLIPADST - { fhalfright64, fhalfright32 }, // FLIPADST_ADST - { fidtx64, fidtx32 }, // IDTX - { fdct64_row, fidtx32 }, // V_DCT - { fidtx64, fdct32 
}, // H_DCT - { fhalfright64, fidtx32 }, // V_ADST - { fidtx64, fhalfright32 }, // H_ADST - { fhalfright64, fidtx32 }, // V_FLIPADST - { fidtx64, fhalfright32 }, // H_FLIPADST -#endif // CONFIG_EXT_TX - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[32 * 64]; - int i, j; - tran_low_t temp_in[64], temp_out[64]; - const int n = 32; - const int n2 = 64; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 64]; - maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); -#endif - - // Rows - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2); - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.cols(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } -} -#endif // CONFIG_TX64X64 - -#if CONFIG_EXT_TX -// Forward identity transform. 
-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, - int bsx, int bsy, TX_TYPE tx_type) { - int r, c; - const int pels = bsx * bsy; - const int shift = 3 - ((pels > 256) + (pels > 1024)); - if (tx_type == IDTX) { - for (r = 0; r < bsy; ++r) { - for (c = 0; c < bsx; ++c) coeff[c] = src_diff[c] * (1 << shift); - src_diff += stride; - coeff += bsx; - } - } -} -#endif // CONFIG_EXT_TX -#endif // !AV1_DCT_GTEST diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c new file mode 100644 index 000000000..0a57ebcfb --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.c @@ -0,0 +1,144 @@ +#include +#include +#include + +#include "config/av1_rtcd.h" +#include "av1/encoder/dwt.h" + +// Note: block length must be even for this implementation +static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass, + tran_low_t *highpass) { + int n; + tran_low_t r, *a, *b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++) * 2; + *b++ = *x - ((r + x[1] + 1) >> 1); + x++; + } + *a = (r = *x++) * 2; + *b = *x - r; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass, + tran_low_t *highpass) { + int n; + tran_low_t r, *a, *b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++); + *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2; + x++; + } + *a = (r = *x++); + *b = (*x - r + 1) >> 1; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +static void dyadic_analyze_53_uint8_input(int levels, int width, int height, + uint8_t *x, int pitch_x, + tran_low_t *c, int pitch_c, + int dwt_scale_bits, int hbd) { + int lv, i, j, nh, nw, hh = height, hw = width; + tran_low_t buffer[2 * DWT_MAX_LENGTH]; + + if (hbd) { + uint16_t *x16 = 
CONVERT_TO_SHORTPTR(x); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits; + } + } + } else { + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits; + } + } + } + + for (lv = 0; lv < levels; lv++) { + nh = hh; + hh = (hh + 1) >> 1; + nw = hw; + hw = (hw + 1) >> 1; + if ((nh < 2) || (nw < 2)) return; + for (i = 0; i < nh; i++) { + memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t)); + analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); + } + for (j = 0; j < nw; j++) { + for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j]; + analysis_53_col(nh, buffer + nh, buffer, buffer + hh); + for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i]; + } + } +} + +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd) { + dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd); +} + +int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) { + int acsad = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]); + } + return acsad; +} + +uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) { + uint64_t acsad = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + if (r > 0 || c > 0) acsad += abs(output[r * stride + c]); + } + + return acsad; +} + +uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) { + int sum = 0; + uint32_t sse = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + sum += input[r * stride + c]; + sse += input[r * stride + c] * input[r * stride + c]; + } + return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh)); +} + +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) { + tran_low_t output[64]; + + av1_fdwt8x8_uint8_input_c(input, output, stride, hbd); + 
return av1_haar_ac_sad(output, 8, 8, 8); +} diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h new file mode 100644 index 000000000..9a86db2f1 --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.h @@ -0,0 +1,9 @@ +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd); +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h deleted file mode 100644 index 30ea8521f..000000000 --- a/third_party/aom/av1/encoder/encint.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -/* clang-format off */ - -#if !defined(_encint_H) -# define _encint_H (1) - -typedef struct daala_enc_ctx od_enc_ctx; -typedef struct od_params_ctx od_params_ctx; -typedef struct od_rollback_buffer od_rollback_buffer; - -# include "aom_dsp/entenc.h" -# include "av1/common/odintrin.h" -# include "av1/common/pvq_state.h" - -struct daala_enc_ctx{ - /* Stores context-adaptive CDFs for PVQ. */ - od_state state; - /* AOM entropy encoder. 
*/ - aom_writer w; - int use_activity_masking; - /* Mode of quantization matrice : FLAT (0) or HVS (1) */ - int qm; - /*Normalized PVQ lambda for use where we've already performed - quantization.*/ - double pvq_norm_lambda; - double pvq_norm_lambda_dc; -}; - -// from daalaenc.h -/**The encoder context.*/ -typedef struct daala_enc_ctx daala_enc_ctx; - -/** Holds important encoder information so we can roll back decisions */ -struct od_rollback_buffer { - od_ec_enc ec; - od_adapt_ctx adapt; -}; - -void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf); -void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf); - -#endif diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index f79a678fb..027b80a16 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -13,9 +13,9 @@ #include #include -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -23,6 +23,11 @@ #include "aom_ports/aom_timer.h" #include "aom_ports/system_state.h" +#if CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" @@ -36,105 +41,55 @@ #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" +#include "av1/encoder/ab_partition_model_weights.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" -#if CONFIG_SUPERTX -#include "av1/encoder/cost.h" -#endif -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION #include 
"av1/encoder/global_motion.h" -#endif // CONFIG_GLOBAL_MOTION #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" +#include "av1/encoder/ml.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/common/pvq.h" -#include "av1/encoder/pvq_encoder.h" -#endif -#if CONFIG_HIGHBITDEPTH -#define IF_HBD(...) __VA_ARGS__ -#else -#define IF_HBD(...) -#endif // CONFIG_HIGHBITDEPTH - -static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); - -#if CONFIG_SUPERTX -static int check_intra_b(PICK_MODE_CONTEXT *ctx); - -static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree); -static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int plane, - BLOCK_SIZE bsize_pred, int b_sub8x8, int block); -static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, - PC_TREE *pc_tree); -static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, int mi_row_ori, int mi_col_ori, - RUN_TYPE dry_run, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], - int dst_stride[3], PC_TREE *pc_tree); -static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize, - RUN_TYPE dry_run, PC_TREE *pc_tree); -static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, 
- TX_TYPE *best_tx, PC_TREE *pc_tree); -#endif // CONFIG_SUPERTX + +static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rate); // This is used as a reference when computing the source variance for the // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, // which will be faster. static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, -#if CONFIG_EXT_PARTITION - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -#endif // CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }; -#if CONFIG_HIGHBITDEPTH static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { - 128, 128, 128, 128, 
128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, -#if CONFIG_EXT_PARTITION - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -#endif // CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }; static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { @@ -146,7 +101,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, -#if CONFIG_EXT_PARTITION 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, @@ -155,7 +109,6 @@ static const uint16_t 
AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 -#endif // CONFIG_EXT_PARTITION }; static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { @@ -168,8 +121,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, - 128 * 16, -#if CONFIG_EXT_PARTITION 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, @@ -179,10 +130,17 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, - 128 * 16 -#endif // CONFIG_EXT_PARTITION + 128 * 16, 128 * 16 +}; + +#if CONFIG_FP_MB_STATS +static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = { + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4 }; -#endif // CONFIG_HIGHBITDEPTH +static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = { + 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2 +}; +#endif // CONFIG_FP_MB_STATS unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, const struct buf_2d *ref, @@ -193,7 +151,6 @@ unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -#if CONFIG_HIGHBITDEPTH unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) { @@ 
-218,7 +175,6 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, } return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -#endif // CONFIG_HIGHBITDEPTH static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, const struct buf_2d *ref, @@ -266,24 +222,21 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - set_skip_context(xd, mi_row, mi_col); -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); - xd->max_tx_size = max_txsize_lookup[bsize]; -#endif + set_skip_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); // Set up destination pointers. av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); + mi_col, 0, num_planes); // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. @@ -293,18 +246,15 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND; x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND; - set_plane_n4(xd, mi_width, mi_height); + set_plane_n4(xd, mi_width, mi_height, num_planes); // Set up distance of MB to edge of frame in 1/8th pel units. 
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, + cm->mi_cols); // Set up source buffers. - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); // R/D setup. x->rdmult = cpi->rd.RDMULT; @@ -323,292 +273,111 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - mbmi = &xd->mi[0]->mbmi; -#if CONFIG_CFL - xd->cfl->mi_row = mi_row; - xd->cfl->mi_col = mi_col; -#endif + mbmi = xd->mi[0]; + xd->cfl.mi_row = mi_row; + xd->cfl.mi_col = mi_col; + + mbmi->segment_id = 0; // Setup segment ID. if (seg->enabled) { - if (!cpi->vaq_refresh) { + if (seg->enabled && !cpi->vaq_refresh) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + mbmi->segment_id = + map ? 
get_segment_id(cm, map, bsize, mi_row, mi_col) : 0; } av1_init_plane_quantizers(cpi, x, mbmi->segment_id); - } else { - mbmi->segment_id = 0; - } - -#if CONFIG_SUPERTX - mbmi->segment_id_supertx = MAX_SEGMENTS; -#endif // CONFIG_SUPERTX -} - -#if CONFIG_SUPERTX -static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; -#if CONFIG_DEPENDENT_HORZTILES - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col, cm->dependent_horz_tiles); -#else - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); -#endif - - // Set up distance of MB to edge of frame in 1/8th pel units. - assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); -} - -static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row_pred, - int mi_col_pred, int mi_row_ori, int mi_col_ori, - BLOCK_SIZE bsize_pred) { - // Used in supertx - // (mi_row_ori, mi_col_ori, bsize_ori): region for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - const int mi_width = mi_size_wide[bsize_pred]; - const int mi_height = mi_size_high[bsize_pred]; - -#if CONFIG_DEPENDENT_HORZTILES - set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori, - cm->dependent_horz_tiles); -#else - set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori); -#endif - - // Set up limit values for MV components. - // Mv beyond the range do not produce new/different prediction block. 
- x->mv_limits.row_min = - -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND); - x->mv_limits.col_min = - -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND); - x->mv_limits.row_max = - (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND; - x->mv_limits.col_max = - (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND; - -// Set up distance of MB to edge of frame in 1/8th pel units. -#if !CONFIG_CB4X4 - assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) && - !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8]))); -#endif - set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - xd->up_available = (mi_row_ori > tile->mi_row_start); - xd->left_available = (mi_col_ori > tile->mi_col_start); - - // R/D setup. - x->rdmult = cpi->rd.RDMULT; -} - -static void set_segment_id_supertx(const AV1_COMP *const cpi, - MACROBLOCK *const x, const int mi_row, - const int mi_col, const BLOCK_SIZE bsize) { - const AV1_COMMON *cm = &cpi->common; - const struct segmentation *seg = &cm->seg; - const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col); - const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row); - const int mi_offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO **const mip = cm->mi_grid_visible + mi_offset; - int r, c; - int seg_id_supertx = MAX_SEGMENTS; - - if (!seg->enabled) { - seg_id_supertx = 0; - } else { - // Find the minimum segment_id - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - seg_id_supertx = - AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx); - assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS); - - // Initialize plane quantisers - av1_init_plane_quantizers(cpi, x, seg_id_supertx); } - - // Assign the the segment_id back to segment_id_supertx - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - mip[r * 
cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx; } -#endif // CONFIG_SUPERTX -#if CONFIG_DUAL_FILTER -static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, - MB_MODE_INFO *mbmi) { +static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) { InterpFilter filters[2]; - InterpFilter default_filter = av1_unswitchable_filter(cm->interp_filter); for (int dir = 0; dir < 2; ++dir) { - filters[dir] = ((!has_subpel_mv_component(xd->mi[0], xd, dir) && - (mbmi->ref_frame[1] == NONE_FRAME || - !has_subpel_mv_component(xd->mi[0], xd, dir + 2))) - ? default_filter - : av1_extract_interp_filter(mbmi->interp_filters, dir)); + filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir); } mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]); } -static void update_filter_type_count(FRAME_COUNTS *counts, +static void update_filter_type_count(uint8_t allow_update_cdf, + FRAME_COUNTS *counts, const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, dir); - ++counts->switchable_interp[ctx][filter]; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + ++counts->switchable_interp[ctx][filter]; + if (allow_update_cdf) { update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, SWITCHABLE_FILTERS); } } } -#endif -#if CONFIG_GLOBAL_MOTION + static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize, const MB_MODE_INFO *mbmi, RD_COUNTS *rdc) { - if (mode == ZEROMV || mode == ZERO_ZEROMV) { - const int num_4x4s = - num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize]; + if (mode == 
GLOBALMV || mode == GLOBAL_GLOBALMV) { + const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize]; int ref; for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s; } } } -#endif // CONFIG_GLOBAL_MOTION -static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, +static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, const TX_MODE tx_mode) { + MACROBLOCKD *const xd = &x->e_mbd; if (xd->lossless[mbmi->segment_id]) { mbmi->tx_size = TX_4X4; } else if (tx_mode != TX_MODE_SELECT) { - mbmi->tx_size = - tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi)); - } -} - -static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, - int8_t rf_type) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - - const int bw = xd->n8_w << MI_SIZE_LOG2; - const int bh = xd->n8_h << MI_SIZE_LOG2; - int ref_mv_idx = mbmi->ref_mv_idx; - MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type]; - - if (has_second_ref(mbmi)) { - // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx - // (like NEARMV) instead - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx += 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - mbmi->pred_mv[0] = this_mv; - mi_pred_mv[0] = this_mv; - } - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - mbmi->pred_mv[1] = this_mv; - mi_pred_mv[1] = this_mv; - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - // Special case: SR_NEAR_NEWMV uses 1 + mbmi->ref_mv_idx - // (like NEARMV) 
instead - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx += 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - mbmi->pred_mv[0] = this_mv; - mi_pred_mv[0] = this_mv; - } -#endif // CONFIG_COMPOUND_SINGLEREF + mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode); } else { - if (mbmi->mode == NEWMV) { - int i; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - int_mv this_mv = (i == 0) ? curr_ref_mv_stack[ref_mv_idx].this_mv - : curr_ref_mv_stack[ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv; - mbmi->pred_mv[i] = this_mv; - mi_pred_mv[i] = this_mv; - } - } + BLOCK_SIZE bsize = mbmi->sb_type; + TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize); + mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size); + } + if (is_inter_block(mbmi)) { + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); } + memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); + av1_zero(x->blk_skip); + x->skip = 0; } -static void update_state(const AV1_COMP *const cpi, ThreadData *td, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, RUN_TYPE dry_run) { +static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int i, x_idx, y; const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); RD_COUNTS *const rdc = &td->rd_counts; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MODE_INFO *mi_addr = xd->mi[0]; 
+ MB_MODE_INFO *mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; const struct segmentation *const seg = &cm->seg; - const int bw = mi_size_wide[mi->mbmi.sb_type]; - const int bh = mi_size_high[mi->mbmi.sb_type]; + const int bw = mi_size_wide[mi->sb_type]; + const int bh = mi_size_high[mi->sb_type]; const int mis = cm->mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; - const int unify_bsize = CONFIG_CB4X4; - int8_t rf_type; - -#if !CONFIG_SUPERTX - assert(mi->mbmi.sb_type == bsize); -#endif + assert(mi->sb_type == bsize); *mi_addr = *mi; *x->mbmi_ext = ctx->mbmi_ext; -#if CONFIG_DUAL_FILTER - reset_intmv_filter_type(cm, xd, mbmi); -#endif + reset_intmv_filter_type(mi_addr); - rf_type = av1_ref_frame_type(mbmi->ref_frame); - if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && - (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { - set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); - } + memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + x->skip = ctx->skip; // If segmentation in use if (seg->enabled) { @@ -616,34 +385,29 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + mi_addr->segment_id = + map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0; + reset_tx_size(x, mi_addr, cm->tx_mode); } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. 
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, - bsize, ctx->rate, ctx->dist, x->skip); - reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip); + reset_tx_size(x, mi_addr, cm->tx_mode); } + if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) + mi_addr->uv_mode = UV_DC_PRED; } - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; pd[i].dqcoeff = ctx->dqcoeff[i]; -#if CONFIG_PVQ - pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; -#endif p[i].eobs = ctx->eobs[i]; -#if CONFIG_LV_MAP p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; -#endif // CONFIG_LV_MAP } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; -#if CONFIG_MRC_TX - xd->mrc_mask = ctx->mrc_mask; -#endif // CONFIG_MRC_TX // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -653,26 +417,7 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, xd->mi[x_idx + y * mis] = mi_addr; } -#if !CONFIG_EXT_DELTA_Q - if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ) - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#else - if (cpi->oxcf.aq_mode) - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#endif - - if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { - mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; - } - - x->skip = ctx->skip; - -#if CONFIG_VAR_TX - for (i = 0; i < 1; ++i) - memcpy(x->blk_skip[i], ctx->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif + if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id); if (dry_run) return; @@ -687,18 +432,16 @@ static void update_state(const AV1_COMP 
*const cpi, ThreadData *td, THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/, THR_D135_PRED /*D135_PRED*/, - THR_D117_PRED /*D117_PRED*/, - THR_D153_PRED /*D153_PRED*/, - THR_D207_PRED /*D207_PRED*/, - THR_D63_PRED /*D63_PRED*/, - THR_SMOOTH, /*SMOOTH_PRED*/ -#if CONFIG_SMOOTH_HV + THR_D113_PRED /*D113_PRED*/, + THR_D157_PRED /*D157_PRED*/, + THR_D203_PRED /*D203_PRED*/, + THR_D67_PRED /*D67_PRED*/, + THR_SMOOTH, /*SMOOTH_PRED*/ THR_SMOOTH_V, /*SMOOTH_V_PRED*/ THR_SMOOTH_H, /*SMOOTH_H_PRED*/ -#endif // CONFIG_SMOOTH_HV - THR_TM /*TM_PRED*/, + THR_PAETH /*PAETH_PRED*/, }; - ++mode_chosen_counts[kf_mode_index[mbmi->mode]]; + ++mode_chosen_counts[kf_mode_index[mi_addr->mode]]; } else { // Note how often each mode chosen as best ++mode_chosen_counts[ctx->best_mode_index]; @@ -706,42 +449,17 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, } #endif if (!frame_is_intra_only(cm)) { - if (is_inter_block(mbmi)) { - av1_update_mv_count(td); -#if CONFIG_GLOBAL_MOTION - if (bsize >= BLOCK_8X8) { - // TODO(sarahparker): global motion stats need to be handled per-tile - // to be compatible with tile-based threading. 
- update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); - } - } - } -#endif // CONFIG_GLOBAL_MOTION - if (cm->interp_filter == SWITCHABLE -#if CONFIG_WARPED_MOTION - && mbmi->motion_mode != WARPED_CAUSAL -#endif // CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION - && !is_nontrans_global_motion(xd) -#endif // CONFIG_GLOBAL_MOTION - ) { -#if CONFIG_DUAL_FILTER - update_filter_type_count(td->counts, xd, mbmi); -#else - const int switchable_ctx = av1_get_pred_context_switchable_interp(xd); - const InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, 0); - ++td->counts->switchable_interp[switchable_ctx][filter]; -#endif - } + if (is_inter_block(mi_addr)) { + // TODO(sarahparker): global motion stats need to be handled per-tile + // to be compatible with tile-based threading. 
+ update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc); + } + + if (cm->interp_filter == SWITCHABLE && + mi_addr->motion_mode != WARPED_CAUSAL && + !is_nontrans_global_motion(xd, xd->mi[0])) { + update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd, + mi_addr); } rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; @@ -754,1147 +472,793 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } -#if CONFIG_SUPERTX -static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, RUN_TYPE dry_run) { - int y, x_idx; -#if CONFIG_VAR_TX - int i; -#endif +void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, const int num_planes) { + // Set current frame pointer. + x->e_mbd.cur_buf = src; + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->sb_type, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, + x->e_mbd.plane[i].subsampling_y); + } +} + +static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int8_t segment_id) { const AV1_COMMON *const cm = &cpi->common; - RD_COUNTS *const rdc = &td->rd_counts; - MACROBLOCK *const x = &td->mb; + av1_init_plane_quantizers(cpi, x, segment_id); + aom_clear_system_state(); + int segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); +} + +static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { + const AV1_COMMON *const cm = &cpi->common; + + return av1_compute_rd_mult( + cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q); +} + +static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_STATS *rd_cost, PARTITION_TYPE partition, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MODE_INFO *mi_addr = xd->mi[0]; - const struct segmentation *const seg = &cm->seg; - const int mis = cm->mi_stride; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - const int unify_bsize = CONFIG_CB4X4; - int8_t rf_type; + MB_MODE_INFO *mbmi; + MB_MODE_INFO *ctx_mbmi = &ctx->mic; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const 
DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode; + int i, orig_rdmult; - *mi_addr = *mi; - *x->mbmi_ext = ctx->mbmi_ext; - assert(is_inter_block(mbmi)); - assert(mbmi->tx_size == ctx->mic.mbmi.tx_size); + aom_clear_system_state(); -#if CONFIG_DUAL_FILTER - reset_intmv_filter_type(cm, xd, mbmi); -#endif + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - rf_type = av1_ref_frame_type(mbmi->ref_frame); - if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && - (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { - set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); - } + mbmi = xd->mi[0]; - // If segmentation in use - if (seg->enabled) { - if (cpi->vaq_refresh) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); - mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - // For cyclic refresh mode, now update the segment map - // and set the segment id. - av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, - bsize, ctx->rate, ctx->dist, 1); - } else { - // Otherwise just set the segment id based on the current segment map - const uint8_t *const map = - seg->update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; - mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS; + if (ctx->rd_mode_is_ready) { + assert(ctx_mbmi->sb_type == bsize); + assert(ctx_mbmi->partition == partition); + *mbmi = *ctx_mbmi; + rd_cost->rate = ctx->rate; + rd_cost->dist = ctx->dist; + rd_cost->rdcost = ctx->rdcost; + } else { + mbmi->sb_type = bsize; + mbmi->partition = partition; } - // Restore the coding context of the MB to that that was in place - // when the mode was picked for it - for (y = 0; y < mi_height; y++) - for (x_idx = 0; x_idx < mi_width; x_idx++) - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx && - (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { - xd->mi[x_idx + y * mis] = mi_addr; - } -#if !CONFIG_CB4X4 - if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) { - mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; - } +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; #endif - x->skip = ctx->skip; - -#if CONFIG_VAR_TX - for (i = 0; i < 1; ++i) - memcpy(x->blk_skip[i], ctx->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - - if (!is_inter_block(mbmi) || mbmi->skip) - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } -#if CONFIG_VAR_TX - { - const TX_SIZE mtx = mbmi->tx_size; - const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1; - const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1; - int idy, idx; - mbmi->inter_tx_size[0][0] = mtx; - for (idy = 0; idy < num_4x4_blocks_high; ++idy) - for (idx = 0; idx < num_4x4_blocks_wide; ++idx) - mbmi->inter_tx_size[idy][idx] = mtx; - } -#endif // CONFIG_VAR_TX - 
// Turn motion variation off for supertx - mbmi->motion_mode = SIMPLE_TRANSLATION; + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; - if (dry_run) return; + if (!ctx->rd_mode_is_ready) { + ctx->skippable = 0; - if (!frame_is_intra_only(cm)) { - av1_update_mv_count(td); - -#if CONFIG_GLOBAL_MOTION - if (is_inter_block(mbmi)) { - if (bsize >= BLOCK_8X8) { - // TODO(sarahparker): global motion stats need to be handled per-tile - // to be compatible with tile-based threading. - update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); - } - } - } - } -#endif // CONFIG_GLOBAL_MOTION - - if (cm->interp_filter == SWITCHABLE -#if CONFIG_GLOBAL_MOTION - && !is_nontrans_global_motion(xd) -#endif // CONFIG_GLOBAL_MOTION - ) { -#if CONFIG_DUAL_FILTER - update_filter_type_count(td->counts, xd, mbmi); -#else - const int pred_ctx = av1_get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter]; -#endif - } + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip = 0; - rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; - rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; - rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; + // Reset skip mode flag. 
+ mbmi->skip_mode = 0; } - const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row); - av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); -} + x->skip_chroma_rd = + !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); -static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize, - RUN_TYPE dry_run, PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; - int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); - int i; -#if CONFIG_EXT_PARTITION_TYPES - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - PICK_MODE_CONTEXT *pmc = NULL; + if (ctx->rd_mode_is_ready) { + x->skip = ctx->skip; + *x->mbmi_ext = ctx->mbmi_ext; + return; + } - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } - if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); + // Save rdmult before it might be changed, so it can be restored later. 
+ orig_rdmult = x->rdmult; - switch (partition) { - case PARTITION_NONE: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize, - dry_run); - break; - case PARTITION_VERT: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, - subsize, dry_run); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row, - mi_col + hbs, subsize, dry_run); - } - pmc = &pc_tree->vertical_supertx; - break; - case PARTITION_HORZ: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col, - subsize, dry_run); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs, - mi_col, subsize, dry_run); - } - pmc = &pc_tree->horizontal_supertx; - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col, - subsize, dry_run); - } else { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run, - pc_tree->split[0]); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize, - dry_run, pc_tree->split[1]); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize, - dry_run, pc_tree->split[2]); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize); - 
update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, - subsize, dry_run, pc_tree->split[3]); - } - pmc = &pc_tree->split_supertx; - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col, - bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row, - mi_col + hbs, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs, - mi_col, subsize, dry_run); - pmc = &pc_tree->horizontala_supertx; - break; - case PARTITION_HORZ_B: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col, - subsize, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs, - mi_col, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs, - mi_col + hbs, bsize2, dry_run); - pmc = &pc_tree->horizontalb_supertx; - break; - case PARTITION_VERT_A: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col, - bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs, - mi_col, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row, - mi_col + hbs, subsize, dry_run); - pmc = &pc_tree->verticala_supertx; - break; 
- case PARTITION_VERT_B: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col, - subsize, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row, - mi_col + hbs, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs, - mi_col + hbs, bsize2, dry_run); - pmc = &pc_tree->verticalb_supertx; - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = + bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); + mbmi->segment_id = av1_vaq_segment_id(energy); + } + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. 
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } - for (i = 0; i < MAX_MB_PLANE; ++i) { - if (pmc != NULL) { - p[i].coeff = pmc->coeff[i]; - p[i].qcoeff = pmc->qcoeff[i]; - pd[i].dqcoeff = pmc->dqcoeff[i]; - p[i].eobs = pmc->eobs[i]; + if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd); + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { + av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, + best_rd); + } else { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd); } else { - // These should never be used - p[i].coeff = NULL; - p[i].qcoeff = NULL; - pd[i].dqcoeff = NULL; - p[i].eobs = NULL; + av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, + bsize, ctx, best_rd); } } -} -static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx, - int best_tx, TX_SIZE supertx_size) { - MACROBLOCK *const x = &td->mb; -#if CONFIG_VAR_TX - int i; - - for (i = 0; i < 1; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size); -#endif // CONFIG_VAR_TX - ctx->mic.mbmi.tx_size = supertx_size; - ctx->skip = x->skip; - ctx->mic.mbmi.tx_type = best_tx; -} + // Examine the resulting rate and for AQ mode 2 make a segment choice. 
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) && + (bsize >= BLOCK_16X16) && + (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + cpi->refresh_alt2_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } -static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int best_tx, TX_SIZE supertx_size, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif -#if CONFIG_EXT_PARTITION_TYPES - int i; -#endif + x->rdmult = orig_rdmult; - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. 
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; - switch (partition) { - case PARTITION_NONE: - update_supertx_param(td, &pc_tree->none, best_tx, supertx_size); - break; - case PARTITION_VERT: - update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) - update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size); - break; - case PARTITION_HORZ: - update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) - update_supertx_param(td, &pc_tree->horizontal[1], best_tx, - supertx_size); - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size); - } else { - update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx, - supertx_size, pc_tree->split[0]); - update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx, - supertx_size, pc_tree->split[1]); - update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx, - supertx_size, pc_tree->split[2]); - update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize, - best_tx, supertx_size, pc_tree->split[3]); - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->horizontala[i], best_tx, - supertx_size); - break; - case PARTITION_HORZ_B: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->horizontalb[i], best_tx, - supertx_size); - break; - case PARTITION_VERT_A: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size); - break; - case PARTITION_VERT_B: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size); - break; 
-#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } + ctx->rate = rd_cost->rate; + ctx->dist = rd_cost->dist; + ctx->rdcost = rd_cost->rdcost; } -#endif // CONFIG_SUPERTX -#if CONFIG_MOTION_VAR && NC_MODE_INFO -static void set_mode_info_b(const AV1_COMP *const cpi, - const TileInfo *const tile, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx) { - MACROBLOCK *const x = &td->mb; - set_offsets(cpi, tile, x, mi_row, mi_col, bsize); - update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1); -} +static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context, + uint8_t allow_update_cdf) { + (void)counts; -static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); - const int quarter_step = mi_size_wide[bsize] / 4; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][0]; #endif -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); + if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); + return; + } else { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][1]; #endif + if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - switch (partition) { - case PARTITION_NONE: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none); - break; - case PARTITION_VERT: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, 
subsize, - &pc_tree->vertical[0]); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, - &pc_tree->vertical[1]); - } - break; - case PARTITION_HORZ: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->horizontal[0]); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, - &pc_tree->horizontal[1]); - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - pc_tree->leaf_split[0]); - } else { - set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize, - pc_tree->split[0]); - set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize, - pc_tree->split[1]); - set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize, - pc_tree->split[2]); - set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3]); - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][0]; #endif - case PARTITION_HORZ_A: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, - &pc_tree->horizontala[0]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, - &pc_tree->horizontala[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, - &pc_tree->horizontala[2]); - break; - case PARTITION_HORZ_B: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->horizontalb[0]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, - &pc_tree->horizontalb[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, - &pc_tree->horizontalb[2]); - break; - case PARTITION_VERT_A: - 
set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, - &pc_tree->verticala[0]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, - &pc_tree->verticala[1]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, - &pc_tree->verticala[2]); - break; - case PARTITION_VERT_B: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->verticalb[0]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, - &pc_tree->verticalb[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, - &pc_tree->verticalb[2]); - break; - case PARTITION_HORZ_4: - for (int i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && this_mi_row >= cm->mi_rows) break; - - set_mode_info_b(cpi, tile, td, this_mi_row, mi_col, subsize, - &pc_tree->horizontal4[i]); - } - break; - case PARTITION_VERT_4: - for (int i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - - set_mode_info_b(cpi, tile, td, mi_row, this_mi_col, subsize, - &pc_tree->vertical4[i]); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0 && "Invalid partition type."); break; - } -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void av1_get_ncobmc_mode_rd(const AV1_COMP *const cpi, - MACROBLOCK *const x, MACROBLOCKD *const xd, - int bsize, const int mi_row, - const int mi_col, NCOBMC_MODE *mode) { - const AV1_COMMON *const cm = &cpi->common; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - - assert(bsize >= BLOCK_8X8); - - reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, - cm->mi_cols); - - // set up source buffers before calling the mode searching function - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); - - *mode = get_ncobmc_mode(cpi, x, xd, mi_row, mi_col, bsize); -} -static void get_ncobmc_intrpl_pred(const AV1_COMP *const cpi, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize) { - 
MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2); - const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize]; - - if (mi_width > mi_height) { - // horizontal partition - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - xd->mi += hbs; - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col + hbs, - &mbmi->ncobmc_mode[1]); - } else if (mi_height > mi_width) { - // vertical partition - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - xd->mi += hbs * xd->mi_stride; - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row + hbs, mi_col, - &mbmi->ncobmc_mode[1]); - } else { - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - } - // restore the info - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); -} -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - -void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { - uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; - const int widths[3] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int heights[3] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - int i; - - // Set current frame pointer. 
- x->e_mbd.cur_buf = src; - - for (i = 0; i < MAX_MB_PLANE; i++) - setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->mbmi.sb_type, buffers[i], - widths[i], heights[i], strides[i], mi_row, mi_col, NULL, - x->e_mbd.plane[i].subsampling_x, - x->e_mbd.plane[i].subsampling_y); -} - -static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, - int8_t segment_id) { - int segment_qindex; - const AV1_COMMON *const cm = &cpi->common; - av1_init_plane_quantizers(cpi, x, segment_id); - aom_clear_system_state(); - segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); - return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); -} - -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 -static void dist_8x8_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, - BLOCK_SIZE bsize, int bw, int bh, - int mi_row, int mi_col) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int dst_stride = pd->dst.stride; - uint8_t *dst = pd->dst.buf; - - assert(bsize < BLOCK_8X8); - - if (bsize < BLOCK_8X8) { - int i, j; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst8x8_16 = (uint16_t *)dst8x8; - uint16_t *dst_sub8x8 = &dst8x8_16[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; - - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - dst_sub8x8[j * 8 + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; + if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); + return; } else { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][1]; #endif - uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; - - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i]; -#if CONFIG_HIGHBITDEPTH - } + if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; #endif + if 
(allow_update_cdf) + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); + } } } -#endif -static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, - MACROBLOCK *const x, int mi_row, int mi_col, - RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *totalrate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { - const AV1_COMMON *const cm = &cpi->common; - TileInfo *const tile_info = &tile_data->tile_info; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; - const AQ_MODE aq_mode = cpi->oxcf.aq_mode; - int i, orig_rdmult; - - aom_clear_system_state(); +static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts, uint8_t allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->sb_type; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); -#if CONFIG_PVQ - x->pvq_speed = 1; - x->pvq_coded = 0; -#endif + (void)counts; - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - mbmi = &xd->mi[0]->mbmi; - mbmi->sb_type = bsize; -#if CONFIG_RD_DEBUG - mbmi->mi_row = mi_row; - mbmi->mi_col = mi_col; -#endif -#if CONFIG_SUPERTX - // We set tx_size here as skip blocks would otherwise not set it. - // tx_size needs to be set at this point as supertx_enable in - // write_modes_sb is computed based on this, and if the garbage in memory - // just happens to be the supertx_size, then the packer will code this - // block as a supertx block, even if rdopt did not pick it as such. 
- mbmi->tx_size = max_txsize_lookup[bsize]; -#endif -#if CONFIG_EXT_PARTITION_TYPES - mbmi->partition = partition; -#endif + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); - for (i = 0; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff[i]; - p[i].qcoeff = ctx->qcoeff[i]; - pd[i].dqcoeff = ctx->dqcoeff[i]; -#if CONFIG_PVQ - pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; #endif - p[i].eobs = ctx->eobs[i]; -#if CONFIG_LV_MAP - p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + if (allow_update_cdf) + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; #endif + if (allow_update_cdf) { + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } } - for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; -#if CONFIG_MRC_TX - xd->mrc_mask = ctx->mrc_mask; -#endif // CONFIG_MRC_TX - - ctx->skippable = 0; - - // Set to zero to make sure we do not use the previous encoded frame stats - mbmi->skip = 0; + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); -#if CONFIG_CB4X4 - x->skip_chroma_rd = - !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; #endif + if (allow_update_cdf) + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->source_variance = av1_high_get_sby_perpixel_variance( - cpi, &x->plane[0].src, bsize, xd->bd); - } else { - x->source_variance = - 
av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + if (allow_update_cdf) { + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } } -#else - x->source_variance = - av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); -#endif // CONFIG_HIGHBITDEPTH +} - // Save rdmult before it might be changed, so it can be restored later. - orig_rdmult = x->rdmult; +static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly, + const int mi_row, const int mi_col, + uint8_t allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->sb_type; - if (aq_mode == VARIANCE_AQ) { - if (cpi->vaq_refresh) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); - mbmi->segment_id = av1_vaq_segment_id(energy); - // Re-initialise quantiser - av1_init_plane_quantizers(cpi, x, mbmi->segment_id); - } - x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); - } else if (aq_mode == COMPLEXITY_AQ) { - x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); - } else if (aq_mode == CYCLIC_REFRESH_AQ) { - // If segment is boosted, use rdmult for that segment. 
- if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) - x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } else { +#if CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); } - // Find best coding mode & reconstruct the MB so it is available - // as a predictor for MBs that follow in the SB - if (frame_is_intra_only(cm)) { - av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); -#if CONFIG_SUPERTX - *totalrate_nocoef = 0; -#endif // CONFIG_SUPERTX - } else { - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, - rd_cost, bsize, ctx, best_rd); -#if CONFIG_SUPERTX - *totalrate_nocoef = rd_cost->rate; -#endif // CONFIG_SUPERTX - } else { - av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, -#if CONFIG_SUPERTX - totalrate_nocoef, -#endif // CONFIG_SUPERTX - bsize, ctx, best_rd); -#if CONFIG_SUPERTX - assert(*totalrate_nocoef >= 0); -#endif // CONFIG_SUPERTX + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + if 
(allow_update_cdf) { + update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, + 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } } } - - // Examine the resulting rate and for AQ mode 2 make a segment choice. - if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) && - (bsize >= BLOCK_16X16) && - (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { - av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + if (allow_update_cdf) { + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } } - x->rdmult = orig_rdmult; - - // TODO(jingning) The rate-distortion optimization flow needs to be - // refactored to provide proper exit/return handle. 
- if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; - - ctx->rate = rd_cost->rate; - ctx->dist = rd_cost->dist; -} - -static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode, - int16_t mode_context) { - int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; - if (mode == NEWMV) { - ++counts->newmv_mode[mode_ctx][0]; + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y)) return; - } else { - ++counts->newmv_mode[mode_ctx][1]; - if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) { - return; - } +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + } + if (uv_mode == UV_CFL_PRED) { + const int joint_sign = mbmi->cfl_alpha_signs; + const int idx = mbmi->cfl_alpha_idx; - mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - if (mode == ZEROMV) { - ++counts->zeromv_mode[mode_ctx][0]; - return; - } else { - ++counts->zeromv_mode[mode_ctx][1]; - mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + if (allow_update_cdf) + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; - if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; - if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; - if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + if (allow_update_cdf) + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = 
fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; - ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; +#endif + if (allow_update_cdf) + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + if (av1_is_directional_mode(get_uv_mode(uv_mode)) && + av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[uv_mode - UV_V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + if (allow_update_cdf) { + update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); } } + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) + update_palette_cdf(xd, mbmi, counts, allow_update_cdf); } -static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, - int mi_col -#if CONFIG_SUPERTX - , - int supertx_enabled -#endif - ) { +static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data, + ThreadData *td, int mi_row, int mi_col) { MACROBLOCK *x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - const MODE_INFO *const mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const BLOCK_SIZE bsize = mbmi->sb_type; FRAME_CONTEXT *fc = xd->tile_ctx; + const uint8_t allow_update_cdf = tile_data->allow_update_cdf; // delta quant applies to both intra and inter - int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); + const int super_block_upper_left = + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (cm->skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if 
CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; +#endif + if (allow_update_cdf) + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode) { + if (!seg_ref_active) { + const int skip_ctx = av1_get_skip_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip[skip_ctx][mbmi->skip]++; +#endif + if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2); + } + } - if (cm->delta_q_present_flag && (bsize != cm->sb_size || !mbmi->skip) && + if (cm->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && super_block_upper_left) { - const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; +#if CONFIG_ENTROPY_STATS + const int dq = + (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; const int absdq = abs(dq); - int i; - for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { td->counts->delta_q[i][1]++; } if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL +#endif + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { +#if CONFIG_ENTROPY_STATS const int delta_lf = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / - cm->delta_lf_res; + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; const int abs_delta_lf = abs(delta_lf); - for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf_multi[lf_id][i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; +#endif + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { +#if CONFIG_ENTROPY_STATS const int delta_lf = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; const int abs_delta_lf = abs(delta_lf); - for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf[i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf[abs_delta_lf][0]++; - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } - } -#else - if (cm->delta_lf_present_flag) { - const int dlf = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - const int absdlf = abs(dlf); - for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) { - td->counts->delta_lf[i][1]++; +#endif + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } - if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++; - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif } + + if (!is_inter_block(mbmi)) { + sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm), mi_row, mi_col, + tile_data->allow_update_cdf); + } + + if (av1_allow_intrabc(cm)) { + if 
(allow_update_cdf) + update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc_block(mbmi)]; +#endif // CONFIG_ENTROPY_STATS + } + if (!frame_is_intra_only(cm)) { - FRAME_COUNTS *const counts = td->counts; RD_COUNTS *rdc = &td->rd_counts; + + FRAME_COUNTS *const counts = td->counts; + + if (mbmi->skip_mode) { + rdc->skip_mode_used_flag = 1; + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + return; + } + const int inter_block = is_inter_block(mbmi); - const int seg_ref_active = - segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif - counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], - inter_block, 2); +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; #endif + if (allow_update_cdf) { + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + } // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. 
if (inter_block) { const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; -#if CONFIG_EXT_REFS const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; -#endif // CONFIG_EXT_REFS + + av1_collect_neighbors_ref_counts(xd); if (cm->reference_mode == REFERENCE_MODE_SELECT) { if (has_second_ref(mbmi)) // This flag is also updated for 4x4 blocks rdc->compound_ref_used_flag = 1; - else - // This flag is also updated for 4x4 blocks - rdc->single_ref_used_flag = 1; - if (is_comp_ref_allowed(mbmi->sb_type)) { - counts->comp_inter[av1_get_reference_mode_context(cm, xd)] + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] [has_second_ref(mbmi)]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(av1_get_reference_mode_cdf(cm, xd), has_second_ref(mbmi), - 2); -#endif // CONFIG_NEW_MULTISYMBOL +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), + 2); + } } } if (has_second_ref(mbmi)) { -#if CONFIG_EXT_COMP_REFS const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? 
UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; -#if !USE_UNI_COMP_REFS - // TODO(zoeliu): Temporarily turn off uni-directional comp refs - assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // !USE_UNI_COMP_REFS + if (allow_update_cdf) { + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); + } +#if CONFIG_ENTROPY_STATS counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = (ref0 == BWDREF_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0] [bit]++; +#endif // CONFIG_ENTROPY_STATS if (!bit) { const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] [bit1]++; +#endif // CONFIG_ENTROPY_STATS if (bit1) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)] [2][ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } } } else { -#endif // CONFIG_EXT_COMP_REFS -#if CONFIG_EXT_REFS const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); - - counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++; + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS if (!bit) { - counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1] - [ref0 == LAST_FRAME]++; + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), + ref0 == LAST2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + 
counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } else { - counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2] + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), + ref0 == GOLDEN_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } - - counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0] + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), + ref1 == ALTREF_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] [ref1 == ALTREF_FRAME]++; - if (ref1 != ALTREF_FRAME) - counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(cm, xd)] - [1][ref1 == ALTREF2_FRAME]++; -#else // !CONFIG_EXT_REFS - counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0] - [ref0 == GOLDEN_FRAME]++; -#endif // CONFIG_EXT_REFS -#if CONFIG_EXT_COMP_REFS +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } } -#endif // CONFIG_EXT_COMP_REFS } else { -#if CONFIG_EXT_REFS const int bit = (ref0 >= BWDREF_FRAME); - + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS if (bit) { assert(ref0 <= ALTREF_FRAME); + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), + ref0 == ALTREF_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] [ref0 == ALTREF_FRAME]++; - if (ref0 != ALTREF_FRAME) +#endif // CONFIG_ENTROPY_STATS 
+ if (ref0 != ALTREF_FRAME) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } } else { const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS counts ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS if (!bit1) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), + ref0 != LAST_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] [ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } else { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), + ref0 != LAST3_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } } -#else // !CONFIG_EXT_REFS - counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0] - [ref0 != LAST_FRAME]++; - if (ref0 != LAST_FRAME) { - counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] - [ref0 != GOLDEN_FRAME]++; - } -#endif // CONFIG_EXT_REFS } -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - counts->comp_inter_mode[av1_get_inter_mode_context(xd)] - [is_inter_singleref_comp_mode(mbmi->mode)]++; -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_INTERINTRA - if (cm->reference_mode != COMPOUND_REFERENCE && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif - cm->allow_interintra_compound && is_interintra_allowed(mbmi)) { + if (cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS 
counts->interintra[bsize_group][1]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->interintra_cdf[bsize_group], 1, 2); #endif + if (allow_update_cdf) + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; - update_cdf(fc->interintra_mode_cdf[bsize_group], - mbmi->interintra_mode, INTERINTRA_MODES); +#endif + if (allow_update_cdf) { + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + } if (is_interintra_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->wedge_interintra_cdf[bsize], - mbmi->use_wedge_interintra, 2); #endif + if (allow_update_cdf) { + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + } + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + if (allow_update_cdf) { + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interintra_wedge_index, 16); + } + } } } else { +#if CONFIG_ENTROPY_STATS counts->interintra[bsize_group][0]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->interintra_cdf[bsize_group], 0, 2); #endif + if (allow_update_cdf) + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); } } -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_WARPED_MOTION set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + const MOTION_MODE motion_allowed = + cm->switchable_motion_mode + ? 
motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; #endif - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - if (mbmi->ref_frame[1] != INTRA_FRAME) -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - { - if (motion_allowed == WARPED_CAUSAL) { - counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode, + if (allow_update_cdf) { + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, MOTION_MODES); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - } else if (motion_allowed == NCOBMC_ADAPT_WEIGHT) { - counts->ncobmc[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->ncobmc_cdf[mbmi->sb_type], mbmi->motion_mode, - OBMC_FAMILY_MODES); - } else if (motion_allowed == OBMC_CAUSAL) { - counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++; - update_cdf(fc->obmc_cdf[mbmi->sb_type], mbmi->motion_mode, 2); } -#else - } else if (motion_allowed == OBMC_CAUSAL) { - counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->obmc_cdf[mbmi->sb_type], - mbmi->motion_mode == OBMC_CAUSAL, 2); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; #endif + if (allow_update_cdf) { + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, + 2); } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - } -#else - if (motion_allowed > SIMPLE_TRANSLATION) { - counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode, - MOTION_MODES); 
- } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) { - ADAPT_OVERLAP_BLOCK ao_block = - adapt_overlap_block_lookup[mbmi->sb_type]; - ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]]; - update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[0], - MAX_NCOBMC_MODES); - if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { - ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]]; - update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[1], - MAX_NCOBMC_MODES); } } + + if (has_second_ref(mbmi)) { + assert(cm->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = + is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; #endif + if (allow_update_cdf) { + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - - if ( -#if CONFIG_COMPOUND_SINGLEREF - is_inter_anyref_comp_mode(mbmi->mode) -#else // !CONFIG_COMPOUND_SINGLEREF - cm->reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - && mbmi->motion_mode == SIMPLE_TRANSLATION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - ) { -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; #endif - counts - 
->compound_interinter[bsize][mbmi->interinter_compound_type]++; - update_cdf(fc->compound_type_cdf[bsize], - mbmi->interinter_compound_type, COMPOUND_TYPES); -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT + if (allow_update_cdf) { + update_cdf(fc->compound_index_cdf[comp_index_ctx], + mbmi->compound_idx, 2); + } + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1]; +#endif + if (allow_update_cdf) { + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1); + } + } } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; #endif -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + if (allow_update_cdf) { + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } } } } @@ -1903,37 +1267,33 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { int16_t mode_ctx; const PREDICTION_MODE mode = mbmi->mode; + + mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); if (has_second_ref(mbmi)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; +#if CONFIG_ENTROPY_STATS ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; - update_cdf(fc->inter_compound_mode_cdf[mode_ctx], - INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mode)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - ++counts->inter_singleref_comp_mode[mode_ctx] - [INTER_SINGLEREF_COMP_OFFSET(mode)]; -#endif // CONFIG_COMPOUND_SINGLEREF +#endif + if (allow_update_cdf) + 
update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); } else { - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); - update_inter_mode_stats(counts, mode, mode_ctx); + update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf); } int mode_allowed = (mbmi->mode == NEWMV); mode_allowed |= (mbmi->mode == NEW_NEWMV); -#if CONFIG_COMPOUND_SINGLEREF - mode_allowed |= (mbmi->mode == SR_NEW_NEWMV); -#endif // CONFIG_COMPOUND_SINGLEREF if (mode_allowed) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { +#if CONFIG_ENTROPY_STATS uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif if (mbmi->ref_mv_idx == idx) break; } @@ -1946,47 +1306,35 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, for (idx = 1; idx < 3; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { +#if CONFIG_ENTROPY_STATS uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif if (mbmi->ref_mv_idx == idx - 1) break; } } } } -#if CONFIG_INTRABC - } else { - if (av1_allow_intrabc(bsize, cm)) { - FRAME_COUNTS *const counts = td->counts; - ++counts->intrabc[mbmi->use_intrabc]; - } else { - assert(!mbmi->use_intrabc); - } -#endif } } typedef struct { - ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; - ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; PARTITION_CONTEXT sa[MAX_MIB_SIZE]; PARTITION_CONTEXT sl[MAX_MIB_SIZE]; -#if CONFIG_VAR_TX TXFM_CONTEXT *p_ta; TXFM_CONTEXT *p_tl; - TXFM_CONTEXT ta[2 * MAX_MIB_SIZE]; - TXFM_CONTEXT tl[2 * MAX_MIB_SIZE]; -#endif + TXFM_CONTEXT 
ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; } RD_SEARCH_MACROBLOCK_CONTEXT; static void restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, - int mi_col, -#if CONFIG_PVQ - od_rollback_buffer *rdo_buf, -#endif - BLOCK_SIZE bsize) { + int mi_col, BLOCK_SIZE bsize, + const int num_planes) { MACROBLOCKD *xd = &x->e_mbd; int p; const int num_4x4_blocks_wide = @@ -1995,11 +1343,9 @@ static void restore_context(MACROBLOCK *x, block_size_high[bsize] >> tx_size_high_log2[0]; int mi_width = mi_size_wide[bsize]; int mi_height = mi_size_high[bsize]; - for (p = 0; p < MAX_MB_PLANE; p++) { - int tx_col; - int tx_row; - tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), ctx->a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> @@ -2013,25 +1359,17 @@ static void restore_context(MACROBLOCK *x, sizeof(*xd->above_seg_context) * mi_width); memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl, sizeof(xd->left_seg_context[0]) * mi_height); -#if CONFIG_VAR_TX xd->above_txfm_context = ctx->p_ta; xd->left_txfm_context = ctx->p_tl; memcpy(xd->above_txfm_context, ctx->ta, - sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); + sizeof(*xd->above_txfm_context) * mi_width); memcpy(xd->left_txfm_context, ctx->tl, - sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); -#endif -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, rdo_buf); -#endif + sizeof(*xd->left_txfm_context) * mi_height); } static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, - int mi_row, int mi_col, -#if CONFIG_PVQ - od_rollback_buffer *rdo_buf, -#endif - BLOCK_SIZE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { 
const MACROBLOCKD *xd = &x->e_mbd; int p; const int num_4x4_blocks_wide = @@ -2042,11 +1380,9 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_height = mi_size_high[bsize]; // buffer the above/left context information of the block in search. - for (p = 0; p < MAX_MB_PLANE; ++p) { - int tx_col; - int tx_row; - tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; memcpy(ctx->a + num_4x4_blocks_wide * p, xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> @@ -2060,386 +1396,165 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, sizeof(*xd->above_seg_context) * mi_width); memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK), sizeof(xd->left_seg_context[0]) * mi_height); -#if CONFIG_VAR_TX memcpy(ctx->ta, xd->above_txfm_context, - sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); + sizeof(*xd->above_txfm_context) * mi_width); memcpy(ctx->tl, xd->left_txfm_context, - sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); + sizeof(*xd->left_txfm_context) * mi_height); ctx->p_ta = xd->above_txfm_context; ctx->p_tl = xd->left_txfm_context; -#endif -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, rdo_buf); -#endif } -static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile, +static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif - PICK_MODE_CONTEXT *ctx, int *rate) { + PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx, + int *rate) { + TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; -#if 
(CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q | \ - CONFIG_NCOBMC_ADAPT_WEIGHT MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi; -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC - int check_ncobmc; -#endif -#endif set_offsets(cpi, tile, x, mi_row, mi_col, bsize); -#if CONFIG_EXT_PARTITION_TYPES - x->e_mbd.mi[0]->mbmi.partition = partition; -#endif - update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); -#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - mbmi = &xd->mi[0]->mbmi; -#if CONFIG_WARPED_MOTION - set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); -#endif -#endif - -#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); -#endif // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC - check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL; - if (!dry_run && check_ncobmc) { - av1_check_ncobmc_rd(cpi, x, mi_row, mi_col); - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - } -#endif - -#if CONFIG_LV_MAP - av1_set_coeff_buffer(cpi, x, mi_row, mi_col); -#endif + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (dry_run == OUTPUT_ENABLED && !frame_is_intra_only(&cpi->common)) { - if (motion_allowed >= NCOBMC_ADAPT_WEIGHT && is_inter_block(mbmi)) { - get_ncobmc_intrpl_pred(cpi, td, mi_row, mi_col, bsize); - av1_check_ncobmc_adapt_weight_rd(cpi, x, mi_row, mi_col); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT + if (!dry_run) av1_set_coeff_buffer(cpi, x, mi_row, 
mi_col); - encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, rate); + encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize, + rate); -#if CONFIG_LV_MAP if (dry_run == 0) x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; -#endif if (!dry_run) { -#if CONFIG_EXT_DELTA_Q - mbmi = &xd->mi[0]->mbmi; - if (bsize == cpi->common.sb_size && mbmi->skip == 1 && + if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 && cpi->common.delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - mbmi->curr_delta_lf[lf_id] = xd->prev_delta_lf[lf_id]; -#endif // CONFIG_LOOPFILTER_LEVEL - mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base; + const int frame_lf_count = av1_num_planes(&cpi->common) > 1 + ? FRAME_LF_COUNT + : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; } -#endif -#if CONFIG_SUPERTX - update_stats(&cpi->common, td, mi_row, mi_col, 0); -#else - update_stats(&cpi->common, td, mi_row, mi_col); -#endif + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + update_stats(&cpi->common, tile_data, td, mi_row, mi_col); } } static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, + TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB - const int qbs = mi_size_wide[bsize] / 4; -#endif const int is_partition_root = bsize >= BLOCK_8X8; const int ctx = 
is_partition_root - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) + ? partition_plane_context(xd, mi_row, mi_col, bsize) : -1; const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); int quarter_step = mi_size_wide[bsize] / 4; int i; -#if !CONFIG_EXT_PARTITION_TYPES_AB - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif -#endif - -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); -#endif + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++; - -#if CONFIG_SUPERTX - if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE && - partition != PARTITION_NONE && !xd->lossless[0]) { - int supertx_enabled; - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree); - if (supertx_enabled) { - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int x_idx, y_idx, i; - uint8_t *dst_buf[3]; - int dst_stride[3]; - set_skip_context(xd, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run, - pc_tree); - - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - for (i = 0; i < MAX_MB_PLANE; i++) { - dst_buf[i] = xd->plane[i].dst.buf; - dst_stride[i] = xd->plane[i].dst.stride; - } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run, - bsize, bsize, dst_buf, dst_stride, pc_tree); - - set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - 
set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); - - if (!x->skip) { - int this_rate = 0; - av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize); - av1_tokenize_sb_supertx(cpi, td, tp, dry_run, mi_row, mi_col, bsize, - rate); - if (rate) *rate += this_rate; - } else { - xd->mi[0]->mbmi.skip = 1; - if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - } - if (!dry_run) { - for (y_idx = 0; y_idx < mi_height; y_idx++) - for (x_idx = 0; x_idx < mi_width; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > - x_idx && - (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > - y_idx) { - xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip = - xd->mi[0]->mbmi.skip; - } - } - td->counts->supertx[partition_supertx_context_lookup[partition]] - [supertx_size][1]++; - td->counts->supertx_size[supertx_size]++; + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { #if CONFIG_ENTROPY_STATS -#if CONFIG_EXT_TX - if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > - 1 && - !xd->mi[0]->mbmi.skip) { - const int eset = - get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); - if (eset > 0) { - ++td->counts - ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type]; - } - } -#else - if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) { - ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type]; - } -#endif // CONFIG_EXT_TX -#endif // CONFIG_ENTROPY_STATS - } -#if CONFIG_EXT_PARTITION_TYPES - update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, - partition); -#else - if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); + td->counts->partition[ctx][partition]++; #endif -#if CONFIG_VAR_TX - set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip, - xd); 
-#endif // CONFIG_VAR_TX - return; - } else { - if (!dry_run) { - td->counts->supertx[partition_supertx_context_lookup[partition]] - [supertx_size][0]++; + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); } } } -#endif // CONFIG_SUPERTX switch (partition) { case PARTITION_NONE: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->none, rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->none, rate); break; case PARTITION_VERT: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->vertical[0], rate); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->vertical[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->vertical[0], rate); + if (mi_col + hbs < cm->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->vertical[1], rate); } break; case PARTITION_HORZ: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->horizontal[0], rate); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->horizontal[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[0], rate); + if (mi_row + hbs < cm->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[1], rate); } break; case 
PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - pc_tree->leaf_split[0], rate); - } else { - encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize, - pc_tree->split[0], rate); - encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize, - pc_tree->split[1], rate); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize, - pc_tree->split[2], rate); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run, - subsize, pc_tree->split[3], rate); - } - break; - -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontala[0], rate); - encode_b(cpi, tile, td, tp, mi_row + qbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontala[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, - partition, &pc_tree->horizontala[2], rate); - break; - case PARTITION_HORZ_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->horizontalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontalb[1], rate); - if (mi_row + 3 * qbs < cm->mi_rows) - encode_b(cpi, tile, td, tp, mi_row + 3 * qbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontalb[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, 
pc_tree->split[3], rate); break; - case PARTITION_VERT_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticala[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + qbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticala[1], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, - partition, &pc_tree->verticala[2], rate); - break; - case PARTITION_VERT_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->verticalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticalb[1], rate); - if (mi_col + 3 * qbs < cm->mi_cols) - encode_b(cpi, tile, td, tp, mi_row, mi_col + 3 * qbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticalb[2], rate); - break; -#else case PARTITION_HORZ_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, - &pc_tree->horizontala[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, &pc_tree->horizontala[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, partition, &pc_tree->horizontala[2], rate); break; case PARTITION_HORZ_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->horizontalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, 
&pc_tree->horizontalb[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, - partition, &pc_tree->horizontalb[2], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->horizontalb[2], rate); break; case PARTITION_VERT_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, - &pc_tree->verticala[0], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, &pc_tree->verticala[1], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, partition, &pc_tree->verticala[2], rate); break; case PARTITION_VERT_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->verticalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, &pc_tree->verticalb[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, - partition, &pc_tree->verticalb[2], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->verticalb[2], rate); break; -#endif case PARTITION_HORZ_4: for (i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= cm->mi_rows) break; - encode_b(cpi, tile, td, tp, this_mi_row, mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, partition, &pc_tree->horizontal4[i], rate); } break; @@ -2448,20 +1563,14 @@ static void encode_sb(const AV1_COMP 
*const cpi, ThreadData *td, int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= cm->mi_cols) break; - encode_b(cpi, tile, td, tp, mi_row, this_mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, partition, &pc_tree->vertical4[i], rate); } break; -#endif // CONFIG_EXT_PARTITION_TYPES default: assert(0 && "Invalid partition type."); break; } -#if CONFIG_EXT_PARTITION_TYPES update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES } // Check to see if the given partition size is allowed for a specified number @@ -2483,19 +1592,19 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, return bsize; } -static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi, - int bh_in, int bw_in, +static void set_partial_sb_partition(const AV1_COMMON *const cm, + MB_MODE_INFO *mi, int bh_in, int bw_in, int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize, - MODE_INFO **mib) { + MB_MODE_INFO **mib) { int bh = bh_in; int r, c; - for (r = 0; r < cm->mib_size; r += bh) { + for (r = 0; r < cm->seq_params.mib_size; r += bh) { int bw = bw_in; - for (c = 0; c < cm->mib_size; c += bw) { + for (c = 0; c < cm->seq_params.mib_size; c += bw) { const int index = r * cm->mi_stride + c; mib[index] = mi + index; - mib[index]->mbmi.sb_type = find_partition_size( + mib[index]->sb_type = find_partition_size( bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); } } @@ -2507,26 +1616,27 @@ static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi, // may not be allowed in which case this code attempts to choose the largest // allowable partition. 
static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mib, int mi_row, int mi_col, + MB_MODE_INFO **mib, int mi_row, int mi_col, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; const int mi_rows_remaining = tile->mi_row_end - mi_row; const int mi_cols_remaining = tile->mi_col_end - mi_col; int block_row, block_col; - MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col; + MB_MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col; int bh = mi_size_high[bsize]; int bw = mi_size_wide[bsize]; assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); // Apply the requested partition size to the SB if it is all "in image" - if ((mi_cols_remaining >= cm->mib_size) && - (mi_rows_remaining >= cm->mib_size)) { - for (block_row = 0; block_row < cm->mib_size; block_row += bh) { - for (block_col = 0; block_col < cm->mib_size; block_col += bw) { + if ((mi_cols_remaining >= cm->seq_params.mib_size) && + (mi_rows_remaining >= cm->seq_params.mib_size)) { + for (block_row = 0; block_row < cm->seq_params.mib_size; block_row += bh) { + for (block_col = 0; block_col < cm->seq_params.mib_size; + block_col += bw) { int index = block_row * cm->mi_stride + block_col; mib[index] = mi_upper_left + index; - mib[index]->mbmi.sb_type = bsize; + mib[index]->sb_type = bsize; } } } else { @@ -2537,14 +1647,12 @@ static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, } static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, MODE_INFO **mib, + TileDataEnc *tile_data, MB_MODE_INFO **mib, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, -#if CONFIG_SUPERTX - int *rate_nocoef, -#endif int do_recon, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2552,37 
+1660,23 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, const int hbs = bs / 2; int i; const int pl = (bsize >= BLOCK_8X8) - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) + ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; const PARTITION_TYPE partition = (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) : PARTITION_NONE; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_STATS last_part_rdc, none_rdc, chosen_rdc; BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 0; - BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type; + BLOCK_SIZE bs_type = mib[0]->sb_type; int do_partition_search = 1; PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; - const int unify_bsize = CONFIG_CB4X4; -#if CONFIG_SUPERTX - int last_part_rate_nocoef = INT_MAX; - int none_rate_nocoef = INT_MAX; - int chosen_rate_nocoef = INT_MAX; -#endif -#if CONFIG_PVQ - od_rollback_buffer pre_rdo_buf; -#endif + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - assert(num_4x4_blocks_wide_lookup[bsize] == - num_4x4_blocks_high_lookup[bsize]); + assert(mi_size_wide[bsize] == mi_size_high[bsize]); av1_invalid_rd_stats(&last_part_rdc); av1_invalid_rd_stats(&none_rdc); @@ -2590,17 +1684,10 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context 
= + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); @@ -2612,12 +1699,12 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { - sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT); splits_below = 1; for (i = 0; i < 4; i++) { int jj = i >> 1, ii = i & 0x01; - MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs]; - if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) { + MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs]; + if (this_mi && this_mi->sb_type >= sub_subsize) { splits_below = 0; } } @@ -2629,28 +1716,15 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, -#if CONFIG_SUPERTX - &none_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, INT64_MAX); + PARTITION_NONE, bsize, ctx_none, INT64_MAX); if (none_rdc.rate < INT_MAX) { none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); -#if CONFIG_SUPERTX - none_rate_nocoef += x->partition_cost[pl][PARTITION_NONE]; -#endif } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif - mib[0]->mbmi.sb_type = bs_type; + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->sb_type = bs_type; pc_tree->partitioning = partition; } } @@ -2658,127 +1732,65 @@ static void rd_use_partition(AV1_COMP *cpi, 
ThreadData *td, switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, INT64_MAX); + PARTITION_NONE, bsize, ctx_none, INT64_MAX); break; case PARTITION_HORZ: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[0], INT64_MAX); + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) { RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; av1_init_rd_stats(&tmp_rdc); - update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], INT64_MAX); + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; case PARTITION_VERT: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - 
PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[0], INT64_MAX); + PARTITION_VERT, subsize, &pc_tree->vertical[0], + INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) { RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0]; av1_init_rd_stats(&tmp_rdc); - update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[bsize > BLOCK_8X8], - INT64_MAX); + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - subsize, pc_tree->leaf_split[0], INT64_MAX); - break; - } last_part_rdc.rate = 0; last_part_rdc.dist = 0; last_part_rdc.rdcost = 0; -#if CONFIG_SUPERTX - last_part_rate_nocoef = 0; -#endif for (i = 0; i < 4; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; int jj = i >> 1, ii = i & 0x01; RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef; -#endif if ((mi_row + y_idx >= cm->mi_rows) 
|| (mi_col + x_idx >= cm->mi_cols)) continue; @@ -2786,33 +1798,21 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, - &tmp_rdc.dist, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif - i != 3, pc_tree->split[i]); + &tmp_rdc.dist, i != 3, pc_tree->split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; -#if CONFIG_EXT_PARTITION_TYPES case PARTITION_VERT_A: case PARTITION_VERT_B: case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types"); -#endif // CONFIG_EXT_PARTITION_TYPES default: assert(0); break; } @@ -2820,9 +1820,6 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, last_part_rdc.rate += x->partition_cost[pl][partition]; last_part_rdc.rdcost = RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); -#if CONFIG_SUPERTX - last_part_rate_nocoef += x->partition_cost[pl][partition]; -#endif } if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame && @@ -2830,17 +1827,11 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) && (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) { - BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); - chosen_rdc.rate = 0; - chosen_rdc.dist = 0; -#if CONFIG_SUPERTX - chosen_rate_nocoef = 0; -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + BLOCK_SIZE 
split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->partitioning = PARTITION_SPLIT; // Split partition. @@ -2848,175 +1839,108 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif -#if CONFIG_PVQ - od_rollback_buffer buf; -#endif + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); -#endif + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - split_subsize, &pc_tree->split[i]->none, INT64_MAX); + &tmp_rdc, PARTITION_SPLIT, split_subsize, + &pc_tree->split[i]->none, INT64_MAX); -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&chosen_rdc); -#if CONFIG_SUPERTX - chosen_rate_nocoef = INT_MAX; -#endif break; } chosen_rdc.rate += tmp_rdc.rate; chosen_rdc.dist += tmp_rdc.dist; -#if CONFIG_SUPERTX - chosen_rate_nocoef += rt_nocoef; -#endif if (i != 3) - encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, + encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; -#if CONFIG_SUPERTX - chosen_rate_nocoef += x->partition_cost[pl][PARTITION_SPLIT]; 
-#endif } if (chosen_rdc.rate < INT_MAX) { chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); -#if CONFIG_SUPERTX - chosen_rate_nocoef += x->partition_cost[pl][PARTITION_NONE]; -#endif } } // If last_part is better set the partitioning to that. if (last_part_rdc.rdcost < chosen_rdc.rdcost) { - mib[0]->mbmi.sb_type = bsize; + mib[0]->sb_type = bsize; if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; chosen_rdc = last_part_rdc; -#if CONFIG_SUPERTX - chosen_rate_nocoef = last_part_rate_nocoef; -#endif } // If none was better set the partitioning to that. if (none_rdc.rdcost < chosen_rdc.rdcost) { if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; chosen_rdc = none_rdc; -#if CONFIG_SUPERTX - chosen_rate_nocoef = none_rate_nocoef; -#endif } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. 
- if (bsize == cm->sb_size) + if (bsize == cm->seq_params.sb_size) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - if (bsize == cm->sb_size) { + if (bsize == cm->seq_params.sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; - // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, // bsize, pc_tree, &rate_coeffs); - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + x->cb_offset = 0; + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } *rate = chosen_rdc.rate; *dist = chosen_rdc.dist; -#if CONFIG_SUPERTX - *rate_nocoef = chosen_rate_nocoef; -#endif } /* clang-format off */ static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 -#endif BLOCK_4X4, // 4x4 BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8 BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16 BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_16X16, BLOCK_16X16 // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2 -#endif BLOCK_8X8, // 4x4 BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8 
BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16 BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32 BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_LARGEST, BLOCK_LARGEST // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; // Next square block size less or equal than current block size. static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 -#endif BLOCK_4X4, // 4x4 BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8 BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16 BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_32X32, BLOCK_32X32 // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; /* clang-format on */ @@ -3029,17 +1953,17 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { // function so repeat calls can accumulate a min and max of more than one // superblock. static void get_sb_partition_size_range(const AV1_COMMON *const cm, - MACROBLOCKD *xd, MODE_INFO **mib, + MACROBLOCKD *xd, MB_MODE_INFO **mib, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { int i, j; int index = 0; // Check the sb_type for each block that belongs to this region. 
- for (i = 0; i < cm->mib_size; ++i) { - for (j = 0; j < cm->mib_size; ++j) { - MODE_INFO *mi = mib[index + j]; - BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4; + for (i = 0; i < cm->seq_params.mib_size; ++i) { + for (j = 0; j < cm->seq_params.mib_size; ++j) { + MB_MODE_INFO *mi = mib[index + j]; + BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4; *min_block_size = AOMMIN(*min_block_size, sb_type); *max_block_size = AOMMAX(*max_block_size, sb_type); } @@ -3047,6 +1971,68 @@ static void get_sb_partition_size_range(const AV1_COMMON *const cm, } } +// Checks to see if a super block is on a horizontal image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { + int top_edge = 0; + int bottom_edge = cpi->common.mi_rows; + int is_active_h_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + + bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + bottom_edge = AOMMAX(top_edge, bottom_edge); + } + + if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || + ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { + is_active_h_edge = 1; + } + return is_active_h_edge; +} + +// Checks to see if a super block is on a vertical image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { + int left_edge = 0; + int right_edge = cpi->common.mi_cols; + int is_active_v_edge = 0; + + // For two pass account for any formatting bars detected. 
+ if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + + right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +// Checks to see if a super block is at the edge of the active image. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { + return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) || + active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size); +} + // Look at neighboring blocks and set a min and max partition size based on // what they chose. static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, @@ -3054,7 +2040,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, int mi_col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { AV1_COMMON *const cm = &cpi->common; - MODE_INFO **mi = xd->mi; + MB_MODE_INFO **mi = xd->mi; const int left_in_image = xd->left_available && mi[-1]; const int above_in_image = xd->up_available && mi[-xd->mi_stride]; const int mi_rows_remaining = tile->mi_row_end - mi_row; @@ -3073,18 +2059,19 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, // passed in values for min and max as a starting point. 
// Find the min and max partition used in previous frame at this location if (cm->frame_type != KEY_FRAME) { - MODE_INFO **prev_mi = + MB_MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the left superblock if (left_in_image) { - MODE_INFO **left_sb_mi = &mi[-cm->mib_size]; + MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size]; get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the above suprblock. if (above_in_image) { - MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size]; + MB_MODE_INFO **above_sb_mi = + &mi[-xd->mi_stride * cm->seq_params.mib_size]; get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size); } @@ -3103,7 +2090,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, // Test for blocks at the edge of the active image. // This may be the actual edge of the image or where there are formatting // bars. 
- if (av1_active_edge_sb(cpi, mi_row, mi_col)) { + if (active_edge_sb(cpi, mi_row, mi_col)) { min_size = BLOCK_4X4; } else { min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size); @@ -3116,8 +2103,8 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, min_size = AOMMIN(min_size, next_square_size[max_size]); } - *min_block_size = AOMMIN(min_size, cm->sb_size); - *max_block_size = AOMMIN(max_size, cm->sb_size); + *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size); + *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size); } // TODO(jingning) refactor functions setting partition search range @@ -3131,15 +2118,15 @@ static void set_partition_range(const AV1_COMMON *const cm, int idx, idy; const int idx_str = cm->mi_stride * mi_row + mi_col; - MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; - BLOCK_SIZE min_size = cm->sb_size; // default values + MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; + BLOCK_SIZE min_size = cm->seq_params.sb_size; // default values BLOCK_SIZE max_size = BLOCK_4X4; if (prev_mi) { for (idy = 0; idy < mi_height; ++idy) { for (idx = 0; idx < mi_width; ++idx) { - const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; + const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3148,8 +2135,8 @@ static void set_partition_range(const AV1_COMMON *const cm, if (xd->left_available) { for (idy = 0; idy < mi_height; ++idy) { - const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; + const BLOCK_SIZE bs = mi ? 
mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3157,8 +2144,8 @@ static void set_partition_range(const AV1_COMMON *const cm, if (xd->up_available) { for (idx = 0; idx < mi_width; ++idx) { - const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; + const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3169,8 +2156,8 @@ static void set_partition_range(const AV1_COMMON *const cm, max_size = max_partition_size[max_size]; } - *min_bs = AOMMIN(min_size, cm->sb_size); - *max_bs = AOMMIN(max_size, cm->sb_size); + *min_bs = AOMMIN(min_size, cm->seq_params.sb_size); + *max_bs = AOMMIN(max_size, cm->seq_params.sb_size); } static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { @@ -3184,24 +2171,18 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { #if CONFIG_FP_MB_STATS const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 130, 130, 150 -#endif // CONFIG_EXT_PARTITION }; const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 160, 160, 240 -#endif // CONFIG_EXT_PARTITION }; const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 
8, 8, 10 -#endif // CONFIG_EXT_PARTITION }; typedef enum { @@ -3227,260 +2208,673 @@ static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { } } -static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, - MOTION_DIRECTION that_mv) { - if (this_mv == that_mv) { - return 0; - } else { - return abs(this_mv - that_mv) == 2 ? 2 : 1; +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + +// Try searching for an encoding for the given subblock. Returns zero if the +// rdcost is already too high (to tell the caller not to bother searching for +// encodings of further subblocks) +static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int is_first, int is_last, int mi_row, int mi_col, + BLOCK_SIZE subsize, RD_STATS *best_rdc, + RD_STATS *sum_rdc, RD_STATS *this_rdc, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *prev_ctx, + PICK_MODE_CONTEXT *this_ctx) { +#define RTS_X_RATE_NOCOEF_ARG +#define RTS_MAX_RDCOST best_rdc->rdcost + + MACROBLOCK *const x = &td->mb; + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); + + // On the first time around, write the rd stats straight to sum_rdc. Also, we + // should treat sum_rdc as containing zeros (even if it doesn't) to avoid + // having to zero it at the start. + if (is_first) this_rdc = sum_rdc; + const int64_t spent_rdcost = is_first ? 
0 : sum_rdc->rdcost; + const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost; + + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, + RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, + rdcost_remaining); + + if (!is_first) { + if (this_rdc->rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc->rate; + sum_rdc->dist += this_rdc->dist; + sum_rdc->rdcost += this_rdc->rdcost; + } + } + + if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0; + + if (!is_last) { + update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); + } + + return 1; + +#undef RTS_X_RATE_NOCOEF_ARG +#undef RTS_MAX_RDCOST +} + +static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + PICK_MODE_CONTEXT ctxs[3], + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, PARTITION_TYPE partition, + int mi_row0, int mi_col0, BLOCK_SIZE subsize0, + int mi_row1, int mi_col1, BLOCK_SIZE subsize1, + int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS sum_rdc, this_rdc; +#define RTP_STX_TRY_ARGS + + if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0, + best_rdc, &sum_rdc, &this_rdc, + RTP_STX_TRY_ARGS partition, ctx, &ctxs[0])) + return; + + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1, + best_rdc, &sum_rdc, &this_rdc, + RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1])) + return; + + // With the new layout of mixed partitions for PARTITION_HORZ_B and + // PARTITION_VERT_B, the last subblock might start past halfway through the + // main block, so we might signal it even though the subblock lies strictly + // outside the image. 
In that case, we won't spend any bits coding it and the + // difference (obviously) doesn't contribute to the error. + const int try_block2 = 1; + if (try_block2 && + !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2, + best_rdc, &sum_rdc, &this_rdc, + RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2])) + return; + + if (sum_rdc.rdcost >= best_rdc->rdcost) return; + + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rdc.rate += x->partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + + if (sum_rdc.rdcost >= best_rdc->rdcost) return; + + *best_rdc = sum_rdc; + pc_tree->partitioning = partition; + +#undef RTP_STX_TRY_ARGS +} + +#if CONFIG_DIST_8X8 +static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, + uint8_t *src_plane_8x8[MAX_MB_PLANE], + uint8_t *dst_plane_8x8[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int64_t dist_8x8, dist_8x8_uv, total_dist; + const int src_stride = x->plane[0].src.stride; + int plane; + + const int dst_stride = xd->plane[0].dst.stride; + dist_8x8 = + av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0], + dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex) + << 4; + + // Compute chroma distortion for a luma 8x8 block + dist_8x8_uv = 0; + + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + unsigned sse; + const int src_stride_uv = x->plane[plane].src.stride; + const int dst_stride_uv = xd->plane[plane].dst.stride; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy); + + cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv, + dst_plane_8x8[plane], dst_stride_uv, &sse); + dist_8x8_uv += (int64_t)sse << 4; + } + } + + return total_dist = dist_8x8 + dist_8x8_uv; +} +#endif // 
CONFIG_DIST_8X8 + +static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { + pc_tree->partitioning = PARTITION_NONE; + pc_tree->cb_search_range = SEARCH_FULL_PLANE; + + if (bsize >= BLOCK_8X8) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int idx = 0; idx < 4; ++idx) + reset_partition(pc_tree->split[idx], subsize); + } +} + +static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int64_t best_rd, + PC_TREE *pc_tree, int64_t *none_rd) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = mi_size_wide[bsize] / 2; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TOKENEXTRA *const tp_orig = *tp; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; + int tmp_partition_cost[PARTITION_TYPES]; + BLOCK_SIZE subsize; + RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc; + const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + int do_square_split = bsize_at_least_8x8; + const int pl = bsize_at_least_8x8 + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const int *partition_cost = + pl >= 0 ? 
x->partition_cost[pl] : x->partition_cost[0]; + const int num_planes = av1_num_planes(cm); + + int64_t split_rd[4] = { 0, 0, 0, 0 }; + + // Override skipping rectangular partition operations for edge blocks + const int has_rows = (mi_row + mi_step < cm->mi_rows); + const int has_cols = (mi_col + mi_step < cm->mi_cols); + + if (none_rd) *none_rd = 0; + + int partition_none_allowed = has_rows && has_cols; + + (void)*tp_orig; + (void)split_rd; + + av1_zero(pc_tree->pc_tree_stats); + pc_tree->pc_tree_stats.valid = 1; + + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split + tmp_partition_cost[PARTITION_SPLIT] = 0; + } + + partition_cost = tmp_partition_cost; + } + +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from encoding the previous block. Setting it to magic number + // when debugging. 
+ memset(x->blk_skip, 234, sizeof(x->blk_skip)); +#endif // NDEBUG + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + av1_init_rd_stats(&this_rdc); + av1_init_rd_stats(&sum_rdc); + av1_invalid_rd_stats(&best_rdc); + best_rdc.rdcost = best_rd; + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) + x->mb_energy = av1_block_energy(cpi, x, bsize); + + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // PARTITION_NONE + if (partition_none_allowed) { + if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; + + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, + PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + + pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost; + pc_tree->pc_tree_stats.skip = ctx_none->skip; + + if (none_rd) *none_rd = this_rdc.rdcost; + if (this_rdc.rate != INT_MAX) { + if (bsize_at_least_8x8) { + const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + this_rdc.rate += pt_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + } + + if (this_rdc.rdcost < best_rdc.rdcost) { + // Adjust dist breakout threshold according to the partition size. 
+ const int64_t dist_breakout_thr = + cpi->sf.partition_search_breakout_dist_thr >> + ((2 * (MAX_SB_SIZE_LOG2 - 2)) - + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); + const int rate_breakout_thr = + cpi->sf.partition_search_breakout_rate_thr * + num_pels_log2_lookup[bsize]; + + best_rdc = this_rdc; + if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; + + pc_tree->cb_search_range = SEARCH_FULL_PLANE; + + // If all y, u, v transform blocks in this partition are skippable, and + // the dist & rate are within the thresholds, the partition search is + // terminated for current branch of the partition search tree. + // The dist & rate thresholds are set to 0 at speed 0 to disable the + // early termination at that speed. + if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && + (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_square_split = 0; + } + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // store estimated motion vector + if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); + + int64_t temp_best_rdcost = best_rdc.rdcost; + pn_rdc = best_rdc; + +#if CONFIG_DIST_8X8 + uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; + + if (x->using_dist_8x8 && bsize == BLOCK_8X8) { + for (int i = 0; i < MAX_MB_PLANE; i++) { + src_plane_8x8[i] = x->plane[i].src.buf; + dst_plane_8x8[i] = xd->plane[i].dst.buf; + } } -} -#endif +#endif // CONFIG_DIST_8X8 -#if CONFIG_EXT_PARTITION_TYPES -// Try searching for an encoding for the given subblock. 
Returns zero if the -// rdcost is already too high (to tell the caller not to bother searching for -// encodings of further subblocks) -static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int is_first, int is_last, int mi_row, int mi_col, - BLOCK_SIZE subsize, RD_STATS *best_rdc, - RD_STATS *sum_rdc, RD_STATS *this_rdc, -#if CONFIG_SUPERTX - int64_t best_rd, int *sum_rate_nocoef, - int *this_rate_nocoef, int *abort_flag, -#endif - PARTITION_TYPE partition, - PICK_MODE_CONTEXT *prev_ctx, - PICK_MODE_CONTEXT *this_ctx) { -#if CONFIG_SUPERTX -#define RTS_X_RATE_NOCOEF_ARG ((is_first) ? sum_rate_nocoef : this_rate_nocoef), -#define RTS_MAX_RDCOST INT64_MAX -#else -#define RTS_X_RATE_NOCOEF_ARG -#define RTS_MAX_RDCOST best_rdc->rdcost -#endif + // PARTITION_SPLIT + if (do_square_split) { + int reached_last_index = 0; + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int idx; - MACROBLOCK *const x = &td->mb; + for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; - // On the first time around, write the rd stats straight to sum_rdc. Also, we - // should treat sum_rdc as containing zeros (even if it doesn't) to avoid - // having to zero it at the start. - if (is_first) this_rdc = sum_rdc; - const int64_t spent_rdcost = is_first ? 
0 : sum_rdc->rdcost; - const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost; + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, - RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, - rdcost_remaining); + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, + temp_best_rdcost - sum_rdc.rdcost, + pc_tree->split[idx], p_split_rd); -#if CONFIG_SUPERTX - if (is_first) *abort_flag = sum_rdc->rdcost >= best_rd; -#endif + pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost; + pc_tree->pc_tree_stats.sub_block_skip[idx] = + pc_tree->split[idx]->none.skip; - if (!is_first) { - if (this_rdc->rate == INT_MAX) { - sum_rdc->rdcost = INT64_MAX; -#if CONFIG_SUPERTX - *sum_rate_nocoef = INT_MAX; -#endif - } else { - sum_rdc->rate += this_rdc->rate; - sum_rdc->dist += this_rdc->dist; - sum_rdc->rdcost += this_rdc->rdcost; -#if CONFIG_SUPERTX - *sum_rate_nocoef += *this_rate_nocoef; -#endif + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } } - } + reached_last_index = (idx == 4); - if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && reached_last_index && + sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DIST_8X8 - if (!is_last) { - update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); - } + if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = 
RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - return 1; + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_SPLIT; + } + } -#undef RTS_X_RATE_NOCOEF_ARG -#undef RTS_MAX_RDCOST -} + int has_split = 0; + if (pc_tree->partitioning == PARTITION_SPLIT) { + for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) { + if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT) + ++has_split; + } -static void rd_test_partition3( - const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, - TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, - PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, PARTITION_TYPE partition, -#if CONFIG_SUPERTX - int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, -#endif - int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1, - BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - RD_STATS sum_rdc, this_rdc; -#if CONFIG_UNPOISON_PARTITION_CTX - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const int has_rows = mi_row + hbs < cm->mi_rows; - const int has_cols = mi_col + hbs < cm->mi_cols; -#endif // CONFIG_UNPOISON_PARTITION_CTX -#if CONFIG_SUPERTX || CONFIG_EXT_PARTITION_TYPES_AB - const AV1_COMMON *const cm = &cpi->common; -#endif -#if CONFIG_SUPERTX - TileInfo *const tile_info = &tile_data->tile_info; - int sum_rate_nocoef, this_rate_nocoef; - int abort_flag; - const int supertx_allowed = !frame_is_intra_only(cm) && - bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]; - -#define RTP_STX_TRY_ARGS \ - best_rd, &sum_rate_nocoef, &this_rate_nocoef, &abort_flag, -#else -#define RTP_STX_TRY_ARGS -#endif + if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) { + pc_tree->cb_search_range = SPLIT_PLANE; + } + } - if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, 
mi_row0, mi_col0, subsize0, - best_rdc, &sum_rdc, &this_rdc, - RTP_STX_TRY_ARGS partition, ctx, &ctxs[0])) - return; + if (pc_tree->partitioning == PARTITION_NONE) { + pc_tree->cb_search_range = SEARCH_SAME_PLANE; + if (pn_rdc.dist <= sum_rdc.dist) + pc_tree->cb_search_range = NONE_PARTITION_PLANE; + } - if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1, - best_rdc, &sum_rdc, &this_rdc, - RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1])) - return; + if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE; -// With the new layout of mixed partitions for PARTITION_HORZ_B and -// PARTITION_VERT_B, the last subblock might start past halfway through the -// main block, so we might signal it even though the subblock lies strictly -// outside the image. In that case, we won't spend any bits coding it and the -// difference (obviously) doesn't contribute to the error. -#if CONFIG_EXT_PARTITION_TYPES_AB - const int try_block2 = mi_row2 < cm->mi_rows && mi_col2 < cm->mi_cols; -#else - const int try_block2 = 1; -#endif - if (try_block2 && - !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2, - best_rdc, &sum_rdc, &this_rdc, - RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2])) - return; + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } // if (do_split) -#if CONFIG_SUPERTX - if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - pc_tree->partitioning = partition; - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 }; - - restore_context(x, x_ctx, mi_row, mi_col, bsize); - - 
rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } + pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT; + if (do_square_split) { + for (int i = 0; i < 4; ++i) { + pc_tree->pc_tree_stats.sub_block_split[i] = + pc_tree->split[i]->partitioning == PARTITION_SPLIT; + } + } + + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void)best_rd; + *rd_cost = best_rdc; + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && + pc_tree->index != 3) { + if (bsize == cm->seq_params.sb_size) { + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); } + } - pc_tree->partitioning = best_partition; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && + best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); } -#endif +#endif // CONFIG_DIST_8X8 - if (sum_rdc.rdcost >= best_rdc->rdcost) return; + if (bsize == cm->seq_params.sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} - int pl = partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize); - sum_rdc.rate += 
x->partition_cost[pl][partition]; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += x->partition_cost[pl][partition]; -#endif +#define FEATURE_SIZE 19 +static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = { + 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f, + 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f, + 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f, + 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f, +}; - if (sum_rdc.rdcost >= best_rdc->rdcost) return; +static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = { + 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f, + -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f, + -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f, + 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f, +}; -#if CONFIG_SUPERTX - *best_rate_nocoef = sum_rate_nocoef; - assert(*best_rate_nocoef >= 0); -#endif - *best_rdc = sum_rdc; - pc_tree->partitioning = partition; +static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = { + 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f, + -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f, + -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f, + 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f, +}; -#undef RTP_STX_TRY_ARGS -} -#endif // CONFIG_EXT_PARTITION_TYPES +static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = { + 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f, + -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f, + -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f, + -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f, +}; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 -static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, - uint8_t *y_src_8x8) { - MACROBLOCKD *const xd = &x->e_mbd; - int64_t dist_8x8, dist_8x8_uv, total_dist; - 
const int src_stride = x->plane[0].src.stride; - uint8_t *decoded_8x8; - int plane; +static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = { + 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f, + -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f, + -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f, + 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f, +}; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8); - else -#endif - decoded_8x8 = (uint8_t *)x->decoded_8x8; +static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = { + -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f, + -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f, + 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f, + -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f, +}; - dist_8x8 = av1_dist_8x8(cpi, x, y_src_8x8, src_stride, decoded_8x8, 8, - BLOCK_8X8, 8, 8, 8, 8, x->qindex) - << 4; +static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = { + -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f, + -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f, + 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f, + -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f, +}; - // Compute chroma distortion for a luma 8x8 block - dist_8x8_uv = 0; +static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = { + -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f, + -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f, + 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f, + -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f, +}; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - const int src_stride_uv = x->plane[plane].src.stride; - const int dst_stride_uv = xd->plane[plane].dst.stride; - // uv buff pointers now (i.e. 
the last sub8x8 block) is the same - // to those at the first sub8x8 block because - // uv buff pointer is set only once at first sub8x8 block in a 8x8. - uint8_t *src_uv = x->plane[plane].src.buf; - uint8_t *dst_uv = xd->plane[plane].dst.buf; - unsigned sse; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(BLOCK_8X8, &xd->plane[plane])); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(BLOCK_8X8, &xd->plane[plane]); -#endif - cpi->fn_ptr[plane_bsize].vf(src_uv, src_stride_uv, dst_uv, dst_stride_uv, - &sse); - dist_8x8_uv += (int64_t)sse << 4; +static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = { + -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f, + -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f, + 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f, + -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f, +}; + +static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = { + -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f, + -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f, + 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f, + 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f, +}; + +// split_score indicates confidence of picking split partition; +// none_score indicates confidence of picking none partition; +static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, + BLOCK_SIZE bsize, int *split_score, + int *none_score) { + if (!pc_tree_stats->valid) return 0; + const float *split_weights = NULL; + const float *none_weights = NULL; + switch (bsize) { + case BLOCK_4X4: break; + case BLOCK_8X8: + split_weights = two_pass_split_partition_weights_8; + none_weights = two_pass_none_partition_weights_8; + break; + case BLOCK_16X16: + split_weights = two_pass_split_partition_weights_16; + none_weights = two_pass_none_partition_weights_16; + break; + case BLOCK_32X32: + split_weights = 
two_pass_split_partition_weights_32; + none_weights = two_pass_none_partition_weights_32; + break; + case BLOCK_64X64: + split_weights = two_pass_split_partition_weights_64; + none_weights = two_pass_none_partition_weights_64; + break; + case BLOCK_128X128: + split_weights = two_pass_split_partition_weights_128; + none_weights = two_pass_none_partition_weights_128; + break; + default: assert(0 && "Unexpected bsize."); } + if (!split_weights || !none_weights) return 0; - return total_dist = dist_8x8 + dist_8x8_uv; + aom_clear_system_state(); + + float features[FEATURE_SIZE]; + int feature_index = 0; + features[feature_index++] = (float)pc_tree_stats->split; + features[feature_index++] = (float)pc_tree_stats->skip; + const int rdcost = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost); + const int rd_valid = rdcost > 0 && rdcost < 1000000000; + features[feature_index++] = (float)rd_valid; + for (int i = 0; i < 4; ++i) { + features[feature_index++] = (float)pc_tree_stats->sub_block_split[i]; + features[feature_index++] = (float)pc_tree_stats->sub_block_skip[i]; + const int sub_rdcost = + (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[i]); + const int sub_rd_valid = sub_rdcost > 0 && sub_rdcost < 1000000000; + features[feature_index++] = (float)sub_rd_valid; + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (rd_valid && sub_rd_valid && sub_rdcost < rdcost) + rd_ratio = (float)sub_rdcost / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == FEATURE_SIZE); + + float score_1 = split_weights[FEATURE_SIZE]; + float score_2 = none_weights[FEATURE_SIZE]; + for (int i = 0; i < FEATURE_SIZE; ++i) { + score_1 += features[i] * split_weights[i]; + score_2 += features[i] * none_weights[i]; + } + *split_score = (int)(score_1 * 100); + *none_score = (int)(score_2 * 100); + return 1; +} +#undef FEATURE_SIZE + +// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. 
+static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], + int *const horza_partition_allowed, + int *const horzb_partition_allowed, + int *const verta_partition_allowed, + int *const vertb_partition_allowed) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_8X8: nn_config = NULL; break; + case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; + case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. + float features[10]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)var_ctx; + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == 10); + + // Calculate scores using the NN model. 
+ float score[16] = { 0.0f }; + av1_nn_predict(features, nn_config, score); + int int_score[16]; + int max_score = -1000; + for (int i = 0; i < 16; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. + int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + *horza_partition_allowed = 0; + *horzb_partition_allowed = 0; + *verta_partition_allowed = 0; + *vertb_partition_allowed = 0; + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *horza_partition_allowed = 1; + if ((i >> 1) & 1) *horzb_partition_allowed = 1; + if ((i >> 2) & 1) *verta_partition_allowed = 1; + if ((i >> 3) & 1) *vertb_partition_allowed = 1; + } + } } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization @@ -3488,12 +2882,10 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, - RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *rate_nocoef, -#endif - int64_t best_rd, PC_TREE *pc_tree) { + RD_STATS *rd_cost, int64_t best_rd, + PC_TREE *pc_tree, int64_t *none_rd) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -3501,114 +2893,87 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; const TOKENEXTRA *const tp_orig = *tp; PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; -#if CONFIG_UNPOISON_PARTITION_CTX - const int hbs = mi_size_wide[bsize] / 2; - const 
int has_rows = mi_row + hbs < cm->mi_rows; - const int has_cols = mi_col + hbs < cm->mi_cols; -#else int tmp_partition_cost[PARTITION_TYPES]; -#endif BLOCK_SIZE subsize; RD_STATS this_rdc, sum_rdc, best_rdc; const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); int do_square_split = bsize_at_least_8x8; -#if CONFIG_CB4X4 - const int unify_bsize = 1; const int pl = bsize_at_least_8x8 - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize) + ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; -#else - const int unify_bsize = 0; - const int pl = partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize); -#endif // CONFIG_CB4X4 const int *partition_cost = pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0]; -#if CONFIG_SUPERTX - int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX; - int abort_flag; - const int supertx_allowed = !frame_is_intra_only(cm) && bsize >= BLOCK_8X8 && - bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]; -#endif // CONFIG_SUPERTX int do_rectangular_split = 1; -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif + int64_t split_rd[4] = { 0, 0, 0, 0 }; + int64_t horz_rd[2] = { 0, 0 }; + int64_t vert_rd[2] = { 0, 0 }; + + int split_ctx_is_ready[2] = { 0, 0 }; + int horz_ctx_is_ready = 0; + int vert_ctx_is_ready = 0; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + + if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks - const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); - const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int has_rows = (mi_row + mi_step < cm->mi_rows); + const int has_cols = (mi_col + mi_step < cm->mi_cols); const int xss = x->e_mbd.plane[1].subsampling_x; 
const int yss = x->e_mbd.plane[1].subsampling_y; BLOCK_SIZE min_size = x->min_partition_size; BLOCK_SIZE max_size = x->max_partition_size; + if (none_rd) *none_rd = 0; + #if CONFIG_FP_MB_STATS unsigned int src_diff_var = UINT_MAX; int none_complexity = 0; #endif - int partition_none_allowed = !force_horz_split && !force_vert_split; - int partition_horz_allowed = - !force_vert_split && yss <= xss && bsize_at_least_8x8; - int partition_vert_allowed = - !force_horz_split && xss <= yss && bsize_at_least_8x8; - -#if CONFIG_PVQ - od_rollback_buffer pre_rdo_buf; -#endif + int partition_none_allowed = has_rows && has_cols; + int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; (void)*tp_orig; -#if !CONFIG_UNPOISON_PARTITION_CTX - if (force_horz_split || force_vert_split) { - tmp_partition_cost[PARTITION_NONE] = INT_MAX; - - if (!force_vert_split) { // force_horz_split only - tmp_partition_cost[PARTITION_VERT] = INT_MAX; - tmp_partition_cost[PARTITION_HORZ] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0); - tmp_partition_cost[PARTITION_SPLIT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1); - } else if (!force_horz_split) { // force_vert_split only - tmp_partition_cost[PARTITION_HORZ] = INT_MAX; - tmp_partition_cost[PARTITION_VERT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0); - tmp_partition_cost[PARTITION_SPLIT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1); - } else { // force_ horz_split && force_vert_split horz_split - tmp_partition_cost[PARTITION_HORZ] = INT_MAX; - tmp_partition_cost[PARTITION_VERT] = INT_MAX; + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + for (int i = 0; i < PARTITION_TYPES; 
++i) tmp_partition_cost[i] = INT_MAX; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split tmp_partition_cost[PARTITION_SPLIT] = 0; } partition_cost = tmp_partition_cost; } -#endif -#if CONFIG_VAR_TX #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just // leftover from encoding the previous block. Setting it to magic number // when debugging. - memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0])); + memset(x->blk_skip, 234, sizeof(x->blk_skip)); #endif // NDEBUG -#endif // CONFIG_VAR_TX assert(mi_size_wide[bsize] == mi_size_high[bsize]); av1_init_rd_stats(&this_rdc); - av1_init_rd_stats(&sum_rdc); av1_invalid_rd_stats(&best_rdc); best_rdc.rdcost = best_rd; @@ -3634,26 +2999,70 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // Note: Further partitioning is NOT allowed when bsize == min_size already. 
const int partition_allowed = (bsize <= max_size && bsize > min_size); partition_none_allowed &= no_partition_allowed; - partition_horz_allowed &= partition_allowed || force_horz_split; - partition_vert_allowed &= partition_allowed || force_vert_split; + partition_horz_allowed &= partition_allowed || !has_rows; + partition_vert_allowed &= partition_allowed || !has_cols; do_square_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { - partition_horz_allowed &= force_horz_split; - partition_vert_allowed &= force_vert_split; + partition_horz_allowed &= !has_rows; + partition_vert_allowed &= !has_cols; + } + + if (bsize > BLOCK_4X4 && x->use_cb_search_range && + cpi->sf.auto_min_max_partition_size == 0) { + int split_score = 0; + int none_score = 0; + const int score_valid = ml_prune_2pass_split_partition( + &pc_tree->pc_tree_stats, bsize, &split_score, &none_score); + if (score_valid) { + { + const int only_split_thresh = 300; + const int no_none_thresh = 250; + const int no_split_thresh = 0; + if (split_score > only_split_thresh) { + partition_none_allowed = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } else if (split_score > no_none_thresh) { + partition_none_allowed = 0; + } + if (split_score < no_split_thresh) do_square_split = 0; + } + { + const int no_split_thresh = 120; + const int no_none_thresh = -120; + if (none_score > no_split_thresh && partition_none_allowed) + do_square_split = 0; + if (none_score < no_none_thresh) partition_none_allowed = 0; + } + } else { + if (pc_tree->cb_search_range == SPLIT_PLANE) { + partition_none_allowed = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } + if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0; + if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) { + do_square_split = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } + } + + // Fall back to default values in case all partition modes are rejected. 
+ if (partition_none_allowed == 0 && do_square_split == 0 && + partition_horz_allowed == 0 && partition_vert_allowed == 0) { + do_square_split = bsize_at_least_8x8; + partition_none_allowed = has_rows && has_cols; + partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + } } -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -3712,16 +3121,17 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #endif +BEGIN_PARTITION_SEARCH: + if (x->must_find_valid_partition) { + partition_none_allowed = has_rows && has_cols; + partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + } // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, -#if CONFIG_SUPERTX - &this_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + if (none_rd) *none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { if (bsize_at_least_8x8) { const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX @@ -3729,9 +3139,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = 
RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); -#if CONFIG_SUPERTX - this_rate_nocoef += pt_cost; -#endif } if (this_rdc.rdcost < best_rdc.rdcost) { @@ -3739,16 +3146,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, const int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr >> ((2 * (MAX_SB_SIZE_LOG2 - 2)) - - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); const int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr * num_pels_log2_lookup[bsize]; best_rdc = this_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = this_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; // If all y, u, v transform blocks in this partition are skippable, and @@ -3756,7 +3159,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // terminated for current branch of the partition search tree. // The dist & rate thresholds are set to 0 at speed 0 to disable the // early termination at that speed. 
- if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] && + if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && best_rdc.rate < rate_breakout_thr)) { do_square_split = 0; @@ -3810,202 +3213,88 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #endif } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (!x->skip_chroma_rd) { - cfl_clear_sub8x8_val(xd->cfl); - } -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // store estimated motion vector if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); -#if CONFIG_SUPERTX - int64_t temp_best_rdcost = INT64_MAX; -#else - int64_t temp_best_rdcost = best_rdc.rdcost; -#endif - - // PARTITION_SPLIT - // TODO(jingning): use the motion vectors given by the above search as - // the starting point of motion search in the following partition type check. 
- if (do_square_split) { - int reached_last_index = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - if (bsize == BLOCK_8X8 && !unify_bsize) { - if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) - pc_tree->leaf_split[0]->pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - subsize, pc_tree->leaf_split[0], temp_best_rdcost); - if (sum_rdc.rate == INT_MAX) { - sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif - } -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_SPLIT; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, - &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } - - pc_tree->partitioning = best_partition; - } -#endif // CONFIG_SUPERTX - reached_last_index = 1; - } else { - int idx; - for (idx = 0; idx < 4 && sum_rdc.rdcost < 
temp_best_rdcost; ++idx) { - const int x_idx = (idx & 1) * mi_step; - const int y_idx = (idx >> 1) * mi_step; +#if CONFIG_DIST_8X8 + uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) - continue; + if (x->using_dist_8x8 && bsize == BLOCK_8X8) { + for (int i = 0; i < num_planes; i++) { + src_plane_8x8[i] = x->plane[i].src.buf; + dst_plane_8x8[i] = xd->plane[i].dst.buf; + } + } +#endif // CONFIG_DIST_8X8 - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + // PARTITION_SPLIT + if (do_square_split) { + av1_init_rd_stats(&sum_rdc); + int reached_last_index = 0; + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int idx; - pc_tree->split[idx]->index = idx; - rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, -#if CONFIG_SUPERTX - &this_rate_nocoef, -#endif - temp_best_rdcost - sum_rdc.rdcost, - pc_tree->split[idx]); - - if (this_rdc.rate == INT_MAX) { - sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - break; - } else { - sum_rdc.rate += this_rdc.rate; - sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX - } - } - reached_last_index = (idx == 4); + for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - const int src_stride = x->plane[0].src.stride; - int64_t dist_8x8; - dist_8x8 = - dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4); - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (mi_row + y_idx >= cm->mi_rows || mi_col 
+ x_idx >= cm->mi_cols) + continue; -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); - pc_tree->partitioning = PARTITION_SPLIT; + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, + subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, + pc_tree->split[idx], p_split_rd); - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, - &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && 
pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1; } } - - pc_tree->partitioning = best_partition; } -#endif // CONFIG_SUPERTX } + reached_last_index = (idx == 4); -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost) - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && reached_last_index && + sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + int64_t dist_8x8; + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 + sum_rdc.dist = dist_8x8; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DIST_8X8 if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_SPLIT]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#else - temp_best_rdcost = best_rdc.rdcost; -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_SPLIT; } } else if (cpi->sf.less_rectangular_check) { @@ -4013,473 +3302,362 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // gives better rd cost do_rectangular_split &= !partition_none_allowed; } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // if (do_split) // 
PARTITION_HORZ if (partition_horz_allowed && - (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { - subsize = get_subsize(bsize, PARTITION_HORZ); + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_HORZ); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif // CONFIG_SUPERTX -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[0], best_rdc.rdcost); + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + best_rdc.rdcost); + horz_rd[0] = sum_rdc.rdcost; -#if CONFIG_SUPERTX - abort_flag = - (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || - (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); -#endif - if (sum_rdc.rdcost < temp_best_rdcost && !force_horz_split && - (bsize > BLOCK_8X8 || unify_bsize)) { + if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) { PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1; + } + update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); if (cpi->sf.adaptive_motion_search) 
load_pred_mv(x, ctx_h); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = - av1_extract_interp_filter(ctx_h->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_h->mic.interp_filters, 0); -#if CONFIG_SUPERTX - rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, - &this_rate_nocoef, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], INT64_MAX); -#else rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); -#endif // CONFIG_SUPERTX + horz_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col, - subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col, - subsize, NULL); + update_state(cpi, tile_data, td, &pc_tree->horizontal[1], + mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, + mi_row + mi_step, mi_col, subsize, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - const int src_stride = x->plane[0].src.stride; int64_t dist_8x8; - dist_8x8 = 
dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride); + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 sum_rdc.dist = dist_8x8; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 - } - -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_HORZ; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc - ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } - - pc_tree->partitioning = best_partition; +#endif // CONFIG_DIST_8X8 } -#endif // CONFIG_SUPERTX -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_HORZ]; 
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_HORZ]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_HORZ; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT if (partition_vert_allowed && - (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) { - subsize = get_subsize(bsize, PARTITION_VERT); + (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_VERT); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif // CONFIG_SUPERTX -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[0], best_rdc.rdcost); -#if CONFIG_SUPERTX - abort_flag = - (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || - (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); - const int64_t vert_max_rdcost = INT64_MAX; -#else + PARTITION_VERT, subsize, &pc_tree->vertical[0], + best_rdc.rdcost); + vert_rd[0] = sum_rdc.rdcost; const int64_t vert_max_rdcost = best_rdc.rdcost; -#endif // CONFIG_SUPERTX - if (sum_rdc.rdcost < vert_max_rdcost && !force_vert_split && - (bsize > BLOCK_8X8 || unify_bsize)) { - update_state(cpi, 
td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + if (sum_rdc.rdcost < vert_max_rdcost && has_cols) { + MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1; + } + update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col, + subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); -#if CONFIG_SUPERTX - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, - &this_rate_nocoef, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[1], - INT64_MAX - sum_rdc.rdcost); -#else rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[1], + PARTITION_VERT, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); -#endif // CONFIG_SUPERTX + vert_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step, - subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step, - subsize, NULL); + update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row, + mi_col + mi_step, subsize, DRY_RUN_NORMAL); + 
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col + mi_step, subsize, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4); + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && + 0 /* !CONFIG_CFL */) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 sum_rdc.dist = dist_8x8; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 } -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_VERT; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate 
+= av1_cost_bit( - cm->fc - ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } - - pc_tree->partitioning = best_partition; - } -#endif // CONFIG_SUPERTX - -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_VERT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_VERT]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_VERT; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } -#if CONFIG_EXT_PARTITION_TYPES const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; -#if CONFIG_EXT_PARTITION && CONFIG_EXT_PARTITION_TYPES_AB - // Don't allow A/B partitions on 128x128 blocks for now (support for - // 128x32 and 32x128 blocks doesn't yet exist). - const int ab_partition_allowed = - ext_partition_allowed && bsize < BLOCK_128X128; -#else - const int ab_partition_allowed = ext_partition_allowed; -#endif + // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or + // PARTITION_VERT_4 for this block. 
This is almost the same as + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks, + // so we require that bsize is not BLOCK_128X128. + const int partition4_allowed = + ext_partition_allowed && bsize != BLOCK_128X128; + + // The standard AB partitions are allowed whenever ext-partition-types are + // allowed + int horzab_partition_allowed = ext_partition_allowed; + int vertab_partition_allowed = ext_partition_allowed; + + if (cpi->sf.prune_ext_partition_types_search_level) { + if (cpi->sf.prune_ext_partition_types_search_level == 1) { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + x->source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + x->source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? 
split_rd[3] : 0); + } + int horza_partition_allowed = horzab_partition_allowed; + int horzb_partition_allowed = horzab_partition_allowed; + if (cpi->sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.prune_ext_partition_types_search_level) { + case 1: + horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + int verta_partition_allowed = vertab_partition_allowed; + int vertb_partition_allowed = vertab_partition_allowed; + if (cpi->sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.prune_ext_partition_types_search_level) { + case 1: + verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed && + partition_horz_allowed && partition_vert_allowed) { + ml_prune_ab_partition(bsize, pc_tree->partitioning, + get_unsigned_bits(x->source_variance), + best_rdc.rdcost, horz_rd, vert_rd, split_rd, + &horza_partition_allowed, &horzb_partition_allowed, + &verta_partition_allowed, &vertb_partition_allowed); + } // PARTITION_HORZ_A - if (partition_horz_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, 
pc_tree->horizontala, - ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4), - mi_row + mi_step / 2, mi_col, get_subsize(bsize, PARTITION_HORZ_4), - mi_row + mi_step, mi_col, get_subsize(bsize, PARTITION_HORZ)); -#else - subsize = get_subsize(bsize, PARTITION_HORZ_A); + if (partition_horz_allowed && horza_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_A); + pc_tree->horizontala[0].rd_mode_is_ready = 0; + pc_tree->horizontala[1].rd_mode_is_ready = 0; + pc_tree->horizontala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none); + pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[0].rd_mode_is_ready = 1; + if (split_ctx_is_ready[1]) { + av1_copy_tree_context(&pc_tree->horizontala[1], + &pc_tree->split[1]->none); + pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[1].rd_mode_is_ready = 1; + } + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, - PARTITION_HORZ_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2, - mi_row + mi_step, mi_col, subsize); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row, + mi_col + mi_step, bsize2, mi_row + mi_step, mi_col, + subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_HORZ_B - if (partition_horz_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, - ctx_none, mi_row, mi_col, bsize, 
PARTITION_HORZ_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ), mi_row + mi_step, - mi_col, get_subsize(bsize, PARTITION_HORZ_4), mi_row + 3 * mi_step / 2, - mi_col, get_subsize(bsize, PARTITION_HORZ_4)); -#else - subsize = get_subsize(bsize, PARTITION_HORZ_B); + if (partition_horz_allowed && horzb_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_B); + pc_tree->horizontalb[0].rd_mode_is_ready = 0; + pc_tree->horizontalb[1].rd_mode_is_ready = 0; + pc_tree->horizontalb[2].rd_mode_is_ready = 0; + if (horz_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]); + pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; + pc_tree->horizontalb[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, - PARTITION_HORZ_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, subsize, mi_row + mi_step, mi_col, - bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_HORZ_B, mi_row, mi_col, subsize, + mi_row + mi_step, mi_col, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + // PARTITION_VERT_A - if (partition_vert_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, - ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + mi_step / 2, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + mi_step, get_subsize(bsize, 
PARTITION_VERT)); -#else - subsize = get_subsize(bsize, PARTITION_VERT_A); + if (partition_vert_allowed && verta_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_A); + pc_tree->verticala[0].rd_mode_is_ready = 0; + pc_tree->verticala[1].rd_mode_is_ready = 0; + pc_tree->verticala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none); + pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; + pc_tree->verticala[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, - PARTITION_VERT_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2, - mi_row, mi_col + mi_step, subsize); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_VERT_A, mi_row, mi_col, bsize2, + mi_row + mi_step, mi_col, bsize2, mi_row, + mi_col + mi_step, subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT_B - if (partition_vert_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, - ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_VERT), mi_row, - mi_col + mi_step, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + 3 * mi_step / 2, get_subsize(bsize, PARTITION_VERT_4)); -#else - subsize = get_subsize(bsize, PARTITION_VERT_B); + if (partition_vert_allowed && vertb_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_B); + pc_tree->verticalb[0].rd_mode_is_ready = 0; + pc_tree->verticalb[1].rd_mode_is_ready = 0; + 
pc_tree->verticalb[2].rd_mode_is_ready = 0; + if (vert_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]); + pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; + pc_tree->verticalb[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, - PARTITION_VERT_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, subsize, mi_row, mi_col + mi_step, - bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row, + mi_col + mi_step, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } -#if CONFIG_EXT_PARTITION - const int can_partition_4 = (bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || - bsize == BLOCK_32X32 || bsize == BLOCK_16X16); -#else - const int can_partition_4 = - (bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16); -#endif // CONFIG_EXT_PARTITION - // PARTITION_HORZ_4 - // TODO(david.barker): For this and PARTITION_VERT_4, - // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the - // chroma plane - // * Add support for supertx - if (can_partition_4 && partition_horz_allowed && !force_horz_split && - (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { + int partition_horz4_allowed = partition4_allowed && partition_horz_allowed; + if (cpi->sf.prune_ext_partition_types_search_level == 2) { + partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_HORZ_A || + pc_tree->partitioning == PARTITION_HORZ_B || + pc_tree->partitioning == PARTITION_SPLIT || + pc_tree->partitioning == PARTITION_NONE); + } + if 
(partition_horz4_allowed && has_rows && + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_high[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; - subsize = get_subsize(bsize, PARTITION_HORZ_4); + subsize = get_partition_subsize(bsize, PARTITION_HORZ_4); for (int i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; @@ -4488,6 +3666,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i]; + ctx_this->rd_mode_is_ready = 0; if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this)) @@ -4504,19 +3683,25 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->partitioning = PARTITION_HORZ_4; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + // PARTITION_VERT_4 - if (can_partition_4 && partition_vert_allowed && !force_vert_split && - (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) { + int partition_vert4_allowed = partition4_allowed && partition_vert_allowed; + if (cpi->sf.prune_ext_partition_types_search_level == 2) { + partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_VERT_A || + pc_tree->partitioning == PARTITION_VERT_B || + pc_tree->partitioning == PARTITION_SPLIT || + pc_tree->partitioning == PARTITION_NONE); + } + if (partition_vert4_allowed && has_cols && + (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_wide[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; - subsize = get_subsize(bsize, PARTITION_VERT_4); + subsize = 
get_partition_subsize(bsize, PARTITION_VERT_4); for (int i = 0; i < 4; ++i) { int this_mi_col = mi_col + i * quarter_step; @@ -4525,6 +3710,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i]; + ctx_this->rd_mode_is_ready = 0; if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row, this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_VERT_4, ctx_prev, ctx_this)) @@ -4541,13 +3727,15 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->partitioning = PARTITION_VERT_4; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) { + // Did not find a valid partition, go back and search again, with less + // constraint on which partition types to search. 
+ x->must_find_valid_partition = 1; + goto BEGIN_PARTITION_SEARCH; } -#endif // CONFIG_EXT_PARTITION_TYPES // TODO(jbb): This code added so that we avoid static analysis // warning related to the fact that best_rd isn't used after this @@ -4556,44 +3744,27 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, (void)best_rd; *rd_cost = best_rdc; -#if CONFIG_SUPERTX - *rate_nocoef = best_rate_nocoef; -#endif // CONFIG_SUPERTX - if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { - if (bsize == cm->sb_size) { -#if CONFIG_MOTION_VAR && NC_MODE_INFO - set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree); -#endif - -#if CONFIG_LV_MAP + if (bsize == cm->seq_params.sb_size) { x->cb_offset = 0; -#endif - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - set_sb_mi_boundaries(cm, xd, mi_row, mi_col); -#endif - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 - if (bsize == cm->sb_size) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip)); -#endif + if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { @@ -4601,71 +3772,62 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } } +// 
Set all the counters as max. +static void init_first_partition_pass_stats_tables( + FIRST_PARTITION_PASS_STATS *stats) { + for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts)); + memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts)); + stats[i].sample_counts = INT_MAX; + } +} + +// Minimum number of samples to trigger the +// mode_pruning_based_on_two_pass_partition_search feature. +#define FIRST_PARTITION_PASS_MIN_SAMPLES 16 + static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; SPEED_FEATURES *const sf = &cpi->sf; int mi_col; -#if CONFIG_EXT_PARTITION const int leaf_nodes = 256; -#else - const int leaf_nodes = 64; -#endif // CONFIG_EXT_PARTITION // Initialize the left context for the new SB row av1_zero_left_context(xd); // Reset delta for every tile - if (cm->delta_q_present_flag) - if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - if (cm->delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - if (mi_row == tile_info->mi_row_start) - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0; + if (mi_row == tile_info->mi_row_start) { + if (cm->delta_q_present_flag) xd->current_qindex = cm->base_qindex; + if (cm->delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } } -#endif // Code each SB in the row for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; - mi_col += cm->mib_size) { + mi_col += cm->seq_params.mib_size) { const struct segmentation *const seg = &cm->seg; int dummy_rate; 
int64_t dummy_dist; RD_STATS dummy_rdc; -#if CONFIG_SUPERTX - int dummy_rate_nocoef; -#endif // CONFIG_SUPERTX int i; int seg_skip = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; - MODE_INFO **mi = cm->mi_grid_visible + idx_str; - PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; + MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; + PC_TREE *const pc_root = + td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; -#if CONFIG_LV_MAP && LV_MAP_PROB - av1_fill_coeff_costs(&td->mb, xd->tile_ctx); -#else - av1_fill_token_costs_from_cdf(x->token_head_costs, - x->e_mbd.tile_ctx->coef_head_cdfs); - av1_fill_token_costs_from_cdf(x->token_tail_costs, - x->e_mbd.tile_ctx->coef_tail_cdfs); -#endif + av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); av1_fill_mode_rates(cm, x, xd->tile_ctx); if (sf->adaptive_pred_interp_filter) { -#if !CONFIG_CB4X4 - for (i = 0; i < leaf_nodes; ++i) - td->leaf_tree[i].pred_interp_filter = SWITCHABLE; -#endif - for (i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; @@ -4674,29 +3836,43 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } - x->tx_rd_record.num = x->tx_rd_record.index_start = 0; + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; + + av1_zero(x->txb_rd_record_8X8); + av1_zero(x->txb_rd_record_16X16); + av1_zero(x->txb_rd_record_32X32); + av1_zero(x->txb_rd_record_64X64); + av1_zero(x->txb_rd_record_intra); + av1_zero(x->pred_mv); pc_root->index = 0; if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col); + int segment_id = + map ? 
get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col) + : 0; seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } -#if CONFIG_AMVR - xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level; -#endif + xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv; if (cm->delta_q_present_flag) { - // Test mode for delta quantization - int sb_row = mi_row >> 3; - int sb_col = mi_col >> 3; - int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2; - int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16; - - // Ensure divisibility of delta_qindex by delta_q_res - int offset_qindex = (index < 0 ? -index - 8 : index - 8); + // Delta-q modulation based on variance + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + + int offset_qindex; + if (DELTAQ_MODULATION == 1) { + const int block_wavelet_energy_level = + av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size); + offset_qindex = av1_compute_deltaq_from_energy_level( + cpi, block_wavelet_energy_level); + } else { + const int block_var_level = + av1_block_energy(cpi, x, cm->seq_params.sb_size); + offset_qindex = + av1_compute_deltaq_from_energy_level(cpi, block_var_level); + } int qmask = ~(cm->delta_q_res - 1); int current_qindex = clamp(cm->base_qindex + offset_qindex, cm->delta_q_res, 256 - cm->delta_q_res); @@ -4707,136 +3883,163 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, assert(current_qindex > 0); xd->delta_qindex = current_qindex - cm->base_qindex; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); - xd->mi[0]->mbmi.current_q_index = current_qindex; -#if !CONFIG_EXT_DELTA_Q - xd->mi[0]->mbmi.segment_id = 0; -#endif // CONFIG_EXT_DELTA_Q - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#if CONFIG_EXT_DELTA_Q + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); if 
(cpi->oxcf.deltaq_mode == DELTA_Q_LF) { int j, k; int lfmask = ~(cm->delta_lf_res - 1); - int current_delta_lf_from_base = offset_qindex / 2; - current_delta_lf_from_base = - ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); + int delta_lf_from_base = offset_qindex / 2; + delta_lf_from_base = + ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); // pre-set the delta lf for loop filter. Note that this value is set // before mi is assigned for each block in current superblock - for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) { - for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) { + for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); + j++) { + for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); + k++) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] - .mbmi.current_delta_lf_from_base = - clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER); -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + .delta_lf_from_base = + clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] - .mbmi.curr_delta_lf[lf_id] = current_delta_lf_from_base; + .delta_lf[lf_id] = + clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } -#endif // CONFIG_LOOPFILTER_LEVEL } } } -#endif // CONFIG_EXT_DELTA_Q } x->source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); - bsize = seg_skip ? cm->sb_size : sf->always_this_block_size; + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); + bsize = seg_skip ? 
cm->seq_params.sb_size : sf->always_this_block_size; set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, - &dummy_rate, &dummy_dist, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - 1, pc_root); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, + pc_root); } else if (cpi->partition_search_skippable_frame) { BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, - &dummy_rate, &dummy_dist, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - 1, pc_root); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, + pc_root); } else { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size, - &dummy_rdc, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - INT64_MAX, pc_root); + + reset_partition(pc_root, cm->seq_params.sb_size); + x->use_cb_search_range = 0; + init_first_partition_pass_stats_tables(x->first_partition_pass_stats); + if (cpi->sf.two_pass_partition_search && + mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows && + mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols && + cm->frame_type != KEY_FRAME) { + 
x->cb_partition_scan = 1; + // Reset the stats tables. + if (sf->mode_pruning_based_on_two_pass_partition_search) + av1_zero(x->first_partition_pass_stats); + rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + x->cb_partition_scan = 0; + + x->source_variance = UINT_MAX; + if (sf->adaptive_pred_interp_filter) { + for (i = 0; i < leaf_nodes; ++i) { + td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; + } + } + + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; + av1_zero(x->txb_rd_record_8X8); + av1_zero(x->txb_rd_record_16X16); + av1_zero(x->txb_rd_record_32X32); + av1_zero(x->txb_rd_record_64X64); + av1_zero(x->txb_rd_record_intra); + av1_zero(x->pred_mv); + pc_root->index = 0; + + for (int idy = 0; idy < mi_size_high[cm->seq_params.sb_size]; ++idy) { + for (int idx = 0; idx < mi_size_wide[cm->seq_params.sb_size]; ++idx) { + const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx); + cm->mi_grid_visible[offset] = 0; + } + } + + x->use_cb_search_range = 1; + + if (sf->mode_pruning_based_on_two_pass_partition_search) { + for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + FIRST_PARTITION_PASS_STATS *const stat = + &x->first_partition_pass_stats[i]; + if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { + // If there are not enough samples collected, make all available. + memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts)); + memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts)); + } else if (sf->selective_ref_frame < 2) { + // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the + // initial partition scan, so we don't eliminate them. 
+ stat->ref0_counts[ALTREF2_FRAME] = 0xff; + stat->ref1_counts[ALTREF2_FRAME] = 0xff; + stat->ref0_counts[BWDREF_FRAME] = 0xff; + stat->ref1_counts[BWDREF_FRAME] = 0xff; + } + } + } + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + } else { + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + } } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { + av1_inter_mode_data_fit(x->rdmult); + } +#endif } } static void init_encode_frame_mb_context(AV1_COMP *cpi) { - MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; // Copy data over into macro block data structures. 
- av1_setup_src_planes(x, cpi->source, 0, 0); - - av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); -} - -#if !CONFIG_REF_ADAPT -static int check_dual_ref_flags(AV1_COMP *cpi) { - const int ref_flags = cpi->ref_frame_flags; - - if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { - return 0; - } else { - return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) + -#if CONFIG_EXT_REFS - !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) + - !!(ref_flags & AOM_BWD_FLAG) + !!(ref_flags & AOM_ALT2_FLAG) + -#endif // CONFIG_EXT_REFS - !!(ref_flags & AOM_ALT_FLAG)) >= 2; - } -} -#endif // !CONFIG_REF_ADAPT - -#if !CONFIG_VAR_TX -static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) { - int mi_row, mi_col; - const int mis = cm->mi_stride; - MODE_INFO **mi_ptr = cm->mi_grid_visible; + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes); - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size) - mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; - } - } + av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes); } -#endif static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME; -#if CONFIG_EXT_REFS // We will not update the golden frame with an internal overlay frame else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) || cpi->rc.is_src_frame_ext_arf) -#else - else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) -#endif // CONFIG_EXT_REFS return ALTREF_FRAME; - else if (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame) return GOLDEN_FRAME; else @@ -4846,22 +4049,19 @@ static MV_REFERENCE_FRAME 
get_frame_type(const AV1_COMP *cpi) { } static TX_MODE select_tx_mode(const AV1_COMP *cpi) { - if (cpi->common.all_lossless) return ONLY_4X4; -#if CONFIG_VAR_TX_NO_TX_MODE - return TX_MODE_SELECT; -#else + if (cpi->common.coded_lossless) return ONLY_4X4; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) - return ALLOW_32X32 + CONFIG_TX64X64; + return TX_MODE_LARGEST; else if (cpi->sf.tx_size_search_method == USE_FULL_RD || - cpi->sf.tx_size_search_method == USE_TX_8X8) + cpi->sf.tx_size_search_method == USE_FAST_RD) return TX_MODE_SELECT; else return cpi->common.tx_mode; -#endif // CONFIG_VAR_TX_NO_TX_MODE } void av1_init_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; int tile_col, tile_row; @@ -4886,29 +4086,23 @@ void av1_init_tile_data(AV1_COMP *cpi) { tile_data->mode_map[i][j] = j; } } -#if CONFIG_PVQ - // This will be dynamically increased as more pvq block is encoded. 
- tile_data->pvq_q.buf_len = 1000; - CHECK_MEM_ERROR( - cm, tile_data->pvq_q.buf, - aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO))); - tile_data->pvq_q.curr_pos = 0; -#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *const tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; av1_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = cpi->tile_tok[tile_row][tile_col]; - tile_tok = allocated_tokens(*tile_info); -#if CONFIG_PVQ - cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0; -#endif + tile_tok = allocated_tokens( + *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + tile_data->allow_update_cdf = !cm->large_scale_tile; + tile_data->allow_update_cdf = + tile_data->allow_update_cdf && !cm->disable_cdf_update; } } } @@ -4922,134 +4116,35 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; -#if CONFIG_DEPENDENT_HORZTILES - if ((!cm->dependent_horz_tiles) || (tile_row == 0) || - tile_info->tg_horz_boundary) { - av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); - } -#else - av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); -#endif + av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end, + tile_row); + av1_init_above_context(cm, &td->mb.e_mbd, tile_row); // Set up pointers to per thread motion search counters. this_tile->m_search_count = 0; // Count of motion search hits. this_tile->ex_search_count = 0; // Exhaustive mesh search hits. 
td->mb.m_search_count_ptr = &this_tile->m_search_count; td->mb.ex_search_count_ptr = &this_tile->ex_search_count; - -#if CONFIG_PVQ - td->mb.pvq_q = &this_tile->pvq_q; - - // TODO(yushin) : activity masking info needs be signaled by a bitstream - td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING; - - if (td->mb.daala_enc.use_activity_masking) - td->mb.daala_enc.qm = OD_HVS_QM; // Hard coded. Enc/dec required to sync. - else - td->mb.daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync. - - { - // FIXME: Multiple segments support - int segment_id = 0; - int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id); - int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); -#if CONFIG_HIGHBITDEPTH - const int quantizer_shift = td->mb.e_mbd.bd - 8; -#else - const int quantizer_shift = 0; -#endif // CONFIG_HIGHBITDEPTH - int64_t q_ac = OD_MAXI( - 1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); - int64_t q_dc = OD_MAXI( - 1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); - /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */ - td->mb.daala_enc.pvq_norm_lambda = - (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS)); - td->mb.daala_enc.pvq_norm_lambda_dc = - (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS)); - // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda); - } - od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv, - td->mb.daala_enc.qm == OD_HVS_QM ? 
OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); - - if (td->mb.daala_enc.use_activity_masking) { - int pli; - int use_masking = td->mb.daala_enc.use_activity_masking; - int segment_id = 0; - int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); - - for (pli = 0; pli < MAX_MB_PLANE; pli++) { - int i; - int q; - - q = qindex; - if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) { - od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][0][pli], NULL); - } else { - i = 0; - while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL && - q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q - << OD_COEFF_SHIFT) { - i++; - } - od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][i][pli], - &OD_DEFAULT_QMS[use_masking][i + 1][pli]); - } - } - } - -#if !CONFIG_ANS - od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025); - od_ec_enc_reset(&td->mb.daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
-#endif -#endif // #if CONFIG_PVQ - this_tile->tctx = *cm->fc; td->mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_CFL - MACROBLOCKD *const xd = &td->mb.e_mbd; - xd->cfl = &this_tile->cfl; - cfl_init(xd->cfl, cm); -#endif - -#if CONFIG_PVQ - td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ + cfl_init(&td->mb.e_mbd.cfl, cm); -#if CONFIG_LOOPFILTERING_ACROSS_TILES - if (!cm->loop_filter_across_tiles_enabled) - av1_setup_across_tile_boundary_info(cm, tile_info); -#endif + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); - av1_crc_calculator_init(&td->mb.tx_rd_record.crc_calculator, 24, 0x5D6DCB); + td->intrabc_used_this_tile = 0; for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; - mi_row += cm->mib_size) { + mi_row += cm->seq_params.mib_size) { encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); } cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info)); -#if CONFIG_PVQ -#if !CONFIG_ANS - od_ec_enc_clear(&td->mb.daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
-#endif - - td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos; - // rewind current position so that bitstream can be written - // from the 1st pvq block - td->mb.pvq_q->curr_pos = 0; - - td->mb.pvq_q = NULL; -#endif + assert(cpi->tok_count[tile_row][tile_col] <= + allocated_tokens(*tile_info, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, + av1_num_planes(cm))); } static void encode_tiles(AV1_COMP *cpi) { @@ -5058,9 +4153,12 @@ static void encode_tiles(AV1_COMP *cpi) { av1_init_tile_data(cpi); - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) + for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + cpi->intrabc_used |= cpi->td.intrabc_used_this_tile; + } + } } #if CONFIG_FP_MB_STATS @@ -5077,52 +4175,34 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, } #endif -#if CONFIG_GLOBAL_MOTION #define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search static int gm_get_params_cost(const WarpedMotionParams *gm, const WarpedMotionParams *ref_gm, int allow_hp) { - assert(gm->wmtype < GLOBAL_TRANS_TYPES); int params_cost = 0; int trans_bits, trans_prec_diff; switch (gm->wmtype) { - case HOMOGRAPHY: - case HORTRAPEZOID: - case VERTRAPEZOID: - if (gm->wmtype != HORTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), - (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); - if (gm->wmtype != VERTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), - (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)); - // Fallthrough intended case AFFINE: case ROTZOOM: params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << 
GM_ALPHA_PREC_BITS), (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); - if (gm->wmtype != VERTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), - (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); - if (gm->wmtype >= AFFINE) { - if (gm->wmtype != HORTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), - (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); } - // Fallthrough intended + AOM_FALLTHROUGH_INTENDED; case TRANSLATION: trans_bits = (gm->wmtype == TRANSLATION) ? 
GM_ABS_TRANS_ONLY_BITS - !allow_hp @@ -5138,7 +4218,7 @@ static int gm_get_params_cost(const WarpedMotionParams *gm, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_gm->wmmat[1] >> trans_prec_diff), (gm->wmmat[1] >> trans_prec_diff)); - // Fallthrough intended + AOM_FALLTHROUGH_INTENDED; case IDENTITY: break; default: assert(0); } @@ -5152,26 +4232,16 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm, switch (sf->gm_search_type) { case GM_FULL_SEARCH: return 1; case GM_REDUCED_REF_SEARCH: -#if CONFIG_EXT_REFS return !(frame == LAST2_FRAME || frame == LAST3_FRAME); -#else - return (num_refs_using_gm < 2); -#endif // CONFIG_EXT_REFS case GM_DISABLE_SEARCH: return 0; default: assert(0); } return 1; } -#endif // CONFIG_GLOBAL_MOTION -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ // Estimate if the source frame is screen content, based on the portion of // blocks that have no more than 4 (experimentally selected) luma colors. -static int is_screen_content(const uint8_t *src, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH +static int is_screen_content(const uint8_t *src, int use_hbd, int bd, int stride, int width, int height) { assert(src != NULL); int counts = 0; @@ -5180,20 +4250,198 @@ static int is_screen_content(const uint8_t *src, const int limit = 4; for (int r = 0; r + blk_h <= height; r += blk_h) { for (int c = 0; c + blk_w <= width; c += blk_w) { + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. const int n_colors = -#if CONFIG_HIGHBITDEPTH use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w, - blk_h, bd) - : -#endif // CONFIG_HIGHBITDEPTH - av1_count_colors(src + r * stride + c, stride, blk_w, blk_h); + blk_h, bd, count_buf) + : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h, + count_buf); if (n_colors > 1 && n_colors <= limit) counts++; } } // The threshold is 10%. 
return counts * blk_h * blk_w * 10 > width * height; } -#endif // !CONFIG_PVQ + +// Enforce the number of references for each arbitrary frame limited to +// (INTER_REFS_PER_FRAME - 1) +static void enforce_max_ref_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + int total_valid_refs = 0; + + (void)flag_list; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++; + } + + // NOTE(zoeliu): When all the possible reference frames are availble, we + // reduce the number of reference frames by 1, following the rules of: + // (1) Retain GOLDEN_FARME/ALTEF_FRAME; + // (2) Check the earliest 2 remaining reference frames, and remove the one + // with the lower quality factor, otherwise if both have been coded at + // the same quality level, remove the earliest reference frame. + + if (total_valid_refs == INTER_REFS_PER_FRAME) { + unsigned int min_ref_offset = UINT_MAX; + unsigned int second_min_ref_offset = UINT_MAX; + MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME }; + int earliest_buf_idxes[2] = { 0 }; + + // Locate the earliest two reference frames except GOLDEN/ALTREF. 
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Retain GOLDEN/ALTERF + if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue; + + const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; + if (buf_idx >= 0) { + const unsigned int ref_offset = + cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + + if (min_ref_offset == UINT_MAX) { + min_ref_offset = ref_offset; + earliest_ref_frames[0] = ref_frame; + earliest_buf_idxes[0] = buf_idx; + } else { + if (get_relative_dist(cm, ref_offset, min_ref_offset) < 0) { + second_min_ref_offset = min_ref_offset; + earliest_ref_frames[1] = earliest_ref_frames[0]; + earliest_buf_idxes[1] = earliest_buf_idxes[0]; + + min_ref_offset = ref_offset; + earliest_ref_frames[0] = ref_frame; + earliest_buf_idxes[0] = buf_idx; + } else if (second_min_ref_offset == UINT_MAX || + get_relative_dist(cm, ref_offset, second_min_ref_offset) < + 0) { + second_min_ref_offset = ref_offset; + earliest_ref_frames[1] = ref_frame; + earliest_buf_idxes[1] = buf_idx; + } + } + } + } + // Check the coding quality factors of the two earliest reference frames. + RATE_FACTOR_LEVEL ref_rf_level[2]; + double ref_rf_deltas[2]; + for (int i = 0; i < 2; ++i) { + ref_rf_level[i] = cpi->frame_rf_level[earliest_buf_idxes[i]]; + ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]]; + } + (void)ref_rf_level; + (void)ref_rf_deltas; + +#define USE_RF_LEVEL_TO_ENFORCE 1 +#if USE_RF_LEVEL_TO_ENFORCE + // If both earliest two reference frames are coded using the same rate- + // factor, disable the earliest reference frame; Otherwise disable the + // reference frame that uses a lower rate-factor delta. + const MV_REFERENCE_FRAME ref_frame_to_disable = + (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? 
earliest_ref_frames[0] + : earliest_ref_frames[1]; +#else + // Always disable the earliest reference frame + const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0]; +#endif // USE_RF_LEVEL_TO_ENFORCE +#undef USE_RF_LEVEL_TO_ENFORCE + + switch (ref_frame_to_disable) { + case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break; + case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break; + case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break; + case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break; + case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break; + default: break; + } + } +} + +static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) { + const int buf_idx = cm->frame_refs[ref].idx; + if (buf_idx == INVALID_IDX) continue; + + const int ref_offset = + cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + if (get_relative_dist(cm, ref_offset, (int)cm->frame_offset) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_offset[2]) { + ref_offset[0] = ref_offset[1] = 0; + if (!cm->is_skip_mode_allowed) return; + + const int buf_idx_0 = cm->frame_refs[cm->ref_frame_idx_0].idx; + const int buf_idx_1 = cm->frame_refs[cm->ref_frame_idx_1].idx; + assert(buf_idx_0 != INVALID_IDX && buf_idx_1 != INVALID_IDX); + + ref_offset[0] = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; + ref_offset[1] = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + + av1_setup_skip_mode_allowed(cm); + if (!cm->is_skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 
frame. + const int cur_offset = (int)cm->frame_offset; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(cm, cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist(cm, cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0; + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + const int ref_frame[2] = { cm->ref_frame_idx_0 + LAST_FRAME, + cm->ref_frame_idx_1 + LAST_FRAME }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +// Function to decide if we can skip the global motion parameter computation +// for a particular ref frame +static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { + if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) && + cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) { + return get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0; + } + return 0; +} static void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; @@ -5202,16 +4450,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; RD_COUNTS *const rdc = &cpi->td.rd_counts; int i; -#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS - const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME); -#endif // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS - -#if CONFIG_ADAPT_SCAN - av1_deliver_eob_threshold(cm, xd); -#endif - x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size); - x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size); + 
x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); #if CONFIG_DIST_8X8 x->using_dist_8x8 = cpi->oxcf.using_dist_8x8; x->tune_metric = cpi->oxcf.tuning; @@ -5225,23 +4466,29 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(rdc->comp_pred_diff); if (frame_is_intra_only(cm)) { -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - cm->allow_screen_content_tools = - cpi->oxcf.content == AOM_CONTENT_SCREEN || - is_screen_content(cpi->source->y_buffer, -#if CONFIG_HIGHBITDEPTH - cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH - cpi->source->y_stride, cpi->source->y_width, - cpi->source->y_height); -#else - cm->allow_screen_content_tools = 0; -#endif // !CONFIG_PVQ + if (cm->seq_params.force_screen_content_tools == 2) { + cm->allow_screen_content_tools = + cpi->oxcf.content == AOM_CONTENT_SCREEN || + is_screen_content(cpi->source->y_buffer, + cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, + cpi->source->y_stride, cpi->source->y_width, + cpi->source->y_height); + } else { + cm->allow_screen_content_tools = + cm->seq_params.force_screen_content_tools; + } + } + + // Allow intrabc when screen content tools are enabled. + cm->allow_intrabc = cm->allow_screen_content_tools; + // Reset the flag. 
+ cpi->intrabc_used = 0; + // Need to disable intrabc when superres is selected + if (av1_superres_scaled(cm)) { + cm->allow_intrabc = 0; } -#if CONFIG_HASH_ME - if (cpi->oxcf.pass != 1 && cpi->common.allow_screen_content_tools) { + if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) { // add to hash table const int pic_width = cpi->source->y_crop_width; const int pic_height = cpi->source->y_crop_height; @@ -5295,6 +4542,13 @@ static void encode_frame_internal(AV1_COMP *cpi) { &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 64); + av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1], + block_hash_values[0], is_block_same[1], + is_block_same[0]); + av1_add_to_hash_map_by_row_with_precal_data( + &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], + pic_width, pic_height, 128); + for (k = 0; k < 2; k++) { for (j = 0; j < 2; j++) { aom_free(block_hash_values[k][j]); @@ -5305,18 +4559,71 @@ static void encode_frame_internal(AV1_COMP *cpi) { } } } -#endif -#if CONFIG_NCOBMC_ADAPT_WEIGHT - alloc_ncobmc_pred_buffer(xd); -#endif + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = cm->seg.enabled + ? 
av1_get_qindex(&cm->seg, i, cm->base_qindex) + : cm->base_qindex; + xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && + cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 && + cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = 0; + } else { + cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature; + } + } + cm->coded_lossless = is_coded_lossless(cm, xd); + cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm); + + cm->tx_mode = select_tx_mode(cpi); + + // Fix delta q resolution for the moment + cm->delta_q_res = DEFAULT_DELTA_Q_RES; + // Set delta_q_present_flag before it is used for the first time + cm->delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; + cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; + cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI; + // update delta_q_present_flag and delta_lf_present_flag based on base_qindex + cm->delta_q_present_flag &= cm->base_qindex > 0; + cm->delta_lf_present_flag &= cm->base_qindex > 0; + + av1_frame_init_quantizer(cpi); + + av1_initialize_rd_consts(cpi); + av1_initialize_me_consts(cpi, x, cm->base_qindex); + init_encode_frame_mb_context(cpi); + + if (cm->prev_frame) + cm->last_frame_seg_map = cm->prev_frame->seg_map; + else + cm->last_frame_seg_map = NULL; + cm->current_frame_seg_map = cm->cur_frame->seg_map; + if (cm->allow_intrabc || cm->coded_lossless) { + av1_set_default_ref_deltas(cm->lf.ref_deltas); + av1_set_default_mode_deltas(cm->lf.mode_deltas); + } else if (cm->prev_frame) { + memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } + memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + + // Special case: set prev_mi 
to NULL when the previous mode info + // context cannot be used. + cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL; + + x->txb_split_count = 0; + av1_zero(x->blk_skip_drl); -#if CONFIG_GLOBAL_MOTION av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); if (cpi->common.frame_type == INTER_FRAME && cpi->source && !cpi->global_motion_search_done) { - YV12_BUFFER_CONFIG *ref_buf[TOTAL_REFS_PER_FRAME]; + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; int frame; double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)]; const double *params_this_motion; @@ -5327,32 +4634,31 @@ static void encode_frame_internal(AV1_COMP *cpi) { }; int num_refs_using_gm = 0; - for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { ref_buf[frame] = get_ref_frame_buffer(cpi, frame); int pframe; cm->global_motion[frame] = default_warp_params; const WarpedMotionParams *ref_params = - cm->error_resilient_mode ? &default_warp_params - : &cm->prev_frame->global_motion[frame]; + cm->prev_frame ? 
&cm->prev_frame->global_motion[frame] + : &default_warp_params; // check for duplicate buffer - for (pframe = LAST_FRAME; pframe < frame; ++pframe) { + for (pframe = ALTREF_FRAME; pframe > frame; --pframe) { if (ref_buf[frame] == ref_buf[pframe]) break; } - if (pframe < frame) { + if (pframe > frame) { memcpy(&cm->global_motion[frame], &cm->global_motion[pframe], sizeof(WarpedMotionParams)); } else if (ref_buf[frame] && ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && - do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) { + do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) && + !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) { TransformationType model; - const int64_t ref_frame_error = av1_frame_error( -#if CONFIG_HIGHBITDEPTH - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH - ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, - cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, - cpi->source->y_stride); + const int64_t ref_frame_error = + av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, + cpi->source->y_buffer, cpi->source->y_width, + cpi->source->y_height, cpi->source->y_stride); if (ref_frame_error == 0) continue; @@ -5366,10 +4672,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { } compute_global_motion_feature_based( - model, cpi->source, ref_buf[frame], -#if CONFIG_HIGHBITDEPTH - cpi->common.bit_depth, -#endif // CONFIG_HIGHBITDEPTH + model, cpi->source, ref_buf[frame], cpi->common.bit_depth, inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { @@ -5381,9 +4684,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (tmp_wm_params.wmtype != IDENTITY) { const int64_t warp_error = refine_integerized_param( &tmp_wm_params, tmp_wm_params.wmtype, -#if CONFIG_HIGHBITDEPTH xd->cur_buf->flags 
& YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, ref_buf[frame]->y_height, ref_buf[frame]->y_stride, cpi->source->y_buffer, cpi->source->y_width, @@ -5418,7 +4719,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (!is_enough_erroradvantage( (double)best_warp_error / ref_frame_error, gm_get_params_cost(&cm->global_motion[frame], ref_params, - cm->allow_high_precision_mv))) { + cm->allow_high_precision_mv), + cpi->sf.gm_erroradv_type)) { cm->global_motion[frame] = default_warp_params; } if (cm->global_motion[frame].wmtype != IDENTITY) break; @@ -5435,91 +4737,14 @@ static void encode_frame_internal(AV1_COMP *cpi) { cpi->global_motion_search_done = 1; } memcpy(cm->cur_frame->global_motion, cm->global_motion, - TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams)); -#endif // CONFIG_GLOBAL_MOTION - - for (i = 0; i < MAX_SEGMENTS; ++i) { - const int qindex = cm->seg.enabled - ? av1_get_qindex(&cm->seg, i, cm->base_qindex) - : cm->base_qindex; - xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - xd->qindex[i] = qindex; - } - cm->all_lossless = all_lossless(cm, xd); - if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0; - - cm->tx_mode = select_tx_mode(cpi); - - // Fix delta q resolution for the moment - cm->delta_q_res = DEFAULT_DELTA_Q_RES; -// Set delta_q_present_flag before it is used for the first time -#if CONFIG_EXT_DELTA_Q - cm->delta_lf_res = DEFAULT_DELTA_LF_RES; - // update delta_q_present_flag and delta_lf_present_flag based on base_qindex - cm->delta_q_present_flag &= cm->base_qindex > 0; - cm->delta_lf_present_flag &= cm->base_qindex > 0; -#else - cm->delta_q_present_flag = - cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0; -#endif // CONFIG_EXT_DELTA_Q - - av1_frame_init_quantizer(cpi); - - av1_initialize_rd_consts(cpi); - av1_initialize_me_consts(cpi, x, cm->base_qindex); - init_encode_frame_mb_context(cpi); - -#if 
CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - // NOTE(zoeliu): As cm->prev_frame can take neither a frame of - // show_exisiting_frame=1, nor can it take a frame not used as - // a reference, it is probable that by the time it is being - // referred to, the frame buffer it originally points to may - // already get expired and have been reassigned to the current - // newly coded frame. Hence, we need to check whether this is - // the case, and if yes, we have 2 choices: - // (1) Simply disable the use of previous frame mvs; or - // (2) Have cm->prev_frame point to one reference frame buffer, - // e.g. LAST_FRAME. - if (!enc_is_ref_frame_buf(cpi, cm->prev_frame)) { - // Reassign the LAST_FRAME buffer to cm->prev_frame. - cm->prev_frame = last_fb_buf_idx != INVALID_IDX - ? &cm->buffer_pool->frame_bufs[last_fb_buf_idx] - : NULL; - } -#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - -#if CONFIG_TEMPMV_SIGNALING - cm->use_prev_frame_mvs &= frame_can_use_prev_frame_mvs(cm); -#else - if (cm->prev_frame) { - cm->use_prev_frame_mvs = !cm->error_resilient_mode && -#if CONFIG_FRAME_SUPERRES - cm->width == cm->last_width && - cm->height == cm->last_height && -#else - cm->width == cm->prev_frame->buf.y_crop_width && - cm->height == cm->prev_frame->buf.y_crop_height && -#endif // CONFIG_FRAME_SUPERRES - !cm->intra_only && cm->last_show_frame; - } else { - cm->use_prev_frame_mvs = 0; - } -#endif // CONFIG_TEMPMV_SIGNALING + REF_FRAMES * sizeof(WarpedMotionParams)); - // Special case: set prev_mi to NULL when the previous mode info - // context cannot be used. - cm->prev_mi = - cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL; + av1_setup_motion_field(cm); -#if CONFIG_VAR_TX - x->txb_split_count = 0; - av1_zero(x->blk_skip_drl); -#endif + cpi->all_one_sided_refs = + frame_is_intra_only(cm) ? 
0 : av1_refs_are_one_sided(cm); -#if CONFIG_MFMV - av1_setup_motion_field(cm); -#endif // CONFIG_MFMV + cm->skip_mode_flag = check_skip_mode_enabled(cpi); { struct aom_usec_timer emr_timer; @@ -5532,7 +4757,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { } #endif - av1_setup_frame_boundary_info(cm); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + av1_inter_mode_data_init(); +#endif // If allowed, encoding tiles in parallel with one thread handling one tile. // TODO(geza.lore): The multi-threaded encoder is not safe with more than @@ -5543,109 +4770,72 @@ static void encode_frame_internal(AV1_COMP *cpi) { else encode_tiles(cpi); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#if INTER_MODE_RD_TEST + if (cpi->sf.inter_mode_rd_model_estimation) { + av1_inter_mode_data_show(cm); + } +#endif +#endif + aom_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); } -#if CONFIG_NCOBMC_ADAPT_WEIGHT - free_ncobmc_pred_buffer(xd); -#endif - -#if 0 - // Keep record of the total distortion this time around for future use - cpi->last_frame_distortion = cpi->frame_distortion; -#endif -} -static void make_consistent_compound_tools(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_INTERINTRA - if (frame_is_intra_only(cm) || cm->reference_mode == COMPOUND_REFERENCE) - cm->allow_interintra_compound = 0; -#endif // CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_COMPOUND_SINGLEREF - if (frame_is_intra_only(cm)) -#else // !CONFIG_COMPOUND_SINGLEREF - if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) -#endif // CONFIG_COMPOUND_SINGLEREF - cm->allow_masked_compound = 0; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + // If intrabc is allowed but never selected, reset the allow_intrabc flag. 
+ if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0; + if (cm->allow_intrabc) cm->delta_lf_present_flag = 0; } void av1_encode_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; -#if CONFIG_EXT_TX + const int num_planes = av1_num_planes(cm); // Indicates whether or not to use a default reduced set for ext-tx // rather than the potential full set of 16 transforms cm->reduced_tx_set_used = 0; -#endif // CONFIG_EXT_TX -#if CONFIG_ADAPT_SCAN - cm->use_adapt_scan = 1; - // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan - // switches from 1 to 0 - if (cm->use_adapt_scan == 0) av1_init_scan_order(cm); -#endif -#if CONFIG_FRAME_MARKER if (cm->show_frame == 0) { int arf_offset = AOMMIN( (MAX_GF_INTERVAL - 1), cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS int brf_offset = cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif // CONFIG_EXT_REFS cm->frame_offset = cm->current_video_frame + arf_offset; } else { cm->frame_offset = cm->current_video_frame; } - av1_setup_frame_buf_refs(cm); -#if CONFIG_FRAME_SIGN_BIAS - av1_setup_frame_sign_bias(cm); -#endif // CONFIG_FRAME_SIGN_BIAS -#endif // CONFIG_FRAME_MARKER - - // In the longer term the encoder should be generalized to match the - // decoder such that we allow compound where one of the 3 buffers has a - // different sign bias and that buffer is then the fixed ref. However, this - // requires further work in the rd loop. For now the only supported encoder - // side behavior is where the ALT ref buffer has opposite sign bias to - // the other two. 
- if (!frame_is_intra_only(cm)) { -#if !CONFIG_ONE_SIDED_COMPOUND - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { -#endif // !CONFIG_ONE_SIDED_COMPOUND - cpi->allow_comp_inter_inter = 1; -#if CONFIG_EXT_REFS - cm->comp_fwd_ref[0] = LAST_FRAME; - cm->comp_fwd_ref[1] = LAST2_FRAME; - cm->comp_fwd_ref[2] = LAST3_FRAME; - cm->comp_fwd_ref[3] = GOLDEN_FRAME; - cm->comp_bwd_ref[0] = BWDREF_FRAME; - cm->comp_bwd_ref[1] = ALTREF2_FRAME; - cm->comp_bwd_ref[2] = ALTREF_FRAME; -#else // !CONFIG_EXT_REFS - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS -#if !CONFIG_ONE_SIDED_COMPOUND // Normative in encoder + cm->frame_offset %= (1 << (cm->seq_params.order_hint_bits_minus_1 + 1)); + + // Make sure segment_id is no larger than last_active_segid. 
+ if (cm->seg.enabled && cm->seg.update_map) { + const int mi_rows = cm->mi_rows; + const int mi_cols = cm->mi_cols; + const int last_active_segid = cm->seg.last_active_segid; + uint8_t *map = cpi->segmentation_map; + for (int mi_row = 0; mi_row < mi_rows; ++mi_row) { + for (int mi_col = 0; mi_col < mi_cols; ++mi_col) { + map[mi_col] = AOMMIN(map[mi_col], last_active_segid); + } + map += mi_cols; } -#endif // !CONFIG_ONE_SIDED_COMPOUND - } else { - cpi->allow_comp_inter_inter = 0; } + av1_setup_frame_buf_refs(cm); + if (cpi->sf.selective_ref_frame >= 2) enforce_max_ref_frames(cpi); + av1_setup_frame_sign_bias(cm); + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(num_planes); +#else + (void)num_planes; +#endif + + cpi->allow_comp_inter_inter = !frame_is_intra_only(cm); + if (cpi->sf.frame_parameter_update) { int i; RD_OPT *const rd_opt = &cpi->rd; - FRAME_COUNTS *counts = cpi->td.counts; RD_COUNTS *const rdc = &cpi->td.rd_counts; // This code does a single RD pass over the whole frame assuming @@ -5662,39 +4852,20 @@ void av1_encode_frame(AV1_COMP *cpi) { int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; const int is_alt_ref = frame_type == ALTREF_FRAME; -/* prediction (compound, single or hybrid) mode selection */ -#if CONFIG_REF_ADAPT - // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames - if (is_alt_ref || !cpi->allow_comp_inter_inter) - cm->reference_mode = SINGLE_REFERENCE; - else - cm->reference_mode = REFERENCE_MODE_SELECT; -#else -#if CONFIG_BGSPRITE - (void)is_alt_ref; - if (!cpi->allow_comp_inter_inter) -#else + /* prediction (compound, single or hybrid) mode selection */ + // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames if (is_alt_ref || !cpi->allow_comp_inter_inter) -#endif // CONFIG_BGSPRITE - cm->reference_mode = SINGLE_REFERENCE; - else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && - mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] && - 
check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) - cm->reference_mode = COMPOUND_REFERENCE; - else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) cm->reference_mode = SINGLE_REFERENCE; else cm->reference_mode = REFERENCE_MODE_SELECT; -#endif // CONFIG_REF_ADAPT -#if CONFIG_DUAL_FILTER cm->interp_filter = SWITCHABLE; -#endif + if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR; - make_consistent_compound_tools(cm); + cm->switchable_motion_mode = 1; - rdc->single_ref_used_flag = 0; rdc->compound_ref_used_flag = 0; + rdc->skip_mode_used_flag = 0; encode_frame_internal(cpi); @@ -5705,406 +4876,124 @@ void av1_encode_frame(AV1_COMP *cpi) { // Use a flag that includes 4x4 blocks if (rdc->compound_ref_used_flag == 0) { cm->reference_mode = SINGLE_REFERENCE; - av1_zero(counts->comp_inter); -#if !CONFIG_REF_ADAPT - // Use a flag that includes 4x4 blocks - } else if (rdc->single_ref_used_flag == 0) { - cm->reference_mode = COMPOUND_REFERENCE; - av1_zero(counts->comp_inter); -#endif // !CONFIG_REF_ADAPT - } - } - make_consistent_compound_tools(cm); - -#if CONFIG_VAR_TX -#if CONFIG_RECT_TX_EXT - if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0 && - counts->quarter_tx_size[1] == 0) -#else - if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) -#endif - cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64; -#else -#if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX - if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) -#else - if (cm->tx_mode == TX_MODE_SELECT) -#endif - { -#if CONFIG_TX64X64 - int count4x4 = 0; - int count8x8_8x8p = 0, count8x8_lp = 0; - int count16x16_16x16p = 0, count16x16_lp = 0; - int count32x32_32x32p = 0, count32x32_lp = 0; - int count64x64_64x64p = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - int depth; - // counts->tx_size[max_depth][context_idx][this_depth_level] - depth = tx_size_to_depth(TX_4X4); - count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += 
counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_8X8); - count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_16X16); - count16x16_16x16p += - counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_32X32); - count32x32_32x32p += - counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count32x32_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_64X64); - count64x64_64x64p += - counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_64X64][TX_8X8]; - count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_64X64][TX_16X16]; - count32x32_32x32p += counts->tx_size_implied[TX_32X32][TX_32X32]; - count32x32_lp += counts->tx_size_implied[TX_64X64][TX_32X32]; - count64x64_64x64p += 
counts->tx_size_implied[TX_64X64][TX_64X64]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && - count32x32_lp == 0 && count32x32_32x32p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_8X8; - reset_skip_tx_size(cm, TX_8X8); - } else if (count8x8_8x8p == 0 && count8x8_lp == 0 && - count16x16_16x16p == 0 && count16x16_lp == 0 && - count32x32_32x32p == 0 && count32x32_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_8X8] == 0 && - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ONLY_4X4; - reset_skip_tx_size(cm, TX_4X4); - } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && - count32x32_lp == 0) { - cm->tx_mode = ALLOW_64X64; - } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_32X32; - reset_skip_tx_size(cm, TX_32X32); - } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 && - count32x32_32x32p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_16X16; - reset_skip_tx_size(cm, TX_16X16); - } - -#else // CONFIG_TX64X64 - - int count4x4 = 0; - int count8x8_lp = 0, count8x8_8x8p = 0; - int count16x16_16x16p = 0, count16x16_lp = 0; - int count32x32 = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - int depth; - // counts->tx_size[max_depth][context_idx][this_depth_level] - depth = tx_size_to_depth(TX_4X4); - count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_16X16 
- TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_8X8); - count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_16X16); - count16x16_16x16p += - counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_32X32); - count32x32 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; - count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; - count32x32 += counts->tx_size_implied[TX_32X32][TX_32X32]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count32x32 == 0) { - cm->tx_mode = ALLOW_8X8; - reset_skip_tx_size(cm, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && - count8x8_lp == 0 && count16x16_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_8X8] == 0 && - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count32x32 == 0) { - cm->tx_mode = ONLY_4X4; - reset_skip_tx_size(cm, TX_4X4); - } else if 
(count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { - cm->tx_mode = ALLOW_32X32; - } else if (count32x32 == 0 && count8x8_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count4x4 == 0) { - cm->tx_mode = ALLOW_16X16; - reset_skip_tx_size(cm, TX_16X16); - } -#endif // CONFIG_TX64X64 - } -#endif - } else { - make_consistent_compound_tools(cm); - encode_frame_internal(cpi); - } -} - -static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, - const MODE_INFO *mi, const MODE_INFO *above_mi, - const MODE_INFO *left_mi, const int intraonly, - const int mi_row, const int mi_col) { - FRAME_CONTEXT *fc = xd->tile_ctx; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const PREDICTION_MODE y_mode = mbmi->mode; - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - (void)counts; - const BLOCK_SIZE bsize = mbmi->sb_type; - const int unify_bsize = CONFIG_CB4X4; - - if (bsize < BLOCK_8X8 && !unify_bsize) { - int idx, idy; - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_h) - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int bidx = idy * 2 + idx; - const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode; - if (intraonly) { -#if CONFIG_ENTROPY_STATS - const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx); - const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx); - ++counts->kf_y_mode[a][l][bmode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, bidx), bmode, - INTRA_MODES); - } else { #if CONFIG_ENTROPY_STATS - ++counts->y_mode[0][bmode]; + av1_zero(cpi->td.counts->comp_inter); #endif // CONFIG_ENTROPY_STATS - update_cdf(fc->y_mode_cdf[0], bmode, INTRA_MODES); - } } - } else { - if (intraonly) { -#if CONFIG_ENTROPY_STATS - const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0); - const PREDICTION_MODE left = av1_left_block_mode(mi, 
left_mi, 0); - ++counts->kf_y_mode[above][left][y_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, 0), y_mode, - INTRA_MODES); - } else { -#if CONFIG_ENTROPY_STATS - ++counts->y_mode[size_group_lookup[bsize]][y_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); - } - -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) { - const int use_filter_intra_mode = - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]; - ++counts->filter_intra[0][use_filter_intra_mode]; } - if (mbmi->uv_mode == UV_DC_PRED -#if CONFIG_CB4X4 - && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y) -#endif - && mbmi->palette_mode_info.palette_size[1] == 0) { - const int use_filter_intra_mode = - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]; - ++counts->filter_intra[1][use_filter_intra_mode]; + // Re-check on the skip mode status as reference mode may have been changed. 
+ if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) { + cm->is_skip_mode_allowed = 0; + cm->skip_mode_flag = 0; } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - if (av1_is_directional_mode(mbmi->mode, bsize)) { - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter]; + if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0) + cm->skip_mode_flag = 0; + + if (!cm->large_scale_tile) { + if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) + cm->tx_mode = TX_MODE_LARGEST; } -#endif // CONFIG_INTRA_INTERP && CONFIG_INTRA_INTERP + } else { + encode_frame_internal(cpi); } - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) - return; -#else - (void)mi_row; - (void)mi_col; - (void)xd; -#endif -#if CONFIG_ENTROPY_STATS - ++counts->uv_mode[y_mode][uv_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->uv_mode_cdf[y_mode], uv_mode, UV_INTRA_MODES); } -#if CONFIG_VAR_TX static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, - int blk_row, int blk_col) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; - const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); - const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + int blk_row, int blk_col, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, mbmi->sb_type, 
tx_size); - const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; assert(tx_size > TX_4X4); if (depth == MAX_VARTX_DEPTH) { -// Don't add to counts in this case -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size) -#endif - mbmi->tx_size = tx_size; + // Don't add to counts in this case + mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); return; } -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size || - mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) -#else - if (tx_size == plane_tx_size) -#endif - { + if (tx_size == plane_tx_size) { +#if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][0]; -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size) #endif - mbmi->tx_size = tx_size; + if (allow_update_cdf) + update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2); + mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bs = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; +#if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][1]; +#endif + if (allow_update_cdf) + update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2); ++x->txb_split_count; if (sub_txs == TX_4X4) { - mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } - for (i = 0; i < 4; ++i) { - int offsetr = (i >> 1) * bs; - int offsetc = (i & 0x01) * bs; - update_txfm_count(x, xd, counts, 
sub_txs, depth + 1, blk_row + offsetr, - blk_col + offsetc); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = row; + int offsetc = col; + + update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, + blk_col + offsetc, allow_update_cdf); + } } } } static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE plane_bsize, int mi_row, - int mi_col, FRAME_COUNTS *td_counts) { + int mi_col, FRAME_COUNTS *td_counts, + uint8_t allow_update_cdf) { MACROBLOCKD *xd = &x->e_mbd; const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - int init_depth = - (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; - -#if CONFIG_INTRABC - // Intrabc doesn't support var-tx yet. So no need to update tx partition - // info., except for the split count (otherwise common->tx_mode may be - // modified, causing mismatch). 
- if (is_intrabc_block(&x->e_mbd.mi[0]->mbmi)) { - if (x->e_mbd.mi[0]->mbmi.tx_size != max_tx_size) ++x->txb_split_count; - return; - } -#endif // CONFIG_INTRABC - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); + xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) - update_txfm_count(x, xd, td_counts, max_tx_size, init_depth, idy, idx); + update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx, + allow_update_cdf); } static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, int blk_col) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; - const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); - const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; @@ -6114,23 +5003,23 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - if (tx_size == TX_8X8) { - mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; 
txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } - - assert(bsl > 0); - for (i = 0; i < 4; ++i) { - int offsetr = (i >> 1) * bsl; - int offsetc = (i & 0x01) * bsl; - set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc); + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + set_txfm_context(xd, sub_txs, offsetr, offsetc); + } } } } @@ -6140,214 +5029,94 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm, int mi_row, int mi_col) { const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; - TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); + xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) set_txfm_context(xd, max_tx_size, idy, idx); } -#endif - -void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, -#endif - BLOCK_SIZE bsize, TX_SIZE tx_size, 
- FRAME_COUNTS *counts) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int is_inter = is_inter_block(mbmi); - FRAME_CONTEXT *fc = xd->tile_ctx; -#if !CONFIG_ENTROPY_STATS - (void)counts; -#endif // !CONFIG_ENTROPY_STATS - -#if !CONFIG_TXK_SEL - TX_TYPE tx_type = mbmi->tx_type; -#else - (void)blk_row; - (void)blk_col; - // Only y plane's tx_type is updated - if (plane > 0) return; - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, block, tx_size); -#endif -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - cm->base_qindex > 0 && !mbmi->skip && - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - const int eset = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (eset > 0) { -#if !CONFIG_LGT_FROM_PRED - const TxSetType tx_set_type = get_ext_tx_set_type( - tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (is_inter) { - update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], - av1_ext_tx_ind[tx_set_type][tx_type], - av1_num_ext_tx_set[tx_set_type]); -#if CONFIG_ENTROPY_STATS - ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; -#endif // CONFIG_ENTROPY_STATS - } else { -#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf( - fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][mbmi->mode], - av1_ext_tx_ind[tx_set_type][tx_type], - av1_num_ext_tx_set[tx_set_type]); - } -#else - (void)tx_type; - (void)fc; - if (is_inter) { - if (LGT_FROM_PRED_INTER) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - ++counts->inter_lgt[txsize_sqr_map[tx_size]][mbmi->use_lgt]; -#if CONFIG_ENTROPY_STATS - if (!mbmi->use_lgt) - ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; - else -#endif // CONFIG_ENTROPY_STATS - mbmi->tx_type = DCT_DCT; - } else { -#if CONFIG_ENTROPY_STATS - 
++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } else { - if (LGT_FROM_PRED_INTRA) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - ++counts->intra_lgt[txsize_sqr_map[tx_size]][mbmi->mode] - [mbmi->use_lgt]; -#if CONFIG_ENTROPY_STATS - if (!mbmi->use_lgt) - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; - else -#endif // CONFIG_ENTROPY_STATS - mbmi->tx_type = DCT_DCT; - } else { -#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } -#endif // CONFIG_LGT_FROM_PRED - } - } -#else - (void)bsize; - if (tx_size < TX_32X32 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - if (is_inter) { -#if CONFIG_ENTROPY_STATS - ++counts->inter_ext_tx[tx_size][tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->inter_ext_tx_cdf[tx_size], av1_ext_tx_ind[tx_type], - TX_TYPES); - } else { -#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf( - fc->intra_ext_tx_cdf[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]], - av1_ext_tx_ind[tx_type], TX_TYPES); - } - } -#endif // CONFIG_EXT_TX -} -static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate) { +static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rate) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO **mi_8x8 = xd->mi; - 
MODE_INFO *mi = mi_8x8[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO **mi_4x4 = xd->mi; + MB_MODE_INFO *mbmi = mi_4x4[0]; const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; const int is_inter = is_inter_block(mbmi); -#if CONFIG_CB4X4 - const BLOCK_SIZE block_size = bsize; -#else - const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8); -#endif -#if CONFIG_PVQ - x->pvq_speed = 0; - x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0; -#endif + if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && + x->cb_partition_scan) { + for (int row = mi_row; row < mi_row + mi_width; + row += FIRST_PARTITION_PASS_SAMPLE_REGION) { + for (int col = mi_col; col < mi_col + mi_height; + col += FIRST_PARTITION_PASS_SAMPLE_REGION) { + const int index = av1_first_partition_pass_stats_index(row, col); + FIRST_PARTITION_PASS_STATS *const stats = + &x->first_partition_pass_stats[index]; + // Increase the counter of data samples. + ++stats->sample_counts; + // Increase the counter for ref_frame[0] and ref_frame[1]. 
+ if (stats->ref0_counts[mbmi->ref_frame[0]] < 255) + ++stats->ref0_counts[mbmi->ref_frame[0]]; + if (mbmi->ref_frame[1] >= 0 && + stats->ref1_counts[mbmi->ref_frame[0]] < 255) + ++stats->ref1_counts[mbmi->ref_frame[1]]; + } + } + } if (!is_inter) { -#if CONFIG_CFL - xd->cfl->store_y = 1; -#endif // CONFIG_CFL - int plane; + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.store_y = store_cfl_required(cm, xd); mbmi->skip = 1; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1, + for (int plane = 0; plane < num_planes; ++plane) { + av1_encode_intra_block_plane(cpi, x, bsize, plane, + cpi->optimize_seg_arr[mbmi->segment_id], mi_row, mi_col); } -#if CONFIG_CFL - xd->cfl->store_y = 0; -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x, - xd->cfl->subsampling_y) && - !xd->cfl->are_parameters_computed) { - cfl_clear_sub8x8_val(xd->cfl); - } -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG -#endif // CONFIG_CFL - if (!dry_run) { - sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi, - frame_is_intra_only(cm), mi_row, mi_col); - } -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - if (bsize >= BLOCK_8X8) { - for (plane = 0; plane <= 1; ++plane) { + // If there is at least one lossless segment, force the skip for intra + // block to be 0, in order to avoid the segment_id to be changed by in + // write_segment_id(). 
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->has_lossless_segment) + mbmi->skip = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { if (mbmi->palette_mode_info.palette_size[plane] > 0) { - if (!dry_run) - av1_tokenize_color_map(x, plane, 0, t, bsize, mbmi->tx_size, - PALETTE_MAP); - else if (dry_run == DRY_RUN_COSTCOEFFS) - rate += av1_cost_color_map(x, plane, 0, bsize, mbmi->tx_size, - PALETTE_MAP); + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } } } } -#endif // !CONFIG_PVQ -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif -#if CONFIG_LV_MAP - av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); -#else // CONFIG_LV_MAP - av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); -#endif // CONFIG_LV_MAP + av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col, + tile_data->allow_update_cdf); } else { int ref; const int is_compound = has_second_ref(mbmi); @@ -6355,123 +5124,66 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); for (ref = 0; ref < 1 + is_compound; ++ref) { YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); -#if CONFIG_INTRABC - assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); -#else - assert(cfg != NULL); -#endif // !CONFIG_INTRABC - av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); - } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - YV12_BUFFER_CONFIG *cfg = 
get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); -#if CONFIG_INTRABC - assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); -#else - assert(cfg != NULL); -#endif // !CONFIG_INTRABC - av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf); - } -#endif // CONFIG_COMPOUND_SINGLEREF - - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size); - -#if !CONFIG_NCOBMC_ADAPT_WEIGHT -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_NCOBMC - if (dry_run == OUTPUT_ENABLED) - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - else -#endif - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf, num_planes); } -#endif // CONFIG_MOTION_VAR -#else - if (mbmi->motion_mode == OBMC_CAUSAL) { + + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - } else if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT && - dry_run == OUTPUT_ENABLED) { - int p; - for (p = 0; p < MAX_MB_PLANE; ++p) { - get_pred_from_intrpl_buf(xd, mi_row, mi_col, block_size, p); + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset, + plane, pixel_c, pixel_r, pd->width, + pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); } } -#endif - - av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col); -#if CONFIG_VAR_TX - if (mbmi->skip) mbmi->min_tx_size = 
get_min_tx_size(mbmi->tx_size); - av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size, - rate); #else -#if CONFIG_LV_MAP - av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); -#else // CONFIG_LV_MAP - av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); -#endif // CONFIG_LV_MAP + (void)num_planes; #endif - } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (x->using_dist_8x8 && bsize < BLOCK_8X8) { - dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize, - block_size_wide[bsize], block_size_high[bsize], - mi_row, mi_col); + av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run); + av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate, + tile_data->allow_update_cdf); } -#endif if (!dry_run) { -#if CONFIG_VAR_TX - TX_SIZE tx_size = - is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size; -#else - TX_SIZE tx_size = mbmi->tx_size; -#endif + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) + td->intrabc_used_this_tile = 1; if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] && -#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX - mbmi->sb_type > BLOCK_4X4 && -#else - mbmi->sb_type >= BLOCK_8X8 && -#endif - !(is_inter && (mbmi->skip || seg_skip))) { -#if CONFIG_VAR_TX + mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) { if (is_inter) { - tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts); + tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts, + tile_data->allow_update_cdf); } else { - const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? 
inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); - ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; - if (tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; - } -#else - const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); - - ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; #endif - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && - quarter_txsize_lookup[bsize] != max_txsize_rect_lookup[bsize] && - (mbmi->tx_size == quarter_txsize_lookup[bsize] || - mbmi->tx_size == max_txsize_rect_lookup[bsize])) { - ++td->counts - ->quarter_tx_size[mbmi->tx_size == quarter_txsize_lookup[bsize]]; + } } -#endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX - assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); } else { int i, j; TX_SIZE intra_tx_size; @@ -6480,43 +5192,22 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, if 
(xd->lossless[mbmi->segment_id]) { intra_tx_size = TX_4X4; } else { - intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); } } else { -#if CONFIG_EXT_TX && CONFIG_RECT_TX - intra_tx_size = tx_size; -#else - intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + intra_tx_size = mbmi->tx_size; } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - ++td->counts->tx_size_implied[max_txsize_lookup[bsize]] - [txsize_sqr_up_map[tx_size]]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX for (j = 0; j < mi_height; j++) for (i = 0; i < mi_width; i++) if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows) - mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size; + mi_4x4[mis * j + i]->tx_size = intra_tx_size; -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(intra_tx_size); if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; -#endif } - -#if !CONFIG_TXK_SEL - av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts); -#endif } -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && -#if CONFIG_CB4X4 - mbmi->sb_type > BLOCK_4X4 && -#else - mbmi->sb_type >= BLOCK_8X8 && -#endif + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) && is_inter && !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) { if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col); @@ -6527,1137 +5218,20 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, if (xd->lossless[mbmi->segment_id]) { tx_size = TX_4X4; } else { - tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter); + tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); } } else { tx_size = (bsize > BLOCK_4X4) ? 
tx_size : TX_4X4; } mbmi->tx_size = tx_size; - set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd); - } -#endif // CONFIG_VAR_TX -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 - CFL_CTX *const cfl = xd->cfl; -#if CONFIG_DEBUG - if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x, - cfl->subsampling_y) && - !cfl->are_parameters_computed) { - cfl_clear_sub8x8_val(cfl); - } -#endif // CONFIG_DEBUG + set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, + (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); + } + CFL_CTX *const cfl = &xd->cfl; if (is_inter_block(mbmi) && !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x, - cfl->subsampling_y)) { + cfl->subsampling_y) && + is_cfl_allowed(xd)) { cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); } -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 -} - -#if CONFIG_SUPERTX -static int check_intra_b(PICK_MODE_CONTEXT *ctx) { - if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1; - if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1; - return 0; -} - -static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - int i; -#endif -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - -#if !CONFIG_CB4X4 - assert(bsize >= BLOCK_8X8); -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1; - - switch (partition) { - case PARTITION_NONE: return check_intra_b(&pc_tree->none); break; - case PARTITION_VERT: - if (check_intra_b(&pc_tree->vertical[0])) return 1; - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - if (check_intra_b(&pc_tree->vertical[1])) return 1; - } - break; - case PARTITION_HORZ: - if 
(check_intra_b(&pc_tree->horizontal[0])) return 1; - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - if (check_intra_b(&pc_tree->horizontal[1])) return 1; - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - if (check_intra_b(pc_tree->leaf_split[0])) return 1; - } else { - if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize, - pc_tree->split[0])) - return 1; - if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize, - pc_tree->split[1])) - return 1; - if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize, - pc_tree->split[2])) - return 1; - if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3])) - return 1; - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->horizontala[i])) return 1; - } - break; - case PARTITION_HORZ_B: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->horizontalb[i])) return 1; - } - break; - case PARTITION_VERT_A: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->verticala[i])) return 1; - } - break; - case PARTITION_VERT_B: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->verticalb[i])) return 1; - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } - return 0; -} - -static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) { - return ctx->mic.mbmi.tx_size == supertx_size; -} - -static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, - PC_TREE *pc_tree) { - PARTITION_TYPE partition; - BLOCK_SIZE subsize; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - - partition = pc_tree->partitioning; - subsize = get_subsize(bsize, partition); - switch (partition) { - case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none); - case 
PARTITION_VERT: - return check_supertx_b(supertx_size, &pc_tree->vertical[0]); - case PARTITION_HORZ: - return check_supertx_b(supertx_size, &pc_tree->horizontal[0]); - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) - return check_supertx_b(supertx_size, pc_tree->leaf_split[0]); - else - return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]); -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - return check_supertx_b(supertx_size, &pc_tree->horizontala[0]); - case PARTITION_HORZ_B: - return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]); - case PARTITION_VERT_A: - return check_supertx_b(supertx_size, &pc_tree->verticala[0]); - case PARTITION_VERT_B: - return check_supertx_b(supertx_size, &pc_tree->verticalb[0]); -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); return 0; - } -} - -static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int plane, - BLOCK_SIZE bsize_pred, int b_sub8x8, int block) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi_8x8 = xd->mi[0]; - MODE_INFO *mi = mi_8x8; - MB_MODE_INFO *mbmi = &mi->mbmi; - int ref; - const int is_compound = has_second_ref(mbmi); - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - - for (ref = 0; ref < 1 + is_compound; ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred, - &xd->block_refs[ref]->sf); - } - -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = 
xd->block_refs[0]; - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); - av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred, - &xd->block_refs[1]->sf); - } -#endif // CONFIG_COMPOUND_SINGLEREF - - if (!b_sub8x8) - av1_build_inter_predictor_sb_extend(cm, xd, mi_row_ori, mi_col_ori, - mi_row_pred, mi_col_pred, plane, - bsize_pred); - else - av1_build_inter_predictor_sb_sub8x8_extend(cm, xd, mi_row_ori, mi_col_ori, - mi_row_pred, mi_col_pred, plane, - bsize_pred, block); -} - -static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride, - BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, - RUN_TYPE dry_run, int b_sub8x8) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - // (mi_row_top, mi_col_top, bsize_top): region of the top partition size - // block: sub location of sub8x8 blocks - // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8 - // bextend: 1: region to predict is an extension of ori; 0: not - - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - int r = (mi_row_pred - mi_row_top) * MI_SIZE; - int c = (mi_col_pred - mi_col_top) * MI_SIZE; - const int mi_width_top = mi_size_wide[bsize_top]; - const int mi_height_top = mi_size_high[bsize_top]; - - if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top || - mi_row_pred >= mi_row_top + mi_height_top || - mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows || - mi_col_pred >= cm->mi_cols) - return; - - set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori, - mi_col_ori, bsize_pred); - xd->plane[plane].dst.stride = dst_stride; - xd->plane[plane].dst.buf = - dst_buf + (r >> xd->plane[plane].subsampling_y) * 
dst_stride + - (c >> xd->plane[plane].subsampling_x); - - predict_superblock(cpi, td, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred, - plane, bsize_pred, b_sub8x8, block); - - if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8)) - update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1); -} - -static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride, int dir) { - // dir: 0-lower, 1-upper, 2-left, 3-right - // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright - MACROBLOCKD *xd = &td->mb.e_mbd; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int xss = xd->plane[1].subsampling_x; - int yss = xd->plane[1].subsampling_y; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0; - int wide_unit, high_unit; - int i, j; - int ext_offset = 0; - - BLOCK_SIZE extend_bsize; - int mi_row_pred, mi_col_pred; - - if (dir == 0 || dir == 1) { // lower and upper - extend_bsize = - (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss) - ? BLOCK_8X8 - : BLOCK_16X8; - -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row + ((dir == 0) ? 
mi_height : -(mi_height + ext_offset)); - mi_col_pred = mi_col; - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } else if (dir == 2 || dir == 3) { // left and right - extend_bsize = - (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss) - ? BLOCK_8X8 - : BLOCK_8X16; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row; - mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset)); - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } else { - extend_bsize = BLOCK_8X8; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height - : -(mi_height + ext_offset)); - mi_col_pred = - mi_col + ((dir == 6 || dir == 7) ? 
mi_width : -(mi_width + ext_offset)); - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } -} - -static void extend_all(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride) { - assert(block >= 0 && block < 4); - for (int i = 0; i < 8; ++i) { - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row_ori, mi_col_ori, - mi_row, mi_col, mi_row_top, mi_col_top, plane, dst_buf, - dst_stride, i); - } -} - -// This function generates prediction for multiple blocks, between which -// discontinuity around boundary is reduced by smoothing masks. The basic -// smoothing mask is a soft step function along horz/vert direction. In more -// complicated case when a block is split into 4 subblocks, the basic mask is -// first applied to neighboring subblocks (2 pairs) in horizontal direction and -// then applied to the 2 masked prediction mentioned above in vertical direction -// If the block is split into more than one level, at every stage, masked -// prediction is stored in dst_buf[] passed from higher level. 
-static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, int mi_row_top, int mi_col_top, - RUN_TYPE dry_run, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], - int dst_stride[3], PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int hbs = mi_size_wide[bsize] / 2; - const int is_partition_root = bsize >= BLOCK_8X8; - const int ctx = is_partition_root - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) - : -1; - const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - - int i; - uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE 
* len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); - dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); - dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); - dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); - } else { -#endif // CONFIG_HIGHBITDEPTH - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; - dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; - dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; - dst_buf3[0] = tmp_buf3; - dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; - dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - if (!dry_run && ctx >= 0 && bsize < top_bsize) { - // Explicitly cast away const. - FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts; - frame_counts->partition[ctx][partition]++; - } - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - } - - switch (partition) { - case PARTITION_NONE: - assert(bsize < top_bsize); - for (i = 0; i < MAX_MB_PLANE; ++i) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, bsize, dry_run, 0); - extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row, - mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } - break; - case PARTITION_HORZ: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - - // Second half - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, 
mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - } - - // Smooth - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], top_bsize, bsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, bsize, top_bsize, - mi_row + mode_offset_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i]); - } else { -#endif - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], 0); - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - - if (mi_row + hbs < cm->mi_rows) { - // Second half - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, - 
dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i], 1); - // Smooth - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; - case PARTITION_VERT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - - // Second half - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - } - - // Smooth - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = 
need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + mode_offset_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], top_bsize, bsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, - mi_col + mode_offset_col, mi_row, mi_col, mi_row_top, - mi_col_top, i, dst_buf[i], dst_stride[i]); - } else { -#endif - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], 3); - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - - if (mi_col + hbs < cm->mi_cols) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i], 2); - - // smooth - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; 
- case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; i++) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i], top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf3[i], - dst_stride3[i], top_bsize, BLOCK_8X8, dry_run, 1); - - if (bsize < top_bsize) { - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i]); - extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf3[i], - dst_stride3[i]); - } - } -#if CONFIG_CB4X4 - } else if (bsize == BLOCK_8X8) { - for (i = 0; i < MAX_MB_PLANE; i++) { - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = - CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0; - int mode_offset_col = - CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? 
hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, - mi_col + mode_offset_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, BLOCK_8X8, top_bsize, - mi_row + mode_offset_row, mi_col + mode_offset_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } else { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, - dst_buf2[i], dst_stride2[i], top_bsize, subsize, - dry_run, 0); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf3[i], dst_stride3[i], - top_bsize, subsize, dry_run, 0); - - if (bsize < top_bsize) { - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf2[i], dst_stride2[i]); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - extend_all(cpi, td, tile, 
0, subsize, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf3[i], dst_stride3[i]); - } - } - } -#endif - } else { - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf, - dst_stride, pc_tree->split[0]); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf1, - dst_stride1, pc_tree->split[1]); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf2, - dst_stride2, pc_tree->split[2]); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, subsize, - top_bsize, dst_buf3, dst_stride3, - pc_tree->split[3]); - } - for (i = 0; i < MAX_MB_PLANE; i++) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - if (handle_chroma_sub8x8) continue; // Skip <4x4 chroma smoothing -#else - if (bsize == BLOCK_8X8 && i != 0) - continue; // Skip <4x4 chroma smoothing -#endif - - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - if (mi_row + hbs < cm->mi_rows) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, 
top_bsize, - PARTITION_HORZ, i); - } - } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, - top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, 
mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - - break; - case PARTITION_VERT_A: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; - case PARTITION_HORZ_B: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, 
mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, - dst_stride2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - break; - case PARTITION_VERT_B: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, 
mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, - dst_stride2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } - -#if CONFIG_EXT_PARTITION_TYPES - if (bsize < top_bsize) - update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES -} - -static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, - TX_TYPE *best_tx, PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate, - base_rate = *tmp_rate; - int64_t sse, 
pnsse, sse_uv, this_dist, dist_uv; - uint8_t *dst_buf[3]; - int dst_stride[3]; - TX_SIZE tx_size; - MB_MODE_INFO *mbmi; - TX_TYPE tx_type, best_tx_nostx; - int tmp_rate_tx = 0, skip_tx = 0; - int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX; - - set_skip_context(xd, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - dst_buf[plane] = xd->plane[plane].dst.buf; - dst_stride[plane] = xd->plane[plane].dst.stride; - } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize, - bsize, dst_buf, dst_stride, pc_tree); - - set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); - - mbmi = &xd->mi[0]->mbmi; - best_tx_nostx = mbmi->tx_type; - - *best_tx = DCT_DCT; - - // chroma - skippable_uv = 1; - rate_uv = 0; - dist_uv = 0; - sse_uv = 0; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_VAR_TX - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - RD_STATS this_rd_stats; - av1_init_rd_stats(&this_rd_stats); - - tx_size = max_txsize_lookup[bsize]; - tx_size = - uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; - av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl); - - av1_subtract_plane(x, bsize, plane); - av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0, - get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0], - &this_rd_stats); - - this_rate = this_rd_stats.rate; - this_dist = this_rd_stats.dist; - pnsse = this_rd_stats.sse; - pnskip = this_rd_stats.skip; -#else - tx_size = max_txsize_lookup[bsize]; - tx_size = - uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; - av1_subtract_plane(x, bsize, 
plane); - av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip, - &pnsse, INT64_MAX, plane, bsize, tx_size, 0); -#endif // CONFIG_VAR_TX - - rate_uv += this_rate; - dist_uv += this_dist; - sse_uv += pnsse; - skippable_uv &= pnskip; - } - - // luma - tx_size = max_txsize_lookup[bsize]; - av1_subtract_plane(x, bsize, 0); -#if CONFIG_EXT_TX - int ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used); - const TxSetType tx_set_type = - get_ext_tx_set_type(tx_size, bsize, 1, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { -#if CONFIG_VAR_TX - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - const struct macroblockd_plane *const pd = &xd->plane[0]; - RD_STATS this_rd_stats; -#endif // CONFIG_VAR_TX - -#if CONFIG_EXT_TX - if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; -#else - if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue; -#endif // CONFIG_EXT_TX - mbmi->tx_type = tx_type; - -#if CONFIG_VAR_TX - av1_init_rd_stats(&this_rd_stats); - av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl); - av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0], - &this_rd_stats); - - this_rate = this_rd_stats.rate; - this_dist = this_rd_stats.dist; - pnsse = this_rd_stats.sse; - pnskip = this_rd_stats.skip; -#else - av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip, - &pnsse, INT64_MAX, 0, bsize, tx_size, 0); -#endif // CONFIG_VAR_TX - -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) { - if (ext_tx_set > 0) - this_rate += - x->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type]; - } -#else - if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - this_rate != INT_MAX) { - this_rate += x->inter_tx_type_costs[tx_size][mbmi->tx_type]; - } -#endif // CONFIG_EXT_TX - 
*tmp_rate = rate_uv + this_rate; - *tmp_dist = dist_uv + this_dist; - sse = sse_uv + pnsse; - skippable = skippable_uv && pnskip; - if (skippable) { - *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - x->skip = 1; - } else { - if (RDCOST(x->rdmult, *tmp_rate, *tmp_dist) < RDCOST(x->rdmult, 0, sse)) { - *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - x->skip = 0; - } else { - *tmp_dist = sse; - *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - x->skip = 1; - } - } - *tmp_rate += base_rate; - rd_tx = RDCOST(x->rdmult, *tmp_rate, *tmp_dist); - if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) { - *best_tx = tx_type; - bestrd_tx = rd_tx; - tmp_rate_tx = *tmp_rate; - tmp_dist_tx = *tmp_dist; - skip_tx = x->skip; - } - } - *tmp_rate = tmp_rate_tx; - *tmp_dist = tmp_dist_tx; - x->skip = skip_tx; -#if CONFIG_VAR_TX - for (plane = 0; plane < 1; ++plane) - memset(x->blk_skip[plane], x->skip, - sizeof(uint8_t) * pc_tree->none.num_4x4_blk); -#endif // CONFIG_VAR_TX - xd->mi[0]->mbmi.tx_type = best_tx_nostx; } -#endif // CONFIG_SUPERTX diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h index b54e54d25..62141dba4 100644 --- a/third_party/aom/av1/encoder/encodeframe.h +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -20,6 +20,8 @@ extern "C" { #endif +#define DELTAQ_MODULATION 0 // 0: variance based, 1: wavelet AC energy based + struct macroblock; struct yv12_buffer_config; struct AV1_COMP; @@ -27,7 +29,7 @@ struct ThreadData; void av1_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, - int mi_col); + int mi_col, const int num_planes); void av1_encode_frame(struct AV1_COMP *cpi); @@ -35,12 +37,6 @@ void av1_init_tile_data(struct AV1_COMP *cpi); void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd, -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int 
block, int plane, -#endif - BLOCK_SIZE bsize, TX_SIZE tx_size, - FRAME_COUNTS *counts); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c index f35ce8a4f..cea8db6f9 100644 --- a/third_party/aom/av1/encoder/encodemb.c +++ b/third_party/aom/av1/encoder/encodemb.c @@ -9,15 +9,20 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/bitwriter.h" #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" @@ -25,22 +30,10 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encodemb.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/rd.h" -#include "av1/encoder/tokenize.h" - -#if CONFIG_PVQ -#include "av1/encoder/encint.h" -#include "av1/common/partition.h" -#include "av1/encoder/pvq_encoder.h" -#endif - -#if CONFIG_CFL -#include "av1/common/cfl.h" -#endif +#include "av1/encoder/rdopt.h" // Check if one needs to use c version subtraction. 
static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; } @@ -49,31 +42,23 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { -#if !CONFIG_HIGHBITDEPTH - (void)xd; -#endif - if (check_subtract_block_size(rows, cols)) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride, xd->bd); return; } -#endif // CONFIG_HIGHBITDEPTH aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); return; } -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride, xd->bd); return; } -#endif // CONFIG_HIGHBITDEPTH aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); } @@ -101,7 +86,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const MACROBLOCKD *xd = &x->e_mbd; @@ -110,325 +96,26 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -// Shifting negative values is undefined behaviour in C99, -// and could mislead the optimizer, who might assume the shifted is positive. -// This also avoids ubsan warnings. -// In practise, this gets inlined by the optimizer to a single instruction. 
-static INLINE int signed_shift_right(int x, int shift) { - if (x >= 0) - return x >> shift; - else - return -((-x) >> shift); -} - -#if !CONFIG_LV_MAP -// These numbers are empirically obtained. -static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 7 }, { 8, 5 }, -}; - -static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, - int blk_row, int blk_col, int block, - TX_SIZE tx_size, int ctx) { +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int fast_mode, + int *rate_cost) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const PLANE_TYPE plane_type = pd->plane_type; const int eob = p->eobs[block]; - assert(mb->qindex > 0); - assert((!plane_type && !plane) || (plane_type && plane)); - assert(eob <= tx_size_2d[tx_size]); - const int ref = is_inter_block(&xd->mi[0]->mbmi); - const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int16_t *const dequant_ptr = pd->dequant; - const uint8_t *const band_translate = get_band_translate(tx_size); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int16_t *const scan = scan_order->scan; - const int16_t *const nb = scan_order->neighbors; - const int shift = av1_get_tx_scale(tx_size); -#if CONFIG_AOM_QM - int seg_id = xd->mi[0]->mbmi.segment_id; - // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms - const qm_val_t *iqmatrix = - IS_2D_TRANSFORM(tx_type) - ? 
pd->seg_iqmatrix[seg_id][!ref][tx_size] - : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; -#endif -#if CONFIG_NEW_QUANT - int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); - const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; -#endif // CONFIG_NEW_QUANT - int64_t rd_cost0, rd_cost1; - int16_t t0, t1; - int i, final_eob = 0; - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); - int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - mb->token_head_costs[txsize_sqr_map[tx_size]][plane_type][ref]; - int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - mb->token_tail_costs[txsize_sqr_map[tx_size]][plane_type][ref]; - const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; - int64_t rate0, rate1; - int64_t eob_cost0, eob_cost1; - tran_low_t before_best_eob_qc = 0; - tran_low_t before_best_eob_dqc = 0; - - uint8_t token_cache[MAX_TX_SQUARE]; - for (i = 0; i < eob; i++) { - const int rc = scan[i]; - token_cache[rc] = av1_pt_energy_class[av1_get_token(qcoeff[rc])]; - } - - /* Record the r-d cost */ - int64_t accu_rate = 0; - // Initialized to the worst possible error for the largest transform size. - // This ensures that it never goes negative. - int64_t accu_error = ((int64_t)1) << 50; - rate0 = head_token_costs[0][ctx][0]; - int64_t best_block_rd_cost = RDCOST(rdmult, rate0, accu_error); - - // int64_t best_block_rd_cost_all0 = best_block_rd_cost; - const int seg_eob = - av1_get_tx_eob(&cm->seg, xd->mi[0]->mbmi.segment_id, tx_size); - for (i = 0; i < eob; i++) { - const int rc = scan[i]; - const int x = qcoeff[rc]; - const int sz = -(x < 0); - const int band_cur = band_translate[i]; - const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); - const int eob_val = - (i + 1 == eob) ? (i + 1 == seg_eob ? 
LAST_EOB : EARLY_EOB) : NO_EOB; - const int is_first = (i == 0); - - if (x == 0) { - // no need to search when x == 0 - accu_rate += av1_get_coeff_token_cost( - ZERO_TOKEN, eob_val, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - // accu_error does not change when x==0 - } else { - /* Computing distortion - */ - // compute the distortion for the first candidate - // and the distortion for quantizing to 0. - int dx0 = abs(coeff[rc]) * (1 << shift); - dx0 >>= xd->bd - 8; - - const int64_t d0 = (int64_t)dx0 * dx0; - const int x_a = x - 2 * sz - 1; - int dqv; -#if CONFIG_AOM_QM - int iwt; - dqv = dequant_ptr[rc != 0]; - if (iqmatrix != NULL) { - iwt = iqmatrix[rc]; - dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - } -#else - dqv = dequant_ptr[rc != 0]; -#endif + const int segment_id = xd->mi[0]->segment_id; - int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); - dx = signed_shift_right(dx, xd->bd - 8); - const int64_t d2 = (int64_t)dx * dx; - - /* compute the distortion for the second candidate - * x_a = x - 2 * sz + 1; - */ - int64_t d2_a; - if (x_a != 0) { -#if CONFIG_NEW_QUANT - dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) - - (coeff[rc] * (1 << shift)); - dx >>= xd->bd - 8; -#else // CONFIG_NEW_QUANT - dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz; -#endif // CONFIG_NEW_QUANT - d2_a = (int64_t)dx * dx; - } else { - d2_a = d0; - } - // Computing RD cost - int64_t base_bits; - // rate cost of x - base_bits = av1_get_token_cost(x, &t0, cat6_bits); - rate0 = base_bits + - av1_get_coeff_token_cost(t0, eob_val, is_first, - head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - // rate cost of x_a - base_bits = av1_get_token_cost(x_a, &t1, cat6_bits); - if (t1 == ZERO_TOKEN && eob_val) { - rate1 = base_bits; - } else { - rate1 = base_bits + - av1_get_coeff_token_cost(t1, eob_val, is_first, - head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - } 
- - int64_t next_bits0 = 0, next_bits1 = 0; - if (i < eob - 1) { - int ctx_next; - const int band_next = band_translate[i + 1]; - const int token_next = av1_get_token(qcoeff[scan[i + 1]]); - const int eob_val_next = - (i + 2 == eob) ? (i + 2 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - - token_cache[rc] = av1_pt_energy_class[t0]; - ctx_next = get_coef_context(nb, token_cache, i + 1); - next_bits0 = av1_get_coeff_token_cost( - token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next], - tail_token_costs[band_next][ctx_next]); - - token_cache[rc] = av1_pt_energy_class[t1]; - ctx_next = get_coef_context(nb, token_cache, i + 1); - next_bits1 = av1_get_coeff_token_cost( - token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next], - tail_token_costs[band_next][ctx_next]); - } - - rd_cost0 = RDCOST(rdmult, (rate0 + next_bits0), d2); - rd_cost1 = RDCOST(rdmult, (rate1 + next_bits1), d2_a); - const int best_x = (rd_cost1 < rd_cost0); - - const int eob_v = (i + 1 == seg_eob) ? LAST_EOB : EARLY_EOB; - int64_t next_eob_bits0, next_eob_bits1; - int best_eob_x; - next_eob_bits0 = av1_get_coeff_token_cost( - t0, eob_v, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - eob_cost0 = - RDCOST(rdmult, (accu_rate + next_eob_bits0), (accu_error + d2 - d0)); - eob_cost1 = eob_cost0; - if (x_a != 0) { - next_eob_bits1 = av1_get_coeff_token_cost( - t1, eob_v, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - eob_cost1 = RDCOST(rdmult, (accu_rate + next_eob_bits1), - (accu_error + d2_a - d0)); - best_eob_x = (eob_cost1 < eob_cost0); - } else { - best_eob_x = 0; - } - - const int dqc = dqcoeff[rc]; - int dqc_a = 0; - if (best_x || best_eob_x) { - if (x_a != 0) { -#if CONFIG_NEW_QUANT - dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, - dequant_val[band_translate[i]]); - dqc_a = shift ? 
ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a; - if (sz) dqc_a = -dqc_a; -#else - if (x_a < 0) - dqc_a = -((-x_a * dqv) >> shift); - else - dqc_a = (x_a * dqv) >> shift; -#endif // CONFIG_NEW_QUANT - } else { - dqc_a = 0; - } // if (x_a != 0) - } - - // record the better quantized value - if (best_x) { - assert(d2_a <= d0); - qcoeff[rc] = x_a; - dqcoeff[rc] = dqc_a; - accu_rate += rate1; - accu_error += d2_a - d0; - token_cache[rc] = av1_pt_energy_class[t1]; - } else { - assert(d2 <= d0); - accu_rate += rate0; - accu_error += d2 - d0; - token_cache[rc] = av1_pt_energy_class[t0]; - } - assert(accu_error >= 0); - - // determine whether to move the eob position to i+1 - const int use_a = (x_a != 0) && (best_eob_x); - const int64_t best_eob_cost_i = use_a ? eob_cost1 : eob_cost0; - if (best_eob_cost_i < best_block_rd_cost) { - best_block_rd_cost = best_eob_cost_i; - final_eob = i + 1; - if (use_a) { - before_best_eob_qc = x_a; - before_best_eob_dqc = dqc_a; - } else { - before_best_eob_qc = x; - before_best_eob_dqc = dqc; - } - } - } // if (x==0) - } // for (i) - - assert(final_eob <= eob); - if (final_eob > 0) { - assert(before_best_eob_qc != 0); - i = final_eob - 1; - int rc = scan[i]; - qcoeff[rc] = before_best_eob_qc; - dqcoeff[rc] = before_best_eob_dqc; - } - - for (i = final_eob; i < eob; i++) { - int rc = scan[i]; - qcoeff[rc] = 0; - dqcoeff[rc] = 0; + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size); + return eob; } - p->eobs[block] = final_eob; - return final_eob; -} -#endif // !CONFIG_LV_MAP - -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, - int blk_col, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int fast_mode) { - MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblock_plane *const p = &mb->plane[plane]; - const int eob = p->eobs[block]; - assert((mb->qindex == 0) ^ 
(xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); - if (eob == 0) return eob; - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return eob; - -#if CONFIG_PVQ - (void)cm; - (void)tx_size; - (void)a; - (void)l; - return eob; -#endif - -#if !CONFIG_LV_MAP - (void)plane_bsize; - (void)blk_row; - (void)blk_col; (void)fast_mode; -#if CONFIG_VAR_TX - int ctx = get_entropy_context(tx_size, a, l); -#else - int ctx = combine_entropy_contexts(*a, *l); -#endif // CONFIG_VAR_TX - return optimize_b_greedy(cm, mb, plane, blk_row, blk_col, block, tx_size, - ctx); -#else // !CONFIG_LV_MAP - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size, - &txb_ctx, fast_mode); -#endif // !CONFIG_LV_MAP + return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.sharpness); } -#if !CONFIG_PVQ typedef enum QUANT_FUNC { QUANT_FUNC_LOWBD = 0, QUANT_FUNC_HIGHBD = 1, @@ -437,394 +124,231 @@ typedef enum QUANT_FUNC { static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { -#if !CONFIG_NEW_QUANT { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, -#else // !CONFIG_NEW_QUANT - { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade }, - { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade }, - { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade }, -#endif // !CONFIG_NEW_QUANT { NULL, NULL } }; -#endif // !CONFIG_PVQ - -#if !CONFIG_TXMG && !CONFIG_PVQ -typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride, - TxfmParam *txfm_param); -static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm, - av1_highbd_fwd_txfm }; -#endif void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE 
tx_size, int ctx, + TX_SIZE tx_size, TX_TYPE tx_type, AV1_XFORM_QUANT xform_quant_idx) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if !(CONFIG_PVQ || CONFIG_DIST_8X8) + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; -#else - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif - PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - -#if (CONFIG_AOM_QM || CONFIG_NEW_QUANT) && !CONFIG_PVQ - const int is_inter = is_inter_block(mbmi); -#endif + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint16_t *const eob = &p->eobs[block]; const int diff_stride = block_size_wide[plane_bsize]; -#if CONFIG_AOM_QM && !CONFIG_PVQ int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms const qm_val_t *qmatrix = - IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][!is_inter][tx_size] - : cm->gqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; + IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size] + : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; const qm_val_t *iqmatrix = IS_2D_TRANSFORM(tx_type) - ? pd->seg_iqmatrix[seg_id][!is_inter][tx_size] - : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; -#endif + ? 
pd->seg_iqmatrix[seg_id][qm_tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; - TxfmParam txfm_param; - -#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - uint8_t *dst; - const int dst_stride = pd->dst.stride; -#if CONFIG_PVQ || CONFIG_DIST_8X8 - int16_t *pred; - const int txw = tx_size_wide[tx_size]; - const int txh = tx_size_high[tx_size]; - int i, j; -#endif -#endif - -#if !CONFIG_PVQ - const int tx2d_size = tx_size_2d[tx_size]; + const int src_offset = (blk_row * diff_stride + blk_col); + const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]]; QUANT_PARAM qparam; - const int16_t *src_diff; - - src_diff = - &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; qparam.log_scale = av1_get_tx_scale(tx_size); -#if CONFIG_NEW_QUANT qparam.tx_size = tx_size; - qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type); -#endif // CONFIG_NEW_QUANT -#if CONFIG_AOM_QM qparam.qmatrix = qmatrix; qparam.iqmatrix = iqmatrix; -#endif // CONFIG_AOM_QM -#else - tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); - int skip = 1; - PVQ_INFO *pvq_info = NULL; - uint8_t *src; - int16_t *src_int16; - const int src_stride = p->src.stride; - - (void)ctx; - (void)scan_order; - (void)qcoeff; - - if (x->pvq_coded) { - assert(block < MAX_PVQ_BLOCKS_IN_SB); - pvq_info = &x->pvq[block][plane]; - } - src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - src_int16 = - &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - src_int16[diff_stride * j + i] = - CONVERT_TO_SHORTPTR(src)[src_stride * j + i]; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - src_int16[diff_stride * j + i] = src[src_stride * j + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // 
CONFIG_HIGHBITDEPTH -#endif - -#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; -#endif // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || - // CONFIG_MRC_TX - -#if CONFIG_PVQ || CONFIG_DIST_8X8 - if (CONFIG_PVQ -#if CONFIG_DIST_8X8 - || x->using_dist_8x8 -#endif // CONFIG_DIST_8X8 - ) { - pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - -// copy uint8 orig and predicted block to int16 buffer -// in order to use existing VP10 transform functions -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - pred[diff_stride * j + i] = - CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i]; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - pred[diff_stride * j + i] = dst[dst_stride * j + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_PVQ || CONFIG_DIST_8X8 - - (void)ctx; - + TxfmParam txfm_param; txfm_param.tx_type = tx_type; txfm_param.tx_size = tx_size; txfm_param.lossless = xd->lossless[mbmi->segment_id]; -#if CONFIG_MRC_TX || CONFIG_LGT - txfm_param.is_inter = is_inter_block(mbmi); -#endif -#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED - txfm_param.dst = dst; - txfm_param.stride = dst_stride; -#if CONFIG_MRC_TX - txfm_param.valid_mask = &mbmi->valid_mrc_mask; -#if SIGNAL_ANY_MRC_MASK - txfm_param.mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // SIGNAL_ANY_MRC_MASK -#endif // CONFIG_MRC_TX -#if CONFIG_LGT_FROM_PRED - txfm_param.mode = mbmi->mode; - txfm_param.use_lgt = mbmi->use_lgt; -#endif // CONFIG_LGT_FROM_PRED -#endif // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED - -#if !CONFIG_PVQ + txfm_param.tx_set_type = av1_get_ext_tx_set_type( + txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used); + txfm_param.bd = xd->bd; - const int is_hbd = 
get_bitdepth_data_path_index(xd); + txfm_param.is_hbd = get_bitdepth_data_path_index(xd); -#if CONFIG_TXMG - av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param); -#else // CONFIG_TXMG - fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param); -#endif // CONFIG_TXMG + av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param); if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { + const int n_coeffs = av1_get_max_eob(tx_size); if (LIKELY(!x->skip_block)) { - quant_func_list[xform_quant_idx][is_hbd]( - coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam); + quant_func_list[xform_quant_idx][txfm_param.is_hbd]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam); } else { - av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob); + av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); } } -#if CONFIG_LV_MAP - p->txb_entropy_ctx[block] = - (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); -#endif // CONFIG_LV_MAP - return; -#else // CONFIG_PVQ - (void)xform_quant_idx; -#if CONFIG_HIGHBITDEPTH - txfm_param.bd = xd->bd; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param); - av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param); + // NOTE: optimize_b_following is ture means av1_optimze_b will be called + // When the condition of doing optimize_b is changed, + // this flag need update simultaneously + const int optimize_b_following = + (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless); + if (optimize_b_following) { + p->txb_entropy_ctx[block] = + (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); } else { -#endif - av1_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param); - av1_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param); -#if CONFIG_HIGHBITDEPTH - } -#endif - - // PVQ for inter mode block - if (!x->skip_block) { - PVQ_SKIP_TYPE ac_dc_coded = - av1_pvq_encode_helper(x, - coeff, // target original vector - 
ref_coeff, // reference vector - dqcoeff, // de-quantized vector - eob, // End of Block marker - pd->dequant, // aom's quantizers - plane, // image plane - tx_size, // block size in log_2 - 2 - tx_type, - &x->rate, // rate measured - x->pvq_speed, - pvq_info); // PVQ info for a block - skip = ac_dc_coded == PVQ_SKIP; + p->txb_entropy_ctx[block] = 0; } - x->pvq_skip[plane] = skip; - - if (!skip) mbmi->skip = 0; -#endif // #if !CONFIG_PVQ + return; } static void encode_block(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, + int mi_row, int mi_col, RUN_TYPE dry_run) { + (void)mi_row; + (void)mi_col; + (void)dry_run; struct encode_b_args *const args = arg; - AV1_COMMON *cm = args->cm; + const AV1_COMMON *const cm = &args->cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - int ctx; + MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK uint8_t *dst; -#if !CONFIG_PVQ ENTROPY_CONTEXT *a, *l; -#endif -#if CONFIG_VAR_TX - int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; -#endif + int dummy_rate_cost = 0; + + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; dst = &pd->dst .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; -#if !CONFIG_PVQ a = &args->ta[blk_col]; l = &args->tl[blk_row]; -#if CONFIG_VAR_TX - ctx = get_entropy_context(tx_size, a, l); -#else - ctx = combine_entropy_contexts(*a, *l); -#endif -#else - ctx = 0; -#endif // CONFIG_PVQ - -#if CONFIG_VAR_TX // Assert not magic number (uninitialized). 
- assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234); - - if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) -#endif - { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_FP); - } -#if CONFIG_VAR_TX - else { + assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); + + if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) && + !mbmi->skip_mode) { + TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + if (args->enable_optimize_b) { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, + &dummy_rate_cost); + } else { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } + } else { p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; } -#endif - -#if !CONFIG_PVQ - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a, - l, 0); av1_set_txb_context(x, plane, block, tx_size, a, l); - if (p->eobs[block]) *(args->skip) = 0; + if (p->eobs[block]) { + *(args->skip) = 0; - if (p->eobs[block] != 0) -#else - (void)ctx; - if (!x->pvq_skip[plane]) *(args->skip) = 0; + TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->reduced_tx_set_used); + } - if (!x->pvq_skip[plane]) -#endif - { -#if CONFIG_LGT_FROM_PRED - PREDICTION_MODE mode = xd->mi[0]->mbmi.mode; -#endif // CONFIG_LGT_FROM_PRED - TX_TYPE tx_type = - av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size); - av1_inverse_transform_block(xd, dqcoeff, -#if 
CONFIG_LGT_FROM_PRED - mode, + if (p->eobs[block] == 0 && plane == 0) { + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. +#if 0 + if (args->cpi->oxcf.aq_mode == NO_AQ && + args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { + // TODO(jingning,angiebird,huisu@google.com): enable txk_check when + // enable_optimize_b is true to detect potential RD bug. + const uint8_t disable_txk_check = args->enable_optimize_b; + if (!disable_txk_check) { + assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row, + blk_col)] == DCT_DCT); + } + } #endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, pd->dst.stride, - p->eobs[block]); + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, + pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } -#if CONFIG_VAR_TX static void encode_block_inter(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - void *arg) { + void *arg, int mi_row, int mi_col, + RUN_TYPE dry_run) { + (void)mi_row; + (void)mi_col; struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + 
MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + if (!plane) { + assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] && + tx_size_high[tx_size] >= tx_size_high[plane_tx_size]); + } - if (tx_size == plane_tx_size) { - encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + if (tx_size == plane_tx_size || plane) { + encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg, + mi_row, mi_col, dry_run); } else { assert(tx_size < TX_SIZES_ALL); -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; - if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0); -#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); -#endif // This is the square transform block partition entry point. - int bsl = tx_size_wide_unit[sub_txs]; - int i; - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? 
i * tx_size_high_unit[sub_txs] : 0) - : blk_row + ((i >> 1) * bsl); - const int offsetc = - is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + ((i & 0x01) * bsl); -#else - const int offsetr = blk_row + ((i >> 1) * bsl); - const int offsetc = blk_col + ((i & 0x01) * bsl); -#endif - int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + assert(bsw > 0 && bsh > 0); - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; - encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, - arg); - block += step; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, + arg, mi_row, mi_col, dry_run); + block += step; + } } } } -#endif typedef struct encode_block_pass1_args { AV1_COMMON *cm; @@ -843,57 +367,25 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); TxfmParam txfm_param; uint8_t *dst; - int ctx = 0; dst = &pd->dst .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_B); -#if CONFIG_PVQ - if (!x->pvq_skip[plane]) { - int tx_blk_size; - int i, j; - // transform block size in pixels - tx_blk_size = tx_size_wide[tx_size]; - -// Since av1 does not have separate function which does inverse transform -// but av1_inv_txfm_add_*x*() also does addition of predicted image to -// inverse transformed image, -// pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. 
set dst as zeros -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_PVQ + DCT_DCT, AV1_XFORM_QUANT_B); -#if !CONFIG_PVQ - if (p->eobs[block] > 0) -#endif - { + if (p->eobs[block] > 0) { txfm_param.bd = xd->bd; + txfm_param.is_hbd = get_bitdepth_data_path_index(xd); txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; txfm_param.eob = p->eobs[block]; - txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param.tx_set_type = av1_get_ext_tx_set_type( + txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used); + if (txfm_param.is_hbd) { av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } -#endif // CONFIG_HIGHBITDEPTH - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { - av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); - } else { - av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); - } + av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); } } @@ -904,20 +396,28 @@ void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, &args); } -void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RUN_TYPE dry_run) { + (void)dry_run; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - 
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct encode_b_args arg = { cpi, + x, + &ctx, + &mbmi->skip, + NULL, + NULL, + cpi->optimize_seg_arr[mbmi->segment_id] }; int plane; mbmi->skip = 1; if (x->skip) return; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + for (plane = 0; plane < num_planes; ++plane) { const int subsampling_x = xd->plane[plane].subsampling_x; const int subsampling_y = xd->plane[plane].subsampling_y; @@ -925,41 +425,32 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, subsampling_y)) continue; - bsize = scale_chroma_bsize(bsize, subsampling_x, subsampling_y); -#else - (void)mi_row; - (void)mi_col; -#endif + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, subsampling_x, subsampling_y); -#if CONFIG_VAR_TX // TODO(jingning): Clean this up. const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + const int bh = block_size_high[txb_size] >> tx_size_high_log2[0]; int idx, idy; int block = 0; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - 
av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]); -#else - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); -#endif + av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]); + + av1_subtract_plane(x, bsizec, plane); -#if !CONFIG_PVQ - av1_subtract_plane(x, bsize, plane); -#endif arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; -#if CONFIG_VAR_TX - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -976,67 +467,14 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, for (blk_row = idy; blk_row < unit_height; blk_row += bh) { for (blk_col = idx; blk_col < unit_width; blk_col += bw) { encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, - max_tx_size, &arg); + max_tx_size, &arg, mi_row, mi_col, dry_run); block += step; } } } } -#else - av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, - &arg); -#endif - } -} - -#if CONFIG_SUPERTX -void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; - int plane; - - mbmi->skip = 1; - if (x->skip) return; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_VAR_TX - const TX_SIZE tx_size = TX_4X4; -#else - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); -#endif - av1_subtract_plane(x, bsize, plane); - av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.ta = 
ctx.ta[plane]; - arg.tl = ctx.tl[plane]; - av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, - &arg); } } -#endif // CONFIG_SUPERTX - -#if !CONFIG_PVQ -void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - (void)tx_size; - struct macroblock_plane *p = &x->plane[plane]; - -#if !CONFIG_LV_MAP - *a = *l = p->eobs[block] > 0; -#else // !CONFIG_LV_MAP - *a = *l = p->txb_entropy_ctx[block]; -#endif // !CONFIG_LV_MAP - -#if CONFIG_VAR_TX || CONFIG_LV_MAP - int i; - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0]; - - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0]; -#endif -} -#endif static void encode_block_intra_and_set_context(int plane, int block, int blk_row, int blk_col, @@ -1044,260 +482,113 @@ static void encode_block_intra_and_set_context(int plane, int block, TX_SIZE tx_size, void *arg) { av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#if !CONFIG_PVQ + struct encode_b_args *const args = arg; MACROBLOCK *x = args->x; ENTROPY_CONTEXT *a = &args->ta[blk_col]; ENTROPY_CONTEXT *l = &args->tl[blk_row]; av1_set_txb_context(x, plane, block, tx_size, a, l); -#endif } void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; - AV1_COMMON *cm = args->cm; + const AV1_COMMON *const cm = &args->cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, 
xd, blk_row, blk_col, block, tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); uint16_t *eob = &p->eobs[block]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + int dummy_rate_cost = 0; - av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row, - tx_size); - - av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); - const ENTROPY_CONTEXT *a = &args->ta[blk_col]; - const ENTROPY_CONTEXT *l = &args->tl[blk_row]; - int ctx = combine_entropy_contexts(*a, *l); - if (args->enable_optimize_b) { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 0); + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + // Assert not magic number (uninitialized). + assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); + if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) { + *eob = 0; + p->txb_entropy_ctx[block] = 0; } else { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_B); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = &args->tl[blk_row]; + if (args->enable_optimize_b) { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, + &dummy_rate_cost); + } else { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? 
AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } } -#if CONFIG_PVQ - // *(args->skip) == mbmi->skip - if (!x->pvq_skip[plane]) *(args->skip) = 0; + if (*eob) { + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, *eob, cm->reduced_tx_set_used); + } - if (x->pvq_skip[plane]) return; -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, *eob); -#if !CONFIG_PVQ - if (*eob) *(args->skip) = 0; -#else -// Note : *(args->skip) == mbmi->skip + if (*eob == 0 && plane == 0) { + // TODO(jingning): Temporarily disable txk_type check for eob=0 case. + // It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. +#if 0 + if (args->cpi->oxcf.aq_mode == NO_AQ + && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { + assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row, + blk_col)] == DCT_DCT); + } #endif -#if CONFIG_CFL - if (plane == AOM_PLANE_Y && xd->cfl->store_y) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } + + // For intra mode, skipped blocks are so rare that transmitting skip=1 is + // very expensive. 
+ *(args->skip) = 0; + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } -#endif // CONFIG_CFL } -void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b, int mi_row, int mi_col) { const MACROBLOCKD *const xd = &x->e_mbd; - ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 }; - ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 }; struct encode_b_args arg = { - cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b + cpi, x, NULL, &(xd->mi[0]->skip), ta, tl, enable_optimize_b }; -#if CONFIG_CB4X4 if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) return; -#else - (void)mi_row; - (void)mi_col; -#endif if (enable_optimize_b) { const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl); + av1_get_entropy_contexts(bsize, pd, ta, tl); } av1_foreach_transformed_block_in_plane( xd, bsize, plane, encode_block_intra_and_set_context, &arg); } - -#if CONFIG_PVQ -PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, - tran_low_t *ref_coeff, - tran_low_t *const dqcoeff, uint16_t *eob, - const int16_t *quant, int plane, - TX_SIZE tx_size, TX_TYPE tx_type, int *rate, - int speed, PVQ_INFO *pvq_info) { - const int tx_blk_size = tx_size_wide[tx_size]; - daala_enc_ctx *daala_enc = &x->daala_enc; - PVQ_SKIP_TYPE ac_dc_coded; - int coeff_shift = 3 - av1_get_tx_scale(tx_size); - int hbd_downshift = 0; - int rounding_mask; - int pvq_dc_quant; - int use_activity_masking = daala_enc->use_activity_masking; - int tell; - int has_dc_skip = 1; - int i; - int off = od_qm_offset(tx_size, plane ? 
1 : 0); - - DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - - DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - - hbd_downshift = x->e_mbd.bd - 8; - - assert(OD_COEFF_SHIFT >= 4); - // DC quantizer for PVQ - if (use_activity_masking) - pvq_dc_quant = - OD_MAXI(1, - (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) * - daala_enc->state - .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >> - 4); - else - pvq_dc_quant = - OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift); - - *eob = 0; - -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&daala_enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - - // Change coefficient ordering for pvq encoding. 
- od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff, - tx_blk_size); - od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff, - tx_blk_size); - - // copy int16 inputs to int32 - for (i = 0; i < tx_blk_size * tx_blk_size; i++) { - ref_int32[i] = - AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> - hbd_downshift; - in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> - hbd_downshift; - } - - if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */ - out_int32[0] = 0; - } else { - out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant); - } - - ac_dc_coded = od_pvq_encode( - daala_enc, ref_int32, in_int32, out_int32, - OD_MAXI(1, - quant[0] << (OD_COEFF_SHIFT - 3) >> - hbd_downshift), // scale/quantizer - OD_MAXI(1, - quant[1] << (OD_COEFF_SHIFT - 3) >> - hbd_downshift), // scale/quantizer - plane, tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size], - 0, // is_keyframe, - daala_enc->state.qm + off, daala_enc->state.qm_inv + off, - speed, // speed - pvq_info); - - // Encode residue of DC coeff, if required. - if (!has_dc_skip || out_int32[0]) { - generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane], - abs(out_int32[0]) - has_dc_skip, - &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2); - } - if (out_int32[0]) { - aom_write_bit(&daala_enc->w, out_int32[0] < 0); - } - - // need to save quantized residue of DC coeff - // so that final pvq bitstream writing can know whether DC is coded. 
- if (pvq_info) pvq_info->dq_dc_residue = out_int32[0]; - - out_int32[0] = out_int32[0] * pvq_dc_quant; - out_int32[0] += ref_int32[0]; - - // copy int32 result back to int16 - assert(OD_COEFF_SHIFT > coeff_shift); - rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1; - for (i = 0; i < tx_blk_size * tx_blk_size; i++) { - out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift); - dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >> - (OD_COEFF_SHIFT - coeff_shift); - } - - // Back to original coefficient order - od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq, - tx_blk_size); - - *eob = tx_blk_size * tx_blk_size; - -#if !CONFIG_ANS - *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell) - << (AV1_PROB_COST_SHIFT - OD_BITRES); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - assert(*rate >= 0); - - return ac_dc_coded; -} - -void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, - od_coeff *y, int nb_bands, const int *off, - int *size, int skip_rest, int skip_dir, - int bs) { // block size in log_2 -2 - int i; - const int tx_blk_size = tx_size_wide[bs]; - - for (i = 0; i < nb_bands; i++) { - pvq_info->qg[i] = qg[i]; - pvq_info->theta[i] = theta[i]; - pvq_info->k[i] = k[i]; - pvq_info->off[i] = off[i]; - pvq_info->size[i] = size[i]; - } - - memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff)); - - pvq_info->nb_bands = nb_bands; - pvq_info->skip_rest = skip_rest; - pvq_info->skip_dir = skip_dir; - pvq_info->bs = bs; -} -#endif diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index c817a94f0..673f87ea7 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -12,21 +12,23 @@ #ifndef AV1_ENCODER_ENCODEMB_H_ #define AV1_ENCODER_ENCODEMB_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" 
#include "av1/encoder/block.h" - +#include "av1/encoder/tokenize.h" #ifdef __cplusplus extern "C" { #endif struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE]; }; struct encode_b_args { - AV1_COMMON *cm; + const struct AV1_COMP *cpi; MACROBLOCK *x; struct optimize_ctx *ctx; int8_t *skip; @@ -43,52 +45,39 @@ typedef enum AV1_XFORM_QUANT { AV1_XFORM_QUANT_TYPES, } AV1_XFORM_QUANT; -void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col); -#if CONFIG_SUPERTX -void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); -#endif // CONFIG_SUPERTX +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RUN_TYPE dry_run); void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx); + TX_SIZE tx_size, TX_TYPE tx_type, + AV1_XFORM_QUANT xform_quant_idx); -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, - int blk_col, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int fast_mode); +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -#if !CONFIG_PVQ -void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l); -#endif +static INLINE 
void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b, int mi_row, int mi_col); -#if CONFIG_PVQ -PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, - tran_low_t *ref_coeff, - tran_low_t *const dqcoeff, uint16_t *eob, - const int16_t *quant, int plane, - TX_SIZE tx_size, TX_TYPE tx_type, int *rate, - int speed, PVQ_INFO *pvq_info); - -void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, - od_coeff *y, int nb_bands, const int *off, - int *size, int skip_rest, int skip_dir, int bs); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index f8a546999..944e2c53d 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -16,20 +16,9 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" -#include "av1/encoder/subexp.h" #include "aom_dsp/aom_dsp_common.h" -static struct av1_token mv_joint_encodings[MV_JOINTS]; -static struct av1_token mv_class_encodings[MV_CLASSES]; -static struct av1_token mv_fp_encodings[MV_FP_SIZE]; - -void av1_entropy_mv_init(void) { - av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree); - av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree); - av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree); -} - static void encode_mv_component(aom_writer *w, int comp, nmv_component 
*mvcomp, MvSubpelPrecision precision) { int offset; @@ -42,38 +31,23 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, assert(comp != 0); -// Sign -#if CONFIG_NEW_MULTISYMBOL - aom_write_bit(w, sign); -#else - aom_write(w, sign, mvcomp->sign); -#endif + // Sign + aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); // Class - aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES); + aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); // Integer bits if (mv_class == MV_CLASS_0) { -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); -#else - aom_write(w, d, mvcomp->class0[0]); -#endif } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits -#if CONFIG_NEW_MULTISYMBOL for (i = 0; i < n; ++i) - aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[(i + 1) / 2], 2); -#else - for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]); -#endif + aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); } -// Fractional bits -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif // CONFIG_INTRABC || CONFIG_AMVR - { + // Fractional bits + if (precision > MV_SUBPEL_NONE) { aom_write_symbol( w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, @@ -82,13 +56,9 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, // High precision bit if (precision > MV_SUBPEL_LOW_PRECISION) -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol( w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2); -#else - aom_write(w, hp, mv_class == MV_CLASS_0 ? 
mvcomp->class0_hp : mvcomp->hp); -#endif } static void build_nmv_component_cost_table(int *mvcost, @@ -100,24 +70,20 @@ static void build_nmv_component_cost_table(int *mvcost, int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; int class0_hp_cost[2], hp_cost[2]; - sign_cost[0] = av1_cost_zero(mvcomp->sign); - sign_cost[1] = av1_cost_one(mvcomp->sign); - av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree); - av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree); + av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); + av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); + av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); for (i = 0; i < MV_OFFSET_BITS; ++i) { - bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]); - bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]); + av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); } for (i = 0; i < CLASS0_SIZE; ++i) - av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree); - av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree); + av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL); + av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); if (precision > MV_SUBPEL_LOW_PRECISION) { - class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp); - class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp); - hp_cost[0] = av1_cost_zero(mvcomp->hp); - hp_cost[1] = av1_cost_one(mvcomp->hp); + av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); + av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); } mvcost[0] = 0; for (v = 1; v <= MV_MAX; ++v) { @@ -134,10 +100,7 @@ static void build_nmv_component_cost_table(int *mvcost, const int b = c + CLASS0_BITS - 1; /* number of bits */ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; } -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif // CONFIG_INTRABC || CONFIG_AMVR - { + if (precision > MV_SUBPEL_NONE) { if (c == 
MV_CLASS_0) { cost += class0_fp_cost[d][f]; } else { @@ -156,50 +119,14 @@ static void build_nmv_component_cost_table(int *mvcost, } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p, - aom_prob upd_p) { - (void)upd_p; - // Just use the default maximum number of tile groups to avoid passing in the - // actual - // number - av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG); -} - -void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, - nmv_context_counts *const nmv_counts) { - int i; - int nmv_ctx = 0; -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level) { - return; - } -#endif - for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { - nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx]; - nmv_context_counts *const counts = &nmv_counts[nmv_ctx]; - - if (usehp) { - for (i = 0; i < 2; ++i) { - update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, - MV_UPDATE_PROB); - update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); - } - } - } -} -#endif - void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level) { + if (cpi->common.cur_frame_force_integer_mv) { usehp = MV_SUBPEL_NONE; } -#endif - aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); @@ -214,212 +141,81 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, } } -#if CONFIG_INTRABC void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. 
+ assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); - aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); } -#endif // CONFIG_INTRABC void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *ctx, MvSubpelPrecision precision) { - av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree); + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); } -static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, - const int_mv mvs[2], const int_mv pred_mvs[2], - nmv_context_counts *nmv_counts -#if CONFIG_AMVR - , - MvSubpelPrecision precision -#endif - ) { - int i; - PREDICTION_MODE mode = mbmi->mode; - - if (mode == NEWMV || mode == NEW_NEWMV) { - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; - const MV diff = { mvs[i].as_mv.row - ref->row, - mvs[i].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; - (void)pred_mvs; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = 
av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if (ref_frame[1] > INTRA_FRAME) { + if (ref_idx == 0) { + ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + } else { + assert(ref_idx == 1); + ref_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv; } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv; - const MV diff = { mvs[1].as_mv.row - ref->row, - mvs[1].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; - const MV diff = { mvs[0].as_mv.row - ref->row, - mvs[0].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif -#if CONFIG_COMPOUND_SINGLEREF } else { - assert( // mode == SR_NEAREST_NEWMV || - mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV); - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; - (void)pred_mvs; - MV diff; - if (mode == SR_NEW_NEWMV) { - diff.row = mvs[0].as_mv.row - ref->row; - 
diff.col = mvs[0].as_mv.col - ref->col; - av1_inc_mv(&diff, counts, 1); + assert(ref_idx == 0); + if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) { + ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + } else { + ref_mv = mbmi_ext->global_mvs[ref_frame_type]; } - diff.row = mvs[1].as_mv.row - ref->row; - diff.col = mvs[1].as_mv.col - ref->col; - av1_inc_mv(&diff, counts, 1); -#endif // CONFIG_COMPOUND_SINGLEREF } + return ref_mv; } -static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2], - const MB_MODE_INFO_EXT *mbmi_ext, - nmv_context_counts *nmv_counts -#if CONFIG_AMVR - , - MvSubpelPrecision precision -#endif - ) { - int i; - PREDICTION_MODE mode = mi->bmi[block].as_mode; - const MB_MODE_INFO *mbmi = &mi->mbmi; - - if (mode == NEWMV || mode == NEW_NEWMV) { - for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) { - const MV *ref = &mi->bmi[block].ref_mv[i].as_mv; - const MV diff = { mvs[i].as_mv.row - ref->row, - mvs[i].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - const MV *ref = &mi->bmi[block].ref_mv[1].as_mv; - const MV diff = { mvs[1].as_mv.row - ref->row, - mvs[1].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - const MV *ref = &mi->bmi[block].ref_mv[0].as_mv; - const MV diff = { 
mvs[0].as_mv.row - ref->row, - mvs[0].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext); } -void av1_update_mv_count(ThreadData *td) { - const MACROBLOCKD *xd = &td->mb.e_mbd; - const MODE_INFO *mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif -#if CONFIG_AMVR - MvSubpelPrecision precision = 1; - if (xd->cur_frame_mv_precision_level) { - precision = MV_SUBPEL_NONE; - } -#endif - - if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int i = idy * 2 + idx; - - if (have_newmv_in_inter_mode(mi->bmi[i].as_mode)) - -#if CONFIG_AMVR - inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv, - precision); -#else - inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv); -#endif - } - } - } else { - if (have_newmv_in_inter_mode(mbmi->mode)) - -#if CONFIG_AMVR - inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv, - precision); -#else - inc_mvs(mbmi, mbmi_ext, mbmi->mv, 
mbmi->pred_mv, td->counts->mv); -#endif - } +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); } diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h index 8689cec27..64e9e7162 100644 --- a/third_party/aom/av1/encoder/encodemv.h +++ b/third_party/aom/av1/encoder/encodemv.h @@ -18,13 +18,6 @@ extern "C" { #endif -void av1_entropy_mv_init(void); - -#if !CONFIG_NEW_MULTISYMBOL -void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, - nmv_context_counts *const counts); -#endif - void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp); @@ -34,10 +27,18 @@ void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], void av1_update_mv_count(ThreadData *td); -#if CONFIG_INTRABC void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx); -#endif // CONFIG_INTRABC +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index e9ab3c87f..196e18d8a 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ 
b/third_party/aom/av1/encoder/encoder.c @@ -13,12 +13,13 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "av1/common/alloccommon.h" -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif // CONFIG_CDEF #include "av1/common/filter.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" @@ -30,32 +31,17 @@ #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" -#if CONFIG_BGSPRITE -#include "av1/encoder/bgsprite.h" -#endif // CONFIG_BGSPRITE -#if CONFIG_ANS -#include "aom_dsp/buf_ans.h" -#endif #include "av1/encoder/context_tree.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/ethread.h" #include "av1/encoder/firstpass.h" -#if CONFIG_HASH_ME #include "av1/encoder/hash_motion.h" -#endif #include "av1/encoder/mbgraph.h" -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#include "av1/common/ncobmc_kernels.h" -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT #include "av1/encoder/picklpf.h" -#if CONFIG_LOOP_RESTORATION #include "av1/encoder/pickrst.h" -#endif // CONFIG_LOOP_RESTORATION #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" @@ -63,45 +49,41 @@ #include "av1/encoder/speed_features.h" #include "av1/encoder/temporal_filter.h" -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" #include "aom_dsp/psnr.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" #endif +#include "av1/encoder/grain_test_vectors.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" #include "aom_scale/aom_scale.h" -#if CONFIG_BITSTREAM_DEBUG +#if CONFIG_BITSTREAM_DEBUG || 
CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 + +// av1 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL #if CONFIG_ENTROPY_STATS FRAME_COUNTS aggregate_fc; -// Aggregate frame counts per frame context type -FRAME_COUNTS aggregate_fc_per_type[FRAME_CONTEXTS]; #endif // CONFIG_ENTROPY_STATS #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. 
+#define HIGH_PRECISION_MV_QTHRESH 200 // #define OUTPUT_YUV_REC -#ifdef OUTPUT_YUV_DENOISED -FILE *yuv_denoised_file = NULL; -#endif #ifdef OUTPUT_YUV_SKINMAP FILE *yuv_skinmap_file = NULL; #endif @@ -110,20 +92,6 @@ FILE *yuv_rec_file; #define FILE_NAME_LEN 100 #endif -#if 0 -FILE *framepsnr; -FILE *kf_list; -FILE *keyfile; -#endif - -#if CONFIG_CFL -CFL_CTX NULL_CFL; -#endif - -#if CONFIG_INTERNAL_STATS -typedef enum { Y, U, V, ALL } STAT_TYPE; -#endif // CONFIG_INTERNAL_STATS - static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -180,7 +148,6 @@ static void apply_active_map(AV1_COMP *cpi) { if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); -#if CONFIG_LOOPFILTER_LEVEL av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); @@ -194,23 +161,12 @@ static void apply_active_map(AV1_COMP *cpi) { -MAX_LOOP_FILTER); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, -MAX_LOOP_FILTER); -#else - av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); - // Setting the data to -MAX_LOOP_FILTER will result in the computed loop - // filter level being zero regardless of the value of seg->abs_delta. 
- av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF, - -MAX_LOOP_FILTER); -#endif // CONFIG_LOOPFILTER_LEVEL } else { av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); -#if CONFIG_LOOPFILTER_LEVEL av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); -#else - av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL if (seg->enabled) { seg->update_data = 1; seg->update_map = 1; @@ -277,54 +233,45 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, } } -static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv -#if CONFIG_AMVR - , - int cur_frame_mv_precision_level -#endif - ) { +static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { MACROBLOCK *const mb = &cpi->td.mb; - cpi->common.allow_high_precision_mv = allow_high_precision_mv; - -#if CONFIG_AMVR - if (cpi->common.allow_high_precision_mv && - cur_frame_mv_precision_level == 0) { -#else - if (cpi->common.allow_high_precision_mv) { -#endif - int i; - for (i = 0; i < NMV_CONTEXTS; ++i) { - mb->mv_cost_stack[i] = mb->nmvcost_hp[i]; - } - } else { - int i; - for (i = 0; i < NMV_CONTEXTS; ++i) { - mb->mv_cost_stack[i] = mb->nmvcost[i]; - } - } + cpi->common.allow_high_precision_mv = + allow_high_precision_mv && cur_frame_force_integer_mv == 0; + const int copy_hp = + cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0; + int *(*src)[2] = copy_hp ? 
&mb->nmvcost_hp : &mb->nmvcost; + mb->mv_cost_stack = *src; } static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { -#if CONFIG_EXT_PARTITION + const AV1_COMMON *const cm = &cpi->common; + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) return BLOCK_64X64; - - if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) - return BLOCK_128X128; +#if CONFIG_FILEOPTIONS + if (cm->options && cm->options->ext_partition) +#endif + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) + return BLOCK_128X128; assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); - assert(IMPLIES(cpi->common.tile_cols > 1, - cpi->common.tile_width % MAX_MIB_SIZE == 0)); - assert(IMPLIES(cpi->common.tile_rows > 1, - cpi->common.tile_height % MAX_MIB_SIZE == 0)); +// TODO(any): Possibly could improve this with a heuristic. +#if CONFIG_FILEOPTIONS + if (cm->options && !cm->options->ext_partition) return BLOCK_64X64; +#endif + + // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. Also, this heuristic gives + // compression gain for speed >= 2 only. + if (cpi->oxcf.superres_mode == SUPERRES_NONE && + cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) { + return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128 + : BLOCK_64X64; + } - // TODO(any): Possibly could improve this with a heuristic. return BLOCK_128X128; -#else - (void)cpi; - return BLOCK_64X64; -#endif // CONFIG_EXT_PARTITION } static void setup_frame(AV1_COMP *cpi) { @@ -334,96 +281,82 @@ static void setup_frame(AV1_COMP *cpi) { // frames where the error_resilient_mode or intra_only flag is set. For // other inter-frames the encoder currently uses only two contexts; // context 1 for ALTREF frames and context 0 for the others. 
- if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + + cm->primary_ref_frame = PRIMARY_REF_NONE; + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cm->force_primary_ref_none) { av1_setup_past_independence(cm); + for (int i = 0; i < REF_FRAMES; i++) { + cm->fb_of_context_type[i] = -1; + } + cm->fb_of_context_type[REGULAR_FRAME] = + get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + cm->frame_context_idx = REGULAR_FRAME; } else { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING -// Just use frame context from first signaled reference frame. -// This will always be LAST_FRAME for now. -#else -#if CONFIG_EXT_REFS const GF_GROUP *gf_group = &cpi->twopass.gf_group; if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) cm->frame_context_idx = EXT_ARF_FRAME; else if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; -#else // !CONFIG_EXT_REFS - if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; -#endif // CONFIG_EXT_REFS else if (cpi->rc.is_src_frame_alt_ref) cm->frame_context_idx = OVERLAY_FRAME; else if (cpi->refresh_golden_frame) cm->frame_context_idx = GLD_FRAME; -#if CONFIG_EXT_REFS else if (cpi->refresh_bwd_ref_frame) cm->frame_context_idx = BRF_FRAME; -#endif // CONFIG_EXT_REFS else cm->frame_context_idx = REGULAR_FRAME; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx]; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + int fb = get_ref_frame_map_idx(cpi, ref_frame); + if (fb == wanted_fb) { + cm->primary_ref_frame = ref_frame - LAST_FRAME; + } + } } if (cm->frame_type == KEY_FRAME) { cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; av1_zero(cpi->interp_filter_selected); - set_sb_size(cm, select_sb_size(cpi)); -#if CONFIG_REFERENCE_BUFFER + set_sb_size(&cm->seq_params, select_sb_size(cpi)); set_use_reference_buffer(cm, 0); -#endif // CONFIG_REFERENCE_BUFFER + } else if (frame_is_sframe(cm)) { + cpi->refresh_golden_frame = 
1; + cpi->refresh_alt_ref_frame = 1; + av1_zero(cpi->interp_filter_selected); + set_sb_size(&cm->seq_params, select_sb_size(cpi)); } else { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (frame_is_intra_only(cm) || cm->error_resilient_mode || - cm->frame_refs[0].idx < 0) { - *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; + if (cm->primary_ref_frame == PRIMARY_REF_NONE || + cm->frame_refs[cm->primary_ref_frame].idx < 0) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; } else { - *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx]; + *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx]; } -#else - *cm->fc = cm->frame_contexts[cm->frame_context_idx]; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING av1_zero(cpi->interp_filter_selected[0]); } -#if CONFIG_EXT_REFS -#if CONFIG_ONE_SIDED_COMPOUND && \ - !CONFIG_EXT_COMP_REFS // No change to bitstream - if (cpi->sf.recode_loop == DISALLOW_RECODE) { - cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame; - cpi->rc.is_bipred_frame = 1; - } -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (frame_is_intra_only(cm) || cm->error_resilient_mode || - cm->frame_refs[0].idx < 0) { - // use default frame context values - cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; - } else { - *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx]; - cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx]; - } -#else - cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx]; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + cm->prev_frame = get_prev_frame(cm); cpi->vaq_refresh = 0; } static void enc_setup_mi(AV1_COMMON *cm) { int i; - cm->mi = cm->mip + cm->mi_stride + 1; - memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); - cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi = cm->mip; + memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip)); + cm->prev_mi = 
cm->prev_mip; // Clear top border row memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride); // Clear left border column - for (i = 1; i < cm->mi_rows + 1; ++i) + for (i = 0; i < cm->mi_rows; ++i) memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip)); - cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base; memset(cm->mi_grid_base, 0, - cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); + cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base)); } static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) { @@ -433,10 +366,11 @@ static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -456,19 +390,19 @@ static void enc_free_mi(AV1_COMMON *cm) { static void swap_mi_and_prev_mi(AV1_COMMON *cm) { // Current mip will be the prev_mip for the next frame. - MODE_INFO **temp_base = cm->prev_mi_grid_base; - MODE_INFO *temp = cm->prev_mip; + MB_MODE_INFO **temp_base = cm->prev_mi_grid_base; + MB_MODE_INFO *temp = cm->prev_mip; cm->prev_mip = cm->mip; cm->mip = temp; // Update the upper left visible macroblock ptrs. 
- cm->mi = cm->mip + cm->mi_stride + 1; - cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi = cm->mip; + cm->prev_mi = cm->prev_mip; cm->prev_mi_grid_base = cm->mi_grid_base; cm->mi_grid_base = temp_base; - cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base; } void av1_initialize_enc(void) { @@ -480,11 +414,7 @@ void av1_initialize_enc(void) { aom_scale_rtcd(); av1_init_intra_predictors(); av1_init_me_luts(); -#if !CONFIG_XIPHRC av1_rc_init_minq_luts(); -#endif - av1_entropy_mv_init(); - av1_encode_token_init(); av1_init_wedge_masks(); init_done = 1; } @@ -506,25 +436,47 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) { aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } -static void dealloc_compressor_data(AV1_COMP *cpi) { +static void update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; + cpi->oxcf = *oxcf; - dealloc_context_buffers_ext(cpi); + if (cm->film_grain_table) { + aom_film_grain_table_free(cm->film_grain_table); + aom_free(cm->film_grain_table); + } + cm->film_grain_table = 0; + + if (oxcf->film_grain_test_vector) { + cm->film_grain_params_present = 1; + if (cm->frame_type == KEY_FRAME) { + memcpy(&cm->film_grain_params, + film_grain_test_vectors + oxcf->film_grain_test_vector - 1, + sizeof(cm->film_grain_params)); -#if CONFIG_PVQ - if (cpi->oxcf.pass != 1) { - const int tile_cols = cm->tile_cols; - const int tile_rows = cm->tile_rows; - int tile_col, tile_row; - - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - aom_free(tile_data->pvq_q.buf); + cm->film_grain_params.bit_depth = cm->bit_depth; + if (cm->color_range == AOM_CR_FULL_RANGE) { + 
cm->film_grain_params.clip_to_restricted_range = 0; } + } + } else if (oxcf->film_grain_table_filename) { + cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table)); + memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t)); + + aom_film_grain_table_read(cm->film_grain_table, + oxcf->film_grain_table_filename, &cm->error); + } else { + cm->film_grain_params_present = 0; + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } -#endif +} + +static void dealloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + dealloc_context_buffers_ext(cpi); + aom_free(cpi->tile_data); cpi->tile_data = NULL; @@ -538,7 +490,6 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->active_map.map); cpi->active_map.map = NULL; -#if CONFIG_MOTION_VAR aom_free(cpi->td.mb.above_pred_buf); cpi->td.mb.above_pred_buf = NULL; @@ -550,26 +501,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->td.mb.mask_buf); cpi->td.mb.mask_buf = NULL; -#endif + + aom_free(cm->tpl_mvs); + cm->tpl_mvs = NULL; av1_free_ref_frame_buffers(cm->buffer_pool); -#if CONFIG_LV_MAP av1_free_txb_buf(cpi); -#endif av1_free_context_buffers(cm); aom_free_frame_buffer(&cpi->last_frame_uf); -#if CONFIG_LOOP_RESTORATION av1_free_restoration_buffers(cm); - aom_free_frame_buffer(&cpi->last_frame_db); aom_free_frame_buffer(&cpi->trial_frame_rst); - aom_free(cpi->extra_rstbuf); - { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - av1_free_restoration_struct(&cpi->rst_search[i]); - } -#endif // CONFIG_LOOP_RESTORATION aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); aom_free_frame_buffer(&cpi->alt_ref_buffer); @@ -578,32 +520,22 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); aom_free(cpi->td.mb.palette_buffer); - -#if 
CONFIG_ANS - aom_buf_ans_free(&cpi->buf_ans); -#endif // CONFIG_ANS } static void save_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; - int i; // Stores a snapshot of key state variables which can subsequently be // restored with a call to av1_restore_coding_context. These functions are // intended for use in a re-code loop in av1_compress_frame where the // quantizer value is adjusted between loop iterations. - for (i = 0; i < NMV_CONTEXTS; ++i) { - av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]); - av1_copy(cc->nmv_costs, cpi->nmv_costs); - av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); - } - - av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); - av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); + av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost); + av1_copy(cc->nmv_costs, cpi->nmv_costs); + av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); cc->fc = *cm->fc; } @@ -611,18 +543,12 @@ static void save_coding_context(AV1_COMP *cpi) { static void restore_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; - int i; // Restore key state variables to the snapshot state stored in the // previous call to av1_save_coding_context. 
- for (i = 0; i < NMV_CONTEXTS; ++i) { - av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]); - av1_copy(cpi->nmv_costs, cc->nmv_costs); - av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); - } - - av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); - av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); + av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost); + av1_copy(cpi->nmv_costs, cc->nmv_costs); + av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); *cm->fc = cc->fc; } @@ -673,7 +599,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) { qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); -#if CONFIG_LOOPFILTER_LEVEL av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); @@ -683,15 +608,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) { av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); -#else - av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); - - // Where relevant assume segment data is delta data - seg->abs_delta = SEGMENT_DELTADATA; } } else if (seg->enabled) { // All other frames if segmentation has been enabled @@ -702,14 +620,12 @@ static void configure_static_seg_features(AV1_COMP *cpi) { if (rc->source_alt_ref_active) { seg->update_map = 0; seg->update_data = 1; - seg->abs_delta = SEGMENT_DELTADATA; qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); -#if CONFIG_LOOPFILTER_LEVEL av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); av1_set_segdata(seg, 1, 
SEG_LVL_ALT_LF_U, -2); @@ -719,10 +635,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) { av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); -#else - av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { @@ -777,16 +689,16 @@ static void configure_static_seg_features(AV1_COMP *cpi) { static void update_reference_segmentation_map(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->last_frame_seg_map; + MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->current_frame_seg_map; int row, col; for (row = 0; row < cm->mi_rows; row++) { - MODE_INFO **mi_8x8 = mi_8x8_ptr; + MB_MODE_INFO **mi_4x4 = mi_4x4_ptr; uint8_t *cache = cache_ptr; - for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) - cache[0] = mi_8x8[0]->mbmi.segment_id; - mi_8x8_ptr += cm->mi_stride; + for (col = 0; col < cm->mi_cols; col++, mi_4x4++, cache++) + cache[0] = mi_4x4[0]->segment_id; + mi_4x4_ptr += cm->mi_stride; cache_ptr += cm->mi_cols; } } @@ -796,12 +708,9 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; if (!cpi->lookahead) - cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - oxcf->lag_in_frames); + cpi->lookahead = av1_lookahead_init( + oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, oxcf->lag_in_frames); if (!cpi->lookahead) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); @@ -809,11 +718,8 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { // TODO(agrange) Check if ARF is 
enabled and skip allocation if not. if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -822,84 +728,49 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); -#if CONFIG_LOOP_RESTORATION - if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate last frame deblocked buffer"); if (aom_realloc_frame_buffer( - &cpi->trial_frame_rst, -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, cm->superres_upscaled_height, -#else - cm->width, cm->height, -#endif // CONFIG_FRAME_SUPERRES - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + &cpi->trial_frame_rst, cm->superres_upscaled_width, + cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to 
allocate trial restored frame buffer"); - int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE; - if (extra_rstbuf_sz > 0) { - aom_free(cpi->extra_rstbuf); - CHECK_MEM_ERROR(cm, cpi->extra_rstbuf, - (uint8_t *)aom_malloc(extra_rstbuf_sz)); - } else { - cpi->extra_rstbuf = NULL; - } -#endif // CONFIG_LOOP_RESTORATION if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } static void alloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); av1_alloc_context_buffers(cm, cm->width, cm->height); -#if CONFIG_LV_MAP av1_alloc_txb_buf(cpi); -#endif alloc_context_buffers_ext(cpi); aom_free(cpi->tile_tok[0][0]); { - unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + unsigned int tokens = + get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes); CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } @@ -909,18 +780,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) { void av1_new_framerate(AV1_COMP *cpi, double framerate) { cpi->framerate = framerate < 0.1 ? 
30 : framerate; -#if CONFIG_XIPHRC - if (!cpi->od_rc.cur_frame) return; - cpi->od_rc.framerate = cpi->framerate; - od_enc_rc_resize(&cpi->od_rc); -#else av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); -#endif } -#if CONFIG_MAX_TILE - -static void set_tile_info_max_tile(AV1_COMP *cpi) { +static void set_tile_info(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int i, start_sb; @@ -932,15 +795,15 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols); cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols); } else { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2; + int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; int size_sb, j = 0; cm->uniform_tile_spacing_flag = 0; for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { cm->tile_col_start_sb[i] = start_sb; size_sb = cpi->oxcf.tile_widths[j++]; if (j >= cpi->oxcf.tile_width_count) j = 0; - start_sb += AOMMIN(size_sb, MAX_TILE_WIDTH_SB); + start_sb += AOMMIN(size_sb, cm->max_tile_width_sb); } cm->tile_cols = i; cm->tile_col_start_sb[i] = sb_cols; @@ -952,8 +815,8 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows); cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows); } else { - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; int size_sb, j = 0; for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { cm->tile_row_start_sb[i] = start_sb; @@ -967,158 +830,174 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { 
av1_calculate_tile_rows(cm); } -#endif - -static void set_tile_info(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; -#if CONFIG_DEPENDENT_HORZTILES - int tile_row, tile_col, num_tiles_in_tg; - int tg_row_start, tg_col_start; -#endif -#if CONFIG_EXT_TILE - if (cpi->oxcf.large_scale_tile) { -#if CONFIG_EXT_PARTITION - if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; - } else { - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; - cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1; - } -#else - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; -#endif // CONFIG_EXT_PARTITION - - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); - - assert(cm->tile_width >> MAX_MIB_SIZE <= 32); - assert(cm->tile_height >> MAX_MIB_SIZE <= 32); - - // Get the number of tiles - cm->tile_cols = 1; - while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; - - cm->tile_rows = 1; - while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; - } else { -#endif // CONFIG_EXT_TILE - -#if CONFIG_MAX_TILE - set_tile_info_max_tile(cpi); -#else - int min_log2_tile_cols, max_log2_tile_cols; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - - cm->tile_width = - get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols); - cm->tile_height = - get_tile_size(cm->mi_rows, cm->log2_tile_rows, 
&cm->tile_rows); -#endif // CONFIG_MAX_TILE -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles; -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - // May not needed since cpi->oxcf.dependent_horz_tiles is already adjusted. - cm->dependent_horz_tiles = 0; - } else { -#endif // CONFIG_EXT_TILE - if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_TILE - if (!cm->large_scale_tile) { -#endif // CONFIG_EXT_TILE - if (cpi->oxcf.mtu == 0) { - cm->num_tg = cpi->oxcf.num_tile_groups; - } else { - // Use a default value for the purposes of weighting costs in probability - // updates - cm->num_tg = DEFAULT_MAX_NUM_TG; - } - num_tiles_in_tg = - (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; - tg_row_start = 0; - tg_col_start = 0; - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { - tg_row_start = tile_row; - tg_col_start = tile_col; - } - cm->tile_group_start_row[tile_row][tile_col] = tg_row_start; - cm->tile_group_start_col[tile_row][tile_col] = tg_col_start; - } - } -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE -#endif - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - cm->loop_filter_across_tiles_enabled = - cpi->oxcf.loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES -} - static void update_frame_size(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; av1_set_mb_mi(cm, cm->width, cm->height); av1_init_context_buffers(cm); - av1_init_macroblockd(cm, xd, -#if CONFIG_PVQ - NULL, -#endif -#if CONFIG_CFL - &NULL_CFL, -#endif - NULL); + av1_init_macroblockd(cm, xd, NULL); memset(cpi->mbmi_ext_base, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_info(cpi); } 
static void init_buffer_indices(AV1_COMP *cpi) { -#if CONFIG_EXT_REFS int fb_idx; - for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx) - cpi->lst_fb_idxes[fb_idx] = fb_idx; - cpi->gld_fb_idx = LAST_REF_FRAMES; - cpi->bwd_fb_idx = LAST_REF_FRAMES + 1; - cpi->alt2_fb_idx = LAST_REF_FRAMES + 2; - cpi->alt_fb_idx = LAST_REF_FRAMES + 3; - cpi->ext_fb_idx = LAST_REF_FRAMES + 4; + for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + cpi->ref_fb_idx[fb_idx] = fb_idx; for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx) cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx; -#else // !CONFIG_EXT_REFS - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; -#endif // CONFIG_EXT_REFS -#if CONFIG_AMVR cpi->rate_index = 0; cpi->rate_size = 0; cpi->cur_poc = -1; -#endif +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. 
+ (void)oxcf; + BitstreamLevel bl = { 9, 3 }; + if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512, + 288, 30.0, 4)) { + bl.major = 2; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 704, 396, 30.0, 4)) { + bl.major = 2; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1088, 612, 30.0, 4)) { + bl.major = 3; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1376, 774, 30.0, 4)) { + bl.major = 3; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 30.0, 3)) { + bl.major = 4; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 60.0, 3)) { + bl.major = 4; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 30.0, 2)) { + bl.major = 5; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 60.0, 2)) { + bl.major = 5; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 120.0, 2)) { + bl.major = 5; + bl.minor = 2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 30.0, 2)) { + bl.major = 6; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 60.0, 2)) { + bl.major = 6; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 120.0, 2)) { + bl.major = 6; + bl.minor = 2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 16384, 8704, 30.0, 2)) { + bl.major = 7; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 16384, 8704, 60.0, 2)) { + bl.major = 7; + bl.minor = 1; + } else if (does_level_match(oxcf->width, 
oxcf->height, oxcf->init_framerate, + 16384, 8704, 120.0, 2)) { + bl.major = 7; + bl.minor = 2; + } + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + seq->level[i] = bl; + seq->tier[i] = 0; // setting main tier by default + // Set the maximum parameters for bitrate and buffer size for this profile, + // level, and tier + cm->op_params[i].bitrate = max_level_bitrate( + cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the + // check + if (cm->op_params[i].bitrate == 0) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + cm->op_params[i].buffer_size = cm->op_params[i].bitrate; + } +} + +static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + seq->still_picture = (oxcf->limit == 1); + seq->reduced_still_picture_hdr = seq->still_picture; + seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + seq->enable_order_hint = oxcf->enable_order_hint; + seq->frame_id_numbers_present_flag = oxcf->large_scale_tile; + if (seq->still_picture && seq->reduced_still_picture_hdr) { + seq->enable_order_hint = 0; + seq->frame_id_numbers_present_flag = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_bits_minus_1 = + seq->enable_order_hint ? 
DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1; + + seq->enable_dual_filter = oxcf->enable_dual_filter; + seq->enable_jnt_comp = oxcf->enable_jnt_comp; + seq->enable_jnt_comp &= seq->enable_order_hint; + seq->enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs; + seq->enable_ref_frame_mvs &= seq->enable_order_hint; + seq->enable_superres = oxcf->enable_superres; + seq->enable_cdef = oxcf->enable_cdef; + seq->enable_restoration = oxcf->enable_restoration; + seq->enable_warped_motion = oxcf->enable_warped_motion; + seq->enable_interintra_compound = 1; + seq->enable_masked_compound = 1; + seq->enable_intra_edge_filter = 1; + seq->enable_filter_intra = 1; + + set_bitstream_level_tier(seq, cm, oxcf); + + if (seq->operating_points_cnt_minus_1 == 0) { + seq->operating_point_idc[0] = 0; + } else { + // Set operating_point_idc[] such that for the i-th operating point the + // first (operating_points_cnt-i) spatial layers and the first temporal + // layer are decoded Note that highest quality operating point should come + // first + for (int i = 0; i < seq->operating_points_cnt_minus_1 + 1; i++) + seq->operating_point_idc[i] = + (~(~0u << (seq->operating_points_cnt_minus_1 + 1 - i)) << 8) | 1; + } } static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { @@ -1129,22 +1008,53 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; -#if CONFIG_HIGHBITDEPTH cm->use_highbitdepth = oxcf->use_highbitdepth; -#endif - cm->color_space = oxcf->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->transfer_function = oxcf->transfer_function; + cm->color_primaries = oxcf->color_primaries; + cm->transfer_characteristics = oxcf->transfer_characteristics; + cm->matrix_coefficients = oxcf->matrix_coefficients; + cm->seq_params.monochrome = oxcf->monochrome; cm->chroma_sample_position = oxcf->chroma_sample_position; -#endif cm->color_range = oxcf->color_range; + cm->timing_info_present = 
oxcf->timing_info_present; + cm->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + cm->timing_info.time_scale = oxcf->timing_info.time_scale; + cm->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + cm->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + cm->seq_params.display_model_info_present_flag = + oxcf->display_model_info_present_flag; + cm->seq_params.decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + cm->buffer_model.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_delay_present = 1; + set_aom_dec_model_info(&cm->buffer_model); + set_dec_model_op_parameters(&cm->op_params[0]); + } else if (cm->timing_info_present && + cm->timing_info.equal_picture_interval && + !cm->seq_params.decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + set_resource_availability_parameters(&cm->op_params[0]); + } else { + cm->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } cm->width = oxcf->width; cm->height = oxcf->height; + set_sb_size(&cm->seq_params, + select_sb_size(cpi)); // set sb size before allocations alloc_compressor_data(cpi); + update_film_grain_parameters(cpi, oxcf); + // Single thread case: use counts in common. - cpi->td.counts = &cm->counts; + cpi->td.counts = &cpi->counts; // change includes all joint functionality av1_change_config(cpi, oxcf); @@ -1173,16 +1083,15 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, (maximum == 0) ? 
bandwidth / 8 : maximum * bandwidth / 1000; } -#if CONFIG_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1220,47 +1129,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ 
- int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1282,11 +1150,33 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ } -#if CONFIG_EXT_PARTITION +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) 
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) @@ -1294,7 +1184,6 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) -#endif // CONFIG_EXT_PARTITION MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) @@ -1309,49 +1198,32 @@ MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) 
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) -#if CONFIG_EXT_PARTITION_TYPES MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) @@ -1370,15 +1242,29 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) -#if CONFIG_EXT_PARTITION -MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x128) -MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x128_avg) -MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x128x4d) -MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x32) -MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x32_avg) -MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg) 
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg) #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ @@ -1409,11 +1295,9 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d) 4; \ } -#if CONFIG_EXT_PARTITION MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) -#endif // CONFIG_EXT_PARTITION MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) @@ -1427,21 +1311,13 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) - -#if CONFIG_EXT_PARTITION_TYPES MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) -#if CONFIG_EXT_PARTITION -MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x128) -MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_MOTION_VAR #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ cpi->fn_ptr[BT].osdf = OSDF; 
\ cpi->fn_ptr[BT].ovf = OVF; \ @@ -1464,11 +1340,9 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32) return fnname(ref, ref_stride, wsrc, msk) >> 4; \ } -#if CONFIG_EXT_PARTITION MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) -#endif // CONFIG_EXT_PARTITION MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) @@ -1482,198 +1356,190 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) - -#if CONFIG_EXT_PARTITION_TYPES MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) -#if CONFIG_EXT_PARTITION -MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x128) -MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_MOTION_VAR static void highbd_set_var_fns(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case AOM_BITS_8: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits8, - aom_highbd_sad128x32_avg_bits8, aom_highbd_8_variance128x32, - aom_highbd_8_sub_pixel_variance128x32, - aom_highbd_8_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits8, - aom_highbd_sad32x128_avg_bits8, aom_highbd_8_variance32x128, - aom_highbd_8_sub_pixel_variance32x128, - aom_highbd_8_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits8) -#endif // 
CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8, aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16, aom_highbd_8_sub_pixel_variance64x16, - aom_highbd_8_sub_pixel_avg_variance64x16, NULL, NULL, - aom_highbd_sad64x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits8, + aom_highbd_jnt_sad64x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x16) HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8, aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64, aom_highbd_8_sub_pixel_variance16x64, - aom_highbd_8_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits8, - aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8, - aom_highbd_8_sub_pixel_variance32x8, - aom_highbd_8_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits8, - aom_highbd_sad8x32_avg_bits8, aom_highbd_8_variance8x32, - aom_highbd_8_sub_pixel_variance8x32, - aom_highbd_8_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits8) - - HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits8, - aom_highbd_sad16x4_avg_bits8, aom_highbd_8_variance16x4, - aom_highbd_8_sub_pixel_variance16x4, - aom_highbd_8_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits8) - - HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits8, - aom_highbd_sad4x16_avg_bits8, aom_highbd_8_variance4x16, - aom_highbd_8_sub_pixel_variance4x16, - aom_highbd_8_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits8) -#endif + aom_highbd_8_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits8, + aom_highbd_jnt_sad16x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x64) + + HIGHBD_BFP( + BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8, + aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8, + aom_highbd_8_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits8, 
aom_highbd_jnt_sad32x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x8) + + HIGHBD_BFP( + BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8, + aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32, + aom_highbd_8_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x32) + + HIGHBD_BFP( + BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8, + aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4, + aom_highbd_8_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x4) + + HIGHBD_BFP( + BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8, + aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16, + aom_highbd_8_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance4x16) HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, aom_highbd_8_sub_pixel_variance32x16, - aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits8, + aom_highbd_jnt_sad32x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x16) HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, aom_highbd_8_sub_pixel_variance16x32, - aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits8, + aom_highbd_jnt_sad16x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x32) HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, aom_highbd_8_sub_pixel_variance64x32, - aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, 
- aom_highbd_sad64x32x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits8, + aom_highbd_jnt_sad64x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x32) HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, aom_highbd_8_sub_pixel_variance32x64, - aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits8, + aom_highbd_jnt_sad32x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x64) HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, aom_highbd_8_sub_pixel_variance32x32, aom_highbd_8_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8, - aom_highbd_sad32x32x4d_bits8) + aom_highbd_sad32x32x4d_bits8, + aom_highbd_jnt_sad32x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x32) HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, aom_highbd_8_sub_pixel_variance64x64, aom_highbd_8_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8, - aom_highbd_sad64x64x4d_bits8) + aom_highbd_sad64x64x4d_bits8, + aom_highbd_jnt_sad64x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x64) HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, aom_highbd_8_sub_pixel_variance16x16, aom_highbd_8_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8, - aom_highbd_sad16x16x4d_bits8) + aom_highbd_sad16x16x4d_bits8, + aom_highbd_jnt_sad16x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x16) HIGHBD_BFP( BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, - aom_highbd_8_sub_pixel_avg_variance16x8, 
aom_highbd_sad16x8x3_bits8, - aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x8) HIGHBD_BFP( BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, - aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8, - aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x16) - HIGHBD_BFP( - BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, - aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, - aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8, - aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8) + HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8, + aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8, + aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x8) HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4, - aom_highbd_8_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x4) HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8, - aom_highbd_8_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8, + 
aom_highbd_8_jnt_sub_pixel_avg_variance4x8) - HIGHBD_BFP( - BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, - aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, - aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8, - aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8, + aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4, + aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance4x4) -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, - aom_highbd_sad128x128_avg_bits8, - aom_highbd_8_variance128x128, - aom_highbd_8_sub_pixel_variance128x128, - aom_highbd_8_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8, - aom_highbd_sad128x128x4d_bits8) + HIGHBD_BFP( + BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance128x128) HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, aom_highbd_8_sub_pixel_variance128x64, - aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits8, + aom_highbd_jnt_sad128x64_avg_bits8, + 
aom_highbd_8_jnt_sub_pixel_avg_variance128x64) HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, aom_highbd_8_sub_pixel_variance64x128, - aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits8) -#endif // CONFIG_EXT_PARTITION + aom_highbd_8_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits8, + aom_highbd_jnt_sad64x128_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x128) -#if CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, aom_highbd_8_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, aom_highbd_8_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, aom_highbd_8_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, aom_highbd_8_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, @@ -1700,35 +1566,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_8_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, aom_highbd_8_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits8, - aom_highbd_8_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits8, - aom_highbd_8_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8, aom_highbd_8_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8, aom_highbd_8_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8, aom_highbd_8_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8, aom_highbd_8_masked_sub_pixel_variance8x32) - 
HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8, aom_highbd_8_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8, aom_highbd_8_masked_sub_pixel_variance4x16) -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8, aom_highbd_obmc_variance128x128, aom_highbd_obmc_sub_pixel_variance128x128) @@ -1738,7 +1587,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8, aom_highbd_obmc_variance64x128, aom_highbd_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8, aom_highbd_obmc_variance64x64, aom_highbd_obmc_sub_pixel_variance64x64) @@ -1778,223 +1626,206 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8, aom_highbd_obmc_variance4x4, aom_highbd_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits8, - aom_highbd_obmc_variance128x32, - aom_highbd_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits8, - aom_highbd_obmc_variance32x128, - aom_highbd_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8, aom_highbd_obmc_variance64x16, aom_highbd_obmc_sub_pixel_variance64x16) - HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8, aom_highbd_obmc_variance16x64, aom_highbd_obmc_sub_pixel_variance16x64) - HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8, aom_highbd_obmc_variance32x8, aom_highbd_obmc_sub_pixel_variance32x8) - HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8, aom_highbd_obmc_variance8x32, aom_highbd_obmc_sub_pixel_variance8x32) - HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8, aom_highbd_obmc_variance16x4, aom_highbd_obmc_sub_pixel_variance16x4) - 
HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8, aom_highbd_obmc_variance4x16, aom_highbd_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; case AOM_BITS_10: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits10, - aom_highbd_sad128x32_avg_bits10, - aom_highbd_10_variance128x32, - aom_highbd_10_sub_pixel_variance128x32, - aom_highbd_10_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits10, - aom_highbd_sad32x128_avg_bits10, - aom_highbd_10_variance32x128, - aom_highbd_10_sub_pixel_variance32x128, - aom_highbd_10_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits10) -#endif // CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10, aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16, aom_highbd_10_sub_pixel_variance64x16, - aom_highbd_10_sub_pixel_avg_variance64x16, NULL, NULL, - aom_highbd_sad64x16x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits10, + aom_highbd_jnt_sad64x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10, aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64, aom_highbd_10_sub_pixel_variance16x64, - aom_highbd_10_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits10, + aom_highbd_jnt_sad16x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10, aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8, aom_highbd_10_sub_pixel_variance32x8, - aom_highbd_10_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits10, + aom_highbd_jnt_sad32x8_avg_bits10, + 
aom_highbd_10_jnt_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10, aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32, aom_highbd_10_sub_pixel_variance8x32, - aom_highbd_10_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits10, + aom_highbd_jnt_sad8x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10, aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4, aom_highbd_10_sub_pixel_variance16x4, - aom_highbd_10_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits10, + aom_highbd_jnt_sad16x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10, aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16, aom_highbd_10_sub_pixel_variance4x16, - aom_highbd_10_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits10) -#endif + aom_highbd_10_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits10, + aom_highbd_jnt_sad4x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10, aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16, aom_highbd_10_sub_pixel_variance32x16, - aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits10, + aom_highbd_jnt_sad32x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10, aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32, aom_highbd_10_sub_pixel_variance16x32, - aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits10, + 
aom_highbd_jnt_sad16x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10, aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32, aom_highbd_10_sub_pixel_variance64x32, - aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, - aom_highbd_sad64x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits10, + aom_highbd_jnt_sad64x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10, aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64, aom_highbd_10_sub_pixel_variance32x64, - aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits10, + aom_highbd_jnt_sad32x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10, aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32, aom_highbd_10_sub_pixel_variance32x32, aom_highbd_10_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10, - aom_highbd_sad32x32x4d_bits10) + aom_highbd_sad32x32x4d_bits10, + aom_highbd_jnt_sad32x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10, aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64, aom_highbd_10_sub_pixel_variance64x64, aom_highbd_10_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10, - aom_highbd_sad64x64x4d_bits10) + aom_highbd_sad64x64x4d_bits10, + aom_highbd_jnt_sad64x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x64); HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10, aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16, aom_highbd_10_sub_pixel_variance16x16, aom_highbd_10_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10, - 
aom_highbd_sad16x16x4d_bits10) + aom_highbd_sad16x16x4d_bits10, + aom_highbd_jnt_sad16x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10, aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8, aom_highbd_10_sub_pixel_variance16x8, aom_highbd_10_sub_pixel_avg_variance16x8, - aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10, - aom_highbd_sad16x8x4d_bits10) + aom_highbd_sad16x8x4d_bits10, + aom_highbd_jnt_sad16x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10, aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16, aom_highbd_10_sub_pixel_variance8x16, aom_highbd_10_sub_pixel_avg_variance8x16, - aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10, - aom_highbd_sad8x16x4d_bits10) + aom_highbd_sad8x16x4d_bits10, + aom_highbd_jnt_sad8x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10, aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8, - aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10, - aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x8); - HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10, - aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4, - aom_highbd_10_sub_pixel_variance8x4, - aom_highbd_10_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10) + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10, + aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4, + aom_highbd_10_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x4); - HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10, - 
aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8, - aom_highbd_10_sub_pixel_variance4x8, - aom_highbd_10_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits10, aom_highbd_sad4x8x4d_bits10) + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10, + aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8, + aom_highbd_10_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10, aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4, - aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10, - aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + aom_highbd_10_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x4); + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10, + aom_highbd_sad128x128_avg_bits10, + aom_highbd_10_variance128x128, + aom_highbd_10_sub_pixel_variance128x128, + aom_highbd_10_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits10, + aom_highbd_jnt_sad128x128_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance128x128); -#if CONFIG_EXT_PARTITION HIGHBD_BFP( - BLOCK_128X128, aom_highbd_sad128x128_bits10, - aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128, - aom_highbd_10_sub_pixel_variance128x128, - aom_highbd_10_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10, - aom_highbd_sad128x128x4d_bits10) - - HIGHBD_BFP(BLOCK_128X64, 
aom_highbd_sad128x64_bits10, - aom_highbd_sad128x64_avg_bits10, - aom_highbd_10_variance128x64, - aom_highbd_10_sub_pixel_variance128x64, - aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits10) - - HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10, - aom_highbd_sad64x128_avg_bits10, - aom_highbd_10_variance64x128, - aom_highbd_10_sub_pixel_variance64x128, - aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits10) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION + BLOCK_128X64, aom_highbd_sad128x64_bits10, + aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64, + aom_highbd_10_sub_pixel_variance128x64, + aom_highbd_10_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance128x64); + + HIGHBD_BFP( + BLOCK_64X128, aom_highbd_sad64x128_bits10, + aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128, + aom_highbd_10_sub_pixel_variance64x128, + aom_highbd_10_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x128); + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10, aom_highbd_10_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10, aom_highbd_10_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10, aom_highbd_10_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10, aom_highbd_10_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10, @@ -2021,35 +1852,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10, aom_highbd_10_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if 
CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits10, - aom_highbd_10_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits10, - aom_highbd_10_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10, aom_highbd_10_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10, aom_highbd_10_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10, aom_highbd_10_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10, aom_highbd_10_masked_sub_pixel_variance8x32) - HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10, aom_highbd_10_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10, aom_highbd_10_masked_sub_pixel_variance4x16) -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10, aom_highbd_10_obmc_variance128x128, aom_highbd_10_obmc_sub_pixel_variance128x128) @@ -2059,7 +1873,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10, aom_highbd_10_obmc_variance64x128, aom_highbd_10_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10, aom_highbd_10_obmc_variance64x64, aom_highbd_10_obmc_sub_pixel_variance64x64) @@ -2099,16 +1912,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10, aom_highbd_10_obmc_variance4x4, aom_highbd_10_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits10, - aom_highbd_10_obmc_variance128x32, - aom_highbd_10_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits10, - 
aom_highbd_10_obmc_variance32x128, - aom_highbd_10_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10, aom_highbd_10_obmc_variance64x16, @@ -2133,189 +1936,188 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10, aom_highbd_10_obmc_variance4x16, aom_highbd_10_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; case AOM_BITS_12: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits12, - aom_highbd_sad128x32_avg_bits12, - aom_highbd_12_variance128x32, - aom_highbd_12_sub_pixel_variance128x32, - aom_highbd_12_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits12, - aom_highbd_sad32x128_avg_bits12, - aom_highbd_12_variance32x128, - aom_highbd_12_sub_pixel_variance32x128, - aom_highbd_12_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits12) -#endif // CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12, aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16, aom_highbd_12_sub_pixel_variance64x16, - aom_highbd_12_sub_pixel_avg_variance64x16, NULL, NULL, - aom_highbd_sad64x16x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits12, + aom_highbd_jnt_sad64x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12, aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64, aom_highbd_12_sub_pixel_variance16x64, - aom_highbd_12_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits12, + aom_highbd_jnt_sad16x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12, aom_highbd_sad32x8_avg_bits12, 
aom_highbd_12_variance32x8, aom_highbd_12_sub_pixel_variance32x8, - aom_highbd_12_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits12, + aom_highbd_jnt_sad32x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12, aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32, aom_highbd_12_sub_pixel_variance8x32, - aom_highbd_12_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits12, + aom_highbd_jnt_sad8x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12, aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4, aom_highbd_12_sub_pixel_variance16x4, - aom_highbd_12_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits12, + aom_highbd_jnt_sad16x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12, aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16, aom_highbd_12_sub_pixel_variance4x16, - aom_highbd_12_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits12) -#endif + aom_highbd_12_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits12, + aom_highbd_jnt_sad4x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12, aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16, aom_highbd_12_sub_pixel_variance32x16, - aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits12, + aom_highbd_jnt_sad32x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12, 
aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32, aom_highbd_12_sub_pixel_variance16x32, - aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits12, + aom_highbd_jnt_sad16x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12, aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32, aom_highbd_12_sub_pixel_variance64x32, - aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, - aom_highbd_sad64x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits12, + aom_highbd_jnt_sad64x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12, aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64, aom_highbd_12_sub_pixel_variance32x64, - aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits12, + aom_highbd_jnt_sad32x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12, aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32, aom_highbd_12_sub_pixel_variance32x32, aom_highbd_12_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12, - aom_highbd_sad32x32x4d_bits12) + aom_highbd_sad32x32x4d_bits12, + aom_highbd_jnt_sad32x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12, aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64, aom_highbd_12_sub_pixel_variance64x64, aom_highbd_12_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12, - aom_highbd_sad64x64x4d_bits12) + aom_highbd_sad64x64x4d_bits12, + aom_highbd_jnt_sad64x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x64); 
HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12, aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16, aom_highbd_12_sub_pixel_variance16x16, aom_highbd_12_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12, - aom_highbd_sad16x16x4d_bits12) + aom_highbd_sad16x16x4d_bits12, + aom_highbd_jnt_sad16x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12, aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8, aom_highbd_12_sub_pixel_variance16x8, aom_highbd_12_sub_pixel_avg_variance16x8, - aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12, - aom_highbd_sad16x8x4d_bits12) + aom_highbd_sad16x8x4d_bits12, + aom_highbd_jnt_sad16x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12, aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16, aom_highbd_12_sub_pixel_variance8x16, aom_highbd_12_sub_pixel_avg_variance8x16, - aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12, - aom_highbd_sad8x16x4d_bits12) + aom_highbd_sad8x16x4d_bits12, + aom_highbd_jnt_sad8x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12, aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8, - aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12, - aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x8); - HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12, - aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4, - aom_highbd_12_sub_pixel_variance8x4, - aom_highbd_12_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12) + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12, + 
aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4, + aom_highbd_12_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x4); - HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12, - aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8, - aom_highbd_12_sub_pixel_variance4x8, - aom_highbd_12_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12) + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12, + aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8, + aom_highbd_12_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12, aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4, - aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12, - aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + aom_highbd_12_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x4); + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12, + aom_highbd_sad128x128_avg_bits12, + aom_highbd_12_variance128x128, + aom_highbd_12_sub_pixel_variance128x128, + aom_highbd_12_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits12, + aom_highbd_jnt_sad128x128_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance128x128); + + HIGHBD_BFP( + BLOCK_128X64, aom_highbd_sad128x64_bits12, + aom_highbd_sad128x64_avg_bits12, 
aom_highbd_12_variance128x64, + aom_highbd_12_sub_pixel_variance128x64, + aom_highbd_12_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance128x64); -#if CONFIG_EXT_PARTITION HIGHBD_BFP( - BLOCK_128X128, aom_highbd_sad128x128_bits12, - aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128, - aom_highbd_12_sub_pixel_variance128x128, - aom_highbd_12_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12, - aom_highbd_sad128x128x4d_bits12) - - HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12, - aom_highbd_sad128x64_avg_bits12, - aom_highbd_12_variance128x64, - aom_highbd_12_sub_pixel_variance128x64, - aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits12) - - HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12, - aom_highbd_sad64x128_avg_bits12, - aom_highbd_12_variance64x128, - aom_highbd_12_sub_pixel_variance64x128, - aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits12) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION + BLOCK_64X128, aom_highbd_sad64x128_bits12, + aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128, + aom_highbd_12_sub_pixel_variance64x128, + aom_highbd_12_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x128); + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12, aom_highbd_12_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12, aom_highbd_12_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12, aom_highbd_12_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12, aom_highbd_12_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, 
aom_highbd_masked_sad64x32_bits12, @@ -2342,36 +2144,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12, aom_highbd_12_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits12, - aom_highbd_12_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits12, - aom_highbd_12_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12, aom_highbd_12_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12, aom_highbd_12_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12, aom_highbd_12_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12, aom_highbd_12_masked_sub_pixel_variance8x32) - HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12, aom_highbd_12_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12, aom_highbd_12_masked_sub_pixel_variance4x16) -#endif - -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12, aom_highbd_12_obmc_variance128x128, aom_highbd_12_obmc_sub_pixel_variance128x128) @@ -2381,7 +2165,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12, aom_highbd_12_obmc_variance64x128, aom_highbd_12_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12, aom_highbd_12_obmc_variance64x64, aom_highbd_12_obmc_sub_pixel_variance64x64) @@ -2421,42 +2204,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12, aom_highbd_12_obmc_variance4x4, 
aom_highbd_12_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits12, - aom_highbd_12_obmc_variance128x32, - aom_highbd_12_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits12, - aom_highbd_12_obmc_variance32x128, - aom_highbd_12_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12, aom_highbd_12_obmc_variance64x16, aom_highbd_12_obmc_sub_pixel_variance64x16) - HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12, aom_highbd_12_obmc_variance16x64, aom_highbd_12_obmc_sub_pixel_variance16x64) - HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12, aom_highbd_12_obmc_variance32x8, aom_highbd_12_obmc_sub_pixel_variance32x8) - HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12, aom_highbd_12_obmc_variance8x32, aom_highbd_12_obmc_sub_pixel_variance8x32) - HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12, aom_highbd_12_obmc_variance16x4, aom_highbd_12_obmc_sub_pixel_variance16x4) - HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12, aom_highbd_12_obmc_variance4x16, aom_highbd_12_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; default: @@ -2466,7 +2231,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { } } } -#endif // CONFIG_HIGHBITDEPTH static void realloc_segmentation_maps(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -2487,40 +2251,59 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) { aom_calloc(cm->mi_rows * cm->mi_cols, 1)); } -void set_compound_tools(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_INTERINTRA - cm->allow_interintra_compound = 1; -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - cm->allow_masked_compound = 1; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -} - void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = 
&cpi->common; + const int num_planes = av1_num_planes(cm); RATE_CONTROL *const rc = &cpi->rc; MACROBLOCK *const x = &cpi->td.mb; if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; - cm->color_space = oxcf->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->transfer_function = oxcf->transfer_function; + cm->color_primaries = oxcf->color_primaries; + cm->transfer_characteristics = oxcf->transfer_characteristics; + cm->matrix_coefficients = oxcf->matrix_coefficients; + cm->seq_params.monochrome = oxcf->monochrome; cm->chroma_sample_position = oxcf->chroma_sample_position; -#endif cm->color_range = oxcf->color_range; - if (cm->profile <= PROFILE_1) - assert(cm->bit_depth == AOM_BITS_8); - else - assert(cm->bit_depth > AOM_BITS_8); + assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10)); + + cm->timing_info_present = oxcf->timing_info_present; + cm->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + cm->timing_info.time_scale = oxcf->timing_info.time_scale; + cm->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + cm->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + cm->seq_params.display_model_info_present_flag = + oxcf->display_model_info_present_flag; + cm->seq_params.decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + cm->buffer_model.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_delay_present = 1; + set_aom_dec_model_info(&cm->buffer_model); + set_dec_model_op_parameters(&cm->op_params[0]); + } else if (cm->timing_info_present && + cm->timing_info.equal_picture_interval && + !cm->seq_params.decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + 
set_resource_availability_parameters(&cm->op_params[0]); + } else { + cm->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + update_film_grain_parameters(cpi, oxcf); cpi->oxcf = *oxcf; + cpi->common.options = oxcf->cfg; x->e_mbd.bd = (int)cm->bit_depth; -#if CONFIG_GLOBAL_MOTION x->e_mbd.global_motion = cm->global_motion; -#endif // CONFIG_GLOBAL_MOTION if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { rc->baseline_gf_interval = FIXED_GF_INTERVAL; @@ -2530,30 +2313,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS - - cm->refresh_frame_context = - (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) - ? REFRESH_FRAME_CONTEXT_FORWARD - : REFRESH_FRAME_CONTEXT_BACKWARD; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; -#endif + + cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode) + ? 
REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; if (x->palette_buffer == NULL) { CHECK_MEM_ERROR(cm, x->palette_buffer, aom_memalign(16, sizeof(*x->palette_buffer))); } - set_compound_tools(cm); av1_reset_segment_features(cm); -#if CONFIG_AMVR - set_high_precision_mv(cpi, 0, 0); -#else - set_high_precision_mv(cpi, 0); -#endif + set_high_precision_mv(cpi, 1, 0); set_rc_buffer_sizes(rc, &cpi->oxcf); @@ -2569,7 +2343,12 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { rc->worst_quality = cpi->oxcf.worst_allowed_q; rc->best_quality = cpi->oxcf.best_allowed_q; - cm->interp_filter = cpi->sf.default_interp_filter; + if (!oxcf->large_scale_tile) + cm->interp_filter = cpi->sf.default_interp_filter; + else + cm->interp_filter = EIGHTTAP_REGULAR; + + cm->switchable_motion_mode = 1; if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { cm->render_width = cpi->oxcf.render_width; @@ -2581,10 +2360,17 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->width = cpi->oxcf.width; cm->height = cpi->oxcf.height; - if (cpi->initial_width) { - if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) { + int sb_size = cm->seq_params.sb_size; + // Superblock size should not be updated after the first key frame. 
+ if (!cpi->seq_params_locked) { + set_sb_size(&cm->seq_params, select_sb_size(cpi)); + } + + if (cpi->initial_width || sb_size != cm->seq_params.sb_size) { + if (cm->width > cpi->initial_width || cm->height > cpi->initial_height || + cm->seq_params.sb_size != sb_size) { av1_free_context_buffers(cm); - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2595,32 +2381,24 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; -#if CONFIG_EXT_REFS rc->is_bwd_ref_frame = 0; rc->is_last_bipred_frame = 0; rc->is_bipred_frame = 0; -#endif // CONFIG_EXT_REFS - -#if 0 - // Experimental RD Code - cpi->frame_distortion = 0; - cpi->last_frame_distortion = 0; -#endif set_tile_info(cpi); cpi->ext_refresh_frame_flags_pending = 0; cpi->ext_refresh_frame_context_pending = 0; -#if CONFIG_HIGHBITDEPTH highbd_set_var_fns(cpi); -#endif -#if CONFIG_ANS && ANS_MAX_SYMBOLS - cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS -#if CONFIG_AMVR - cm->seq_mv_precision_level = 2; -#endif + + // Init sequence level coding tools + // This should not be called after the first key frame. + if (!cpi->seq_params_locked) { + cm->seq_params.operating_points_cnt_minus_1 = + cm->number_spatial_layers > 1 ? 
cm->number_spatial_layers - 1 : 0; + init_seq_coding_tools(&cm->seq_params, cm, oxcf); + } } AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, @@ -2644,10 +2422,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cm->free_mi = enc_free_mi; cm->setup_mi = enc_setup_mi; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - get_default_ncobmc_kernels(cm); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); CHECK_MEM_ERROR(cm, cm->frame_contexts, @@ -2663,38 +2437,18 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->common.buffer_pool = pool; init_config(cpi, oxcf); -#if CONFIG_XIPHRC - cpi->od_rc.framerate = cpi->framerate; - cpi->od_rc.frame_width = cm->render_width; - cpi->od_rc.frame_height = cm->render_height; - cpi->od_rc.keyframe_rate = oxcf->key_freq; - cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL; - cpi->od_rc.altref_rate = 25; - cpi->od_rc.firstpass_quant = 1; - cpi->od_rc.bit_depth = cm->bit_depth; - cpi->od_rc.minq = oxcf->best_allowed_q; - cpi->od_rc.maxq = oxcf->worst_allowed_q; - if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality; - cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1; - cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost; - od_enc_rc_init(&cpi->od_rc, - cpi->oxcf.rc_mode == AOM_Q ? 
-1 : oxcf->target_bandwidth, - oxcf->maximum_buffer_size_ms); -#else av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); -#endif cm->current_video_frame = 0; + cpi->seq_params_locked = 0; cpi->partition_search_skippable_frame = 0; cpi->tile_data = NULL; cpi->last_show_frame_buf_idx = INVALID_IDX; realloc_segmentation_maps(cpi); - for (i = 0; i < NMV_CONTEXTS; ++i) { - memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); - memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); - } + memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); + memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { @@ -2715,7 +2469,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -2753,17 +2506,14 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif #if CONFIG_ENTROPY_STATS av1_zero(aggregate_fc); - av1_zero_array(aggregate_fc_per_type, FRAME_CONTEXTS); #endif // CONFIG_ENTROPY_STATS cpi->first_time_stamp_ever = INT64_MAX; - for (i = 0; i < NMV_CONTEXTS; ++i) { - cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX]; - cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX]; - cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX]; - cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX]; - } + cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX]; + cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX]; + cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX]; + cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX]; #ifdef OUTPUT_YUV_SKINMAP yuv_skinmap_file = fopen("skinmap.yuv", "ab"); @@ -2772,17 +2522,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, yuv_rec_file = fopen("rec.yuv", "wb"); #endif -#if 0 - framepsnr = fopen("framepsnr.stt", "a"); - kf_list = fopen("kf_list.stt", "w"); -#endif - -#if 
CONFIG_XIPHRC - if (oxcf->pass == 2) { - cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf; - cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz; - } -#else if (oxcf->pass == 1) { av1_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -2808,24 +2547,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, av1_init_second_pass(cpi); } -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_HIGHBITDEPTH - int buf_scaler = 2; -#else - int buf_scaler = 1; -#endif CHECK_MEM_ERROR( cm, cpi->td.mb.above_pred_buf, - (uint8_t *)aom_memalign(16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*cpi->td.mb.above_pred_buf))); + (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.above_pred_buf))); CHECK_MEM_ERROR( cm, cpi->td.mb.left_pred_buf, - (uint8_t *)aom_memalign(16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*cpi->td.mb.left_pred_buf))); + (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.left_pred_buf))); CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf, (int32_t *)aom_memalign( @@ -2835,143 +2565,130 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); -#endif - av1_set_speed_features_framesize_independent(cpi); av1_set_speed_features_framesize_dependent(cpi); -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; -#if CONFIG_EXT_PARTITION_TYPES BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, - aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, 
NULL, NULL, - aom_sad4x16x4d) + aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, + aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16) BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, - aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, NULL, NULL, - aom_sad16x4x4d) + aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, + aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4) BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, - aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, NULL, NULL, - aom_sad8x32x4d) + aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, + aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32) BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, - aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, NULL, NULL, - aom_sad32x8x4d) + aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, + aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8) BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, - aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, NULL, NULL, - aom_sad16x64x4d) + aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, + aom_sad16x64x4d, aom_jnt_sad16x64_avg, + aom_jnt_sub_pixel_avg_variance16x64) BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, - aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, NULL, NULL, - aom_sad64x16x4d) - -#if CONFIG_EXT_PARTITION - BFP(BLOCK_32X128, aom_sad32x128, aom_sad32x128_avg, aom_variance32x128, - aom_sub_pixel_variance32x128, aom_sub_pixel_avg_variance32x128, NULL, - NULL, aom_sad32x128x4d) - - BFP(BLOCK_128X32, aom_sad128x32, aom_sad128x32_avg, aom_variance128x32, - aom_sub_pixel_variance128x32, aom_sub_pixel_avg_variance128x32, NULL, - NULL, aom_sad128x32x4d) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES + aom_sub_pixel_variance64x16, 
aom_sub_pixel_avg_variance64x16, + aom_sad64x16x4d, aom_jnt_sad64x16_avg, + aom_jnt_sub_pixel_avg_variance64x16) -#if CONFIG_EXT_PARTITION BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, - aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d) + aom_sad128x128x4d, aom_jnt_sad128x128_avg, + aom_jnt_sub_pixel_avg_variance128x128) BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, - aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL, - NULL, aom_sad128x64x4d) + aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, + aom_sad128x64x4d, aom_jnt_sad128x64_avg, + aom_jnt_sub_pixel_avg_variance128x64) BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, - aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL, - NULL, aom_sad64x128x4d) -#endif // CONFIG_EXT_PARTITION + aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, + aom_sad64x128x4d, aom_jnt_sad64x128_avg, + aom_jnt_sub_pixel_avg_variance64x128) BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, - aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL, - aom_sad32x16x4d) + aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, + aom_sad32x16x4d, aom_jnt_sad32x16_avg, + aom_jnt_sub_pixel_avg_variance32x16) BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, - aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL, - aom_sad16x32x4d) + aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, + aom_sad16x32x4d, aom_jnt_sad16x32_avg, + aom_jnt_sub_pixel_avg_variance16x32) BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, - aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL, - aom_sad64x32x4d) + aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, + aom_sad64x32x4d, aom_jnt_sad64x32_avg, + 
aom_jnt_sub_pixel_avg_variance64x32) BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, - aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL, - aom_sad32x64x4d) + aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, + aom_sad32x64x4d, aom_jnt_sad32x64_avg, + aom_jnt_sub_pixel_avg_variance32x64) BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, - aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d) + aom_sad32x32x4d, aom_jnt_sad32x32_avg, + aom_jnt_sub_pixel_avg_variance32x32) BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, - aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d) + aom_sad64x64x4d, aom_jnt_sad64x64_avg, + aom_jnt_sub_pixel_avg_variance64x64) BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, - aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d) + aom_sad16x16x4d, aom_jnt_sad16x16_avg, + aom_jnt_sub_pixel_avg_variance16x16) BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, - aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3, - aom_sad16x8x8, aom_sad16x8x4d) + aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, + aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8) BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, - aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3, - aom_sad8x16x8, aom_sad8x16x4d) + aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, + aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16) BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, - aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3, - aom_sad8x8x8, aom_sad8x8x4d) + aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, + 
aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8) BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, - aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL, - aom_sad8x4x8, aom_sad8x4x4d) + aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, + aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4) BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, - aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL, - aom_sad4x8x8, aom_sad4x8x4d) + aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, + aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8) BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, - aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3, - aom_sad4x4x8, aom_sad4x4x4d) + aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, + aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4) -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL) - BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL) - BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL) -#endif - -#if CONFIG_MOTION_VAR #define OBFP(BT, OSDF, OVF, OSVF) \ cpi->fn_ptr[BT].osdf = OSDF; \ cpi->fn_ptr[BT].ovf = OVF; \ cpi->fn_ptr[BT].osvf = OSVF; -#if CONFIG_EXT_PARTITION OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, aom_obmc_sub_pixel_variance128x128) OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, aom_obmc_sub_pixel_variance128x64) OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, aom_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, aom_obmc_sub_pixel_variance64x64) OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, @@ -2998,46 +2715,27 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, aom_obmc_sub_pixel_variance8x4) 
OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, aom_obmc_sub_pixel_variance4x4) - -#if CONFIG_EXT_PARTITION_TYPES OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, aom_obmc_sub_pixel_variance4x16) - OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, aom_obmc_sub_pixel_variance16x4) - OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, aom_obmc_sub_pixel_variance8x32) - OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, aom_obmc_sub_pixel_variance32x8) - OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64, aom_obmc_sub_pixel_variance16x64) - OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16, aom_obmc_sub_pixel_variance64x16) -#if CONFIG_EXT_PARTITION - OBFP(BLOCK_32X128, aom_obmc_sad32x128, aom_obmc_variance32x128, - aom_obmc_sub_pixel_variance32x128) - - OBFP(BLOCK_128X32, aom_obmc_sad128x32, aom_obmc_variance128x32, - aom_obmc_sub_pixel_variance128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_MOTION_VAR - #define MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ cpi->fn_ptr[BT].msvf = MCSVF; -#if CONFIG_EXT_PARTITION MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) @@ -3052,7 +2750,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16) MBFP(BLOCK_16X4, aom_masked_sad16x4, 
aom_masked_sub_pixel_variance16x4) @@ -3065,16 +2762,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16) -#if CONFIG_EXT_PARTITION - MBFP(BLOCK_32X128, aom_masked_sad32x128, aom_masked_sub_pixel_variance32x128) - - MBFP(BLOCK_128X32, aom_masked_sad128x32, aom_masked_sub_pixel_variance128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_HIGHBITDEPTH highbd_set_var_fns(cpi); -#endif /* av1_init_quantizer() is first called here. Add check in * av1_frame_init_quantizer() so that av1_init_quantizer is only @@ -3082,29 +2770,25 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, * av1_init_quantizer() for every frame. */ av1_init_quantizer(cpi); -#if CONFIG_AOM_QM - aom_qm_init(cm); -#endif + av1_qm_init(cm); av1_loop_filter_init(cm); -#if CONFIG_FRAME_SUPERRES cm->superres_scale_denominator = SCALE_NUMERATOR; cm->superres_upscaled_width = oxcf->width; cm->superres_upscaled_height = oxcf->height; -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_LOOP_RESTORATION av1_loop_restoration_precal(); -#endif // CONFIG_LOOP_RESTORATION cm->error.setjmp = 0; return cpi; } +#if CONFIG_INTERNAL_STATS #define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) #define SNPRINT2(H, T, V) \ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) +#endif // CONFIG_INTERNAL_STATS void av1_remove_compressor(AV1_COMP *cpi) { AV1_COMMON *cm; @@ -3114,14 +2798,14 @@ void av1_remove_compressor(AV1_COMP *cpi) { if (!cpi) return; cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + if (cm->current_video_frame > 0) { #if CONFIG_ENTROPY_STATS if (cpi->oxcf.pass != 1) { fprintf(stderr, "Writing counts.stt\n"); FILE *f = fopen("counts.stt", "wb"); fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); - fwrite(aggregate_fc_per_type, sizeof(aggregate_fc_per_type[0]), - FRAME_CONTEXTS, f); fclose(f); } #endif // CONFIG_ENTROPY_STATS @@ -3151,16 
+2835,21 @@ void av1_remove_compressor(AV1_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", - dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, - cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim, - total_ssim, cpi->fastssim.stat[ALL] / cpi->count, - cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, + cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim, + total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count, + cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst, + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[STAT_Y] / cpi->count, + cpi->psnr.stat[STAT_U] / cpi->count, + cpi->psnr.stat[STAT_V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -3184,19 +2873,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { fclose(f); } - -#endif - -#if 0 - { - printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); - printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, - cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, - cpi->time_compress_data / 1000, - (cpi->time_receive_data + cpi->time_compress_data) / 1000); - } -#endif +#endif // CONFIG_INTERNAL_STATS } for (t = 0; t < cpi->num_workers; ++t) { @@ -3209,21 +2886,22 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. 
if (t < cpi->num_workers - 1) { aom_free(thread_data->td->palette_buffer); -#if CONFIG_MOTION_VAR aom_free(thread_data->td->above_pred_buf); aom_free(thread_data->td->left_pred_buf); aom_free(thread_data->td->wsrc_buf); aom_free(thread_data->td->mask_buf); -#endif // CONFIG_MOTION_VAR aom_free(thread_data->td->counts); - av1_free_pc_tree(thread_data->td); + av1_free_pc_tree(thread_data->td, num_planes); aom_free(thread_data->td); } } aom_free(cpi->tile_thr_data); aom_free(cpi->workers); - if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync); + if (cpi->num_workers > 1) { + av1_loop_filter_dealloc(&cpi->lf_row_sync); + av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers); + } dealloc_compressor_data(cpi); @@ -3244,6 +2922,10 @@ void av1_remove_compressor(AV1_COMP *cpi) { #endif // CONFIG_INTERNAL_STATS av1_remove_common(cm); + for (i = 0; i < FRAME_BUFFERS; ++i) { + av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table); + } + if (cpi->sf.use_hash_based_trellis) hbt_destroy(); av1_free_ref_frame_buffers(cm->buffer_pool); aom_free(cpi); @@ -3253,30 +2935,14 @@ void av1_remove_compressor(AV1_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif -#if 0 - - if (keyfile) - fclose(keyfile); - - if (framepsnr) - fclose(framepsnr); - - if (kf_list) - fclose(kf_list); - -#endif } static void generate_psnr_packet(AV1_COMP *cpi) { struct aom_codec_cx_pkt pkt; int i; PSNR_STATS psnr; -#if CONFIG_HIGHBITDEPTH aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr, cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); -#else - aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr); -#endif for (i = 0; i < 4; ++i) { pkt.data.psnr.samples[i] = psnr.samples[i]; @@ -3290,22 +2956,25 @@ static void generate_psnr_packet(AV1_COMP *cpi) { int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) { if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; - cpi->ref_frame_flags = ref_frame_flags; + 
cpi->ext_ref_frame_flags = ref_frame_flags; return 0; } -void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) { - cpi->ext_refresh_golden_frame = (ref_frame_flags & AOM_GOLD_FLAG) != 0; - cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & AOM_ALT_FLAG) != 0; - cpi->ext_refresh_last_frame = (ref_frame_flags & AOM_LAST_FLAG) != 0; +void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) { + cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0; + cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0; + cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0; + cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0; cpi->ext_refresh_frame_flags_pending = 1; } int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { - aom_yv12_copy_frame(cfg, sd); + aom_yv12_copy_frame(cfg, sd, num_planes); return 0; } else { return -1; @@ -3314,9 +2983,10 @@ int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { - aom_yv12_copy_frame(sd, cfg); + aom_yv12_copy_frame(sd, cfg, num_planes); return 0; } else { return -1; @@ -3361,7 +3031,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } #endif -#if CONFIG_EXT_REFS && !CONFIG_XIPHRC #if USE_GF16_MULTI_LAYER static void check_show_existing_frame_gf16(AV1_COMP *cpi) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -3374,7 +3043,7 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) { } else if (cpi->rc.is_last_bipred_frame) { 
cpi->rc.is_last_bipred_frame = 0; cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx; + cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1]; } else if (next_frame_update_type == OVERLAY_UPDATE || next_frame_update_type == INTNL_OVERLAY_UPDATE) { // Check the temporal filtering status for the next OVERLAY frame @@ -3392,8 +3061,8 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) { cm->show_existing_frame = 1; cpi->rc.is_src_frame_alt_ref = 1; cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? cpi->alt_fb_idx - : cpi->bwd_fb_idx; + ? cpi->ref_fb_idx[ALTREF_FRAME - 1] + : cpi->ref_fb_idx[BWDREF_FRAME - 1]; cpi->is_arf_filter_off[which_arf] = 0; } } @@ -3423,7 +3092,7 @@ static void check_show_existing_frame(AV1_COMP *cpi) { // the last_fb_idxes[0] after reference frame buffer update cpi->rc.is_last_bipred_frame = 0; cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0]; + cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0]; } else if (cpi->is_arf_filter_off[which_arf] && (next_frame_update_type == OVERLAY_UPDATE || next_frame_update_type == INTNL_OVERLAY_UPDATE)) { @@ -3432,20 +3101,18 @@ static void check_show_existing_frame(AV1_COMP *cpi) { cm->show_existing_frame = 1; cpi->rc.is_src_frame_alt_ref = 1; cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? cpi->alt_fb_idx - : cpi->alt2_fb_idx; + ? 
cpi->ref_fb_idx[ALTREF_FRAME - 1] + : cpi->ref_fb_idx[ALTREF2_FRAME - 1]; cpi->is_arf_filter_off[which_arf] = 0; } cpi->rc.is_src_frame_ext_arf = 0; } -#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC #ifdef OUTPUT_YUV_REC void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { uint8_t *src = s->y_buffer; int h = cm->height; if (yuv_rec_file == NULL) return; -#if CONFIG_HIGHBITDEPTH if (s->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); @@ -3473,7 +3140,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { fflush(yuv_rec_file); return; } -#endif // CONFIG_HIGHBITDEPTH do { fwrite(src, s->y_width, 1, yuv_rec_file); @@ -3500,7 +3166,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { } #endif // OUTPUT_YUV_REC -#if CONFIG_GLOBAL_MOTION #define GM_RECODE_LOOP_NUM4X4_FACTOR 192 static int recode_loop_test_global_motion(AV1_COMP *cpi) { int i; @@ -3515,12 +3180,13 @@ static int recode_loop_test_global_motion(AV1_COMP *cpi) { assert(cm->global_motion[i].wmtype == IDENTITY); cpi->gmparams_cost[i] = 0; recode = 1; - recode |= (rdc->global_motion_used[i] > 0); + // TODO(sarahparker): The earlier condition for recoding here was: + // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something + // similar to that back to speed up global motion? } } return recode; } -#endif // CONFIG_GLOBAL_MOTION // Function to test for conditions that indicate we should loop // back and recode a frame. @@ -3602,15 +3268,15 @@ static void dump_ref_frame_images(AV1_COMP *cpi) { } #endif // DUMP_REF_FRAME_IMAGES == 1 -#if CONFIG_EXT_REFS // This function is used to shift the virtual indices of last reference frames // as follows: // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME // when the LAST_FRAME is updated. 
static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { + // TODO(isbs): shift the scaled indices as well int ref_frame; for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) { - cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1]; + cpi->ref_fb_idx[ref_frame] = cpi->ref_fb_idx[ref_frame - 1]; // [0] is allocated to the current coded frame. The statistics for the // reference frames start at [LAST_FRAME], i.e. [1]. @@ -3621,64 +3287,18 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { } } } -#endif // CONFIG_EXT_REFS -#if CONFIG_VAR_REFS -static void enc_check_valid_ref_frames(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - MV_REFERENCE_FRAME ref_frame; - - // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other - // reference frames. Current encoder invalid ALTREF when ALTREF - // is the same as LAST, but invalid all the other references - // when they are the same as ALTREF. - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - int ref_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME]; - - if (ref_buf_idx != INVALID_IDX) { - ref_buf->is_valid = 1; - - MV_REFERENCE_FRAME ref; - for (ref = LAST_FRAME; ref < ref_frame; ++ref) { - int buf_idx = get_ref_frame_buf_idx(cpi, ref); - RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME]; - if (buf->is_valid && buf_idx == ref_buf_idx) { - if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) { - ref_buf->is_valid = 0; - break; - } else { - buf->is_valid = 0; - } - } - } - } else { - ref_buf->is_valid = 0; - } - } -} -#endif // CONFIG_VAR_REFS - -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER static void update_reference_frames_gf16(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; if (cm->frame_type == KEY_FRAME) { - for (int ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) { + for (int ref_frame = 0; ref_frame < 
REF_FRAMES; ++ref_frame) { ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], + &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], cm->new_fb_idx); } - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], - cm->new_fb_idx); } else { if (cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || @@ -3703,7 +3323,6 @@ static void update_reference_frames_gf16(AV1_COMP *cpi) { #endif // DUMP_REF_FRAME_IMAGES } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static void update_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -3712,30 +3331,28 @@ static void update_reference_frames(AV1_COMP *cpi) { // for the purpose to verify no mismatch between encoder and decoder. if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (cpi->rc.baseline_gf_interval == 16) { update_reference_frames_gf16(cpi); return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS BufferPool *const pool = cm->buffer_pool; + // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. 
- if (cm->frame_type == KEY_FRAME) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->new_fb_idx); -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], - cm->new_fb_idx); -#endif // CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], - cm->new_fb_idx); - } else if (av1_preserve_existing_gf(cpi)) { + + if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], + cm->new_fb_idx); + } + return; + } + + if (av1_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our // new ARF frame. However, in the short term in function // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if @@ -3746,19 +3363,17 @@ static void update_reference_frames(AV1_COMP *cpi) { // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], cm->new_fb_idx); - tmp = cpi->alt_fb_idx; - cpi->alt_fb_idx = cpi->gld_fb_idx; - cpi->gld_fb_idx = tmp; + tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp; -#if CONFIG_EXT_REFS // We need to modify the mapping accordingly - cpi->arf_map[0] = cpi->alt_fb_idx; -#endif // CONFIG_EXT_REFS -// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to -// cpi->interp_filter_selected[GOLDEN_FRAME]? 
-#if CONFIG_EXT_REFS + cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to + // cpi->interp_filter_selected[GOLDEN_FRAME]? } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) { // Deal with the special case for showing existing internal ALTREF_FRAME // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME @@ -3767,29 +3382,22 @@ static void update_reference_frames(AV1_COMP *cpi) { const int which_arf = gf_group->arf_ref_idx[gf_group->index]; assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE); - const int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = cpi->alt2_fb_idx; - cpi->alt2_fb_idx = tmp; + cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; + cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp; // We need to modify the mapping accordingly - cpi->arf_map[which_arf] = cpi->alt2_fb_idx; + cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; memcpy(cpi->interp_filter_selected[LAST_FRAME], cpi->interp_filter_selected[ALTREF2_FRAME], sizeof(cpi->interp_filter_selected[ALTREF2_FRAME])); -#endif // CONFIG_EXT_REFS } else { /* For non key/golden frames */ // === ALTREF_FRAME === if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; + int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; int which_arf = 0; -#if !CONFIG_EXT_REFS - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } -#endif // !CONFIG_EXT_REFS ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf], @@ -3799,21 +3407,19 @@ static void update_reference_frames(AV1_COMP *cpi) { // === GOLDEN_FRAME === if (cpi->refresh_golden_frame) { - 
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], cm->new_fb_idx); -#if !CONFIG_EXT_REFS - if (!cpi->rc.is_src_frame_alt_ref) -#endif // !CONFIG_EXT_REFS - memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); } -#if CONFIG_EXT_REFS // === BWDREF_FRAME === if (cpi->refresh_bwd_ref_frame) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[BWDREF_FRAME], @@ -3823,18 +3429,17 @@ static void update_reference_frames(AV1_COMP *cpi) { // === ALTREF2_FRAME === if (cpi->refresh_alt2_ref_frame) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF2_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } -#endif // CONFIG_EXT_REFS } if (cpi->refresh_last_frame) { -#if CONFIG_EXT_REFS // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame // reference to the reference frame buffer virtual index; and then (2) from // the virtual index to the reference frame buffer physical index: @@ -3842,7 +3447,7 @@ static void update_reference_frames(AV1_COMP *cpi) { // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME // | | | // v v v - // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx + // ref_fb_idx[0], ..., ref_fb_idx[2], ..., ref_fb_idx[ALTREF_FRAME-1] // | | | // v v v // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[] @@ -3864,61 +3469,42 @@ static void update_reference_frames(AV1_COMP *cpi) { // 
LAST_FRAME, LAST2_FRAME, LAST3_FRAME // | | | // v v v - // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1] - int ref_frame; + // ref_fb_idx[2], ref_fb_idx[0], ref_fb_idx[1] + int tmp; - if (cm->frame_type == KEY_FRAME) { - for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) { - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], - cm->new_fb_idx); - } - } else { - int tmp; + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[LAST_REF_FRAMES - 1]], + cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], - cm->new_fb_idx); + tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; - tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + shift_last_ref_frames(cpi); + cpi->ref_fb_idx[0] = tmp; - shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = tmp; + assert(cm->show_existing_frame == 0); + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + + if (cpi->rc.is_last_bipred_frame) { + // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the + // LAST3_FRAME by updating the virtual indices. + // + // NOTE: The source frame for BWDREF does not have a holding position as + // the OVERLAY frame for ALTREF's. Hence, to resolve the reference + // virtual index reshuffling for BWDREF, the encoder always + // specifies a LAST_BIPRED right before BWDREF and completes the + // reshuffling job accordingly. + tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; - assert(cm->show_existing_frame == 0); - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + shift_last_ref_frames(cpi); + cpi->ref_fb_idx[0] = cpi->ref_fb_idx[BWDREF_FRAME - 1]; + cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp; - if (cpi->rc.is_last_bipred_frame) { - // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the - // LAST3_FRAME by updating the virtual indices. 
- // - // NOTE: The source frame for BWDREF does not have a holding position as - // the OVERLAY frame for ALTREF's. Hence, to resolve the reference - // virtual index reshuffling for BWDREF, the encoder always - // specifies a LAST_BIPRED right before BWDREF and completes the - // reshuffling job accordingly. - tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; - - shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx; - cpi->bwd_fb_idx = tmp; - - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[BWDREF_FRAME], - sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); - } - } -#else // !CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - if (!cpi->rc.is_src_frame_alt_ref) { memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + cpi->interp_filter_selected[BWDREF_FRAME], + sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); } -#endif // CONFIG_EXT_REFS } #if DUMP_REF_FRAME_IMAGES == 1 @@ -3937,19 +3523,11 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) { static void scale_references(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MV_REFERENCE_FRAME ref_frame; const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = { - AOM_LAST_FLAG, -#if CONFIG_EXT_REFS - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, -#endif // CONFIG_EXT_REFS - AOM_GOLD_FLAG, -#if CONFIG_EXT_REFS - AOM_BWD_FLAG, - AOM_ALT2_FLAG, -#endif // CONFIG_EXT_REFS - AOM_ALT_FLAG + AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG, + AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { @@ -3964,7 +3542,6 @@ static void scale_references(AV1_COMP *cpi) { continue; } -#if CONFIG_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { RefCntBuffer *new_fb_ptr = NULL; int force_scaling = 0; @@ 
-3983,35 +3560,11 @@ static void scale_references(AV1_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, - (int)cm->bit_depth); + av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + num_planes); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } -#else - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; - int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - force_scaling = 1; - } - if (new_fb == INVALID_IDX) return; - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { - if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - av1_resize_and_extend_frame(ref, &new_fb_ptr->buf); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; - alloc_frame_mvs(cm, new_fb); - } -#endif // CONFIG_HIGHBITDEPTH } else { const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; @@ -4029,115 +3582,18 @@ static void scale_references(AV1_COMP *cpi) { static void release_scaled_references(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; int i; - if (cpi->oxcf.pass == 0) { - // Only release scaled references under certain conditions: - // if reference will be updated, or if scaled reference has same resolution. - int refresh[INTER_REFS_PER_FRAME]; - refresh[0] = (cpi->refresh_last_frame) ? 1 : 0; -#if CONFIG_EXT_REFS - refresh[1] = refresh[2] = 0; - refresh[3] = (cpi->refresh_golden_frame) ? 
1 : 0; - refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0; - refresh[5] = (cpi->refresh_alt2_ref_frame) ? 1 : 0; - refresh[6] = (cpi->refresh_alt_ref_frame) ? 1 : 0; -#else // !CONFIG_EXT_REFS - refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0; - refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; -#endif // CONFIG_EXT_REFS - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; - } - } - } else { - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) { - const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { - --buf->ref_count; - cpi->scaled_ref_idx[i] = INVALID_IDX; - } + // TODO(isbs): only refresh the necessary frames, rather than all of them + for (i = 0; i < REF_FRAMES; ++i) { + const int idx = cpi->scaled_ref_idx[i]; + RefCntBuffer *const buf = + idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; + if (buf != NULL) { + --buf->ref_count; + cpi->scaled_ref_idx[i] = INVALID_IDX; } } } -#if 0 && CONFIG_INTERNAL_STATS -static void output_frame_level_debug_stats(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - FILE *const f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); - int64_t recon_err; - - aom_clear_system_state(); - - recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); - - if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d\n", - cpi->common.current_video_frame, - cm->width, cm->height, - cpi->rc.source_alt_ref_pending, - cpi->rc.source_alt_ref_active, - cpi->rc.this_frame_target, - cpi->rc.projected_frame_size, - cpi->rc.projected_frame_size / cpi->common.MBs, - (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), - cpi->rc.vbr_bits_off_target, - cpi->rc.vbr_bits_off_target_fast, - cpi->twopass.extend_minq, - cpi->twopass.extend_minq_fast, - cpi->rc.total_target_vs_actual, - (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), - cpi->rc.total_actual_bits, cm->base_qindex, - av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth), - (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0, - av1_convert_qindex_to_q(cpi->twopass.active_worst_quality, - cm->bit_depth), - cpi->rc.avg_q, - av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth), - cpi->refresh_last_frame, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, - cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, - cpi->twopass.bits_left / - (1 + cpi->twopass.total_left_stats.coded_error), - cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, - cpi->twopass.kf_zeromotion_pct, - cpi->twopass.fr_content_type); - - fclose(f); - - if (0) { - FILE *const fmodes = fopen("Modes.stt", "a"); - int i; - - fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, - cm->frame_type, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame); - - for (i = 0; i < MAX_MODES; ++i) - fprintf(fmodes, "%5d ", 
cpi->mode_chosen_counts[i]); - - fprintf(fmodes, "\n"); - - fclose(fmodes); - } -} -#endif - static void set_mv_search_params(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; const unsigned int max_mv_def = AOMMIN(cm->width, cm->height); @@ -4164,18 +3620,16 @@ static void set_mv_search_params(AV1_COMP *cpi) { } static void set_size_independent_vars(AV1_COMP *cpi) { -#if CONFIG_GLOBAL_MOTION int i; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { cpi->common.global_motion[i] = default_warp_params; } cpi->global_motion_search_done = 0; -#endif // CONFIG_GLOBAL_MOTION av1_set_speed_features_framesize_independent(cpi); av1_set_rd_speed_thresholds(cpi); av1_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; - if (!frame_is_intra_only(&cpi->common)) set_compound_tools(&cpi->common); + cpi->common.switchable_motion_mode = 1; } static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, @@ -4186,24 +3640,13 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, // Setup variables that depend on the dimensions of the frame. av1_set_speed_features_framesize_dependent(cpi); -// Decide q and q bounds. -#if CONFIG_XIPHRC - int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME; - *q = od_enc_rc_select_quantizers_and_lambdas( - &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, - frame_type, bottom_index, top_index); -#else + // Decide q and q bounds. 
*q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index, top_index); -#endif if (!frame_is_intra_only(cm)) { -#if CONFIG_AMVR set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH, - cpi->common.cur_frame_mv_precision_level); -#else - set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); -#endif + cpi->common.cur_frame_force_integer_mv); } // Configure experimental use of segmentation for enhanced coding of @@ -4224,10 +3667,9 @@ static void init_motion_estimation(AV1_COMP *cpi) { } } -#if CONFIG_LOOP_RESTORATION #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 -static void set_restoration_tilesize(int width, int height, int sx, int sy, - RestorationInfo *rst) { +static void set_restoration_unit_size(int width, int height, int sx, int sy, + RestorationInfo *rst) { (void)width; (void)height; (void)sx; @@ -4238,17 +3680,13 @@ static void set_restoration_tilesize(int width, int height, int sx, int sy, int s = 0; #endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION - rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1); - rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s; - rst[2].restoration_tilesize = rst[1].restoration_tilesize; - - rst[0].procunit_width = rst[0].procunit_height = RESTORATION_PROC_UNIT_SIZE; - rst[1].procunit_width = rst[2].procunit_width = - RESTORATION_PROC_UNIT_SIZE >> sx; - rst[1].procunit_height = rst[2].procunit_height = - RESTORATION_PROC_UNIT_SIZE >> sy; + if (width * height > 352 * 288) + rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX; + else + rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1); + rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s; + rst[2].restoration_unit_size = rst[1].restoration_unit_size; } -#endif // CONFIG_LOOP_RESTORATION static void init_ref_frame_bufs(AV1_COMMON *cm) { int i; @@ -4258,31 +3696,23 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) { cm->ref_frame_map[i] = INVALID_IDX; pool->frame_bufs[i].ref_count = 0; } -#if 
CONFIG_HASH_ME - for (i = 0; i < FRAME_BUFFERS; ++i) { - av1_hash_table_init(&pool->frame_bufs[i].hash_table); + if (cm->seq_params.force_screen_content_tools) { + for (i = 0; i < FRAME_BUFFERS; ++i) { + av1_hash_table_init(&pool->frame_bufs[i].hash_table); + } } -#endif } -static void check_initial_width(AV1_COMP *cpi, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif +static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; - if (!cpi->initial_width || -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth != use_highbitdepth || -#endif + if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth || cm->subsampling_x != subsampling_x || cm->subsampling_y != subsampling_y) { cm->subsampling_x = subsampling_x; cm->subsampling_y = subsampling_y; -#if CONFIG_HIGHBITDEPTH cm->use_highbitdepth = use_highbitdepth; -#endif alloc_raw_frame_buffers(cpi); init_ref_frame_bufs(cm); @@ -4299,12 +3729,9 @@ static void check_initial_width(AV1_COMP *cpi, // Returns 1 if the assigned width or height was <= 0. 
static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; -#if CONFIG_HIGHBITDEPTH + const int num_planes = av1_num_planes(cm); check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x, cm->subsampling_y); -#else - check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y); -#endif // CONFIG_HIGHBITDEPTH if (width <= 0 || height <= 0) return 1; @@ -4314,7 +3741,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { if (cpi->initial_width && cpi->initial_height && (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) { av1_free_context_buffers(cm); - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -4326,6 +3753,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { static void set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; @@ -4333,52 +3761,42 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { // There has been a change in the encoded frame size set_size_literal(cpi, width, height); set_mv_search_params(cpi); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. 
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm); } -#if !CONFIG_XIPHRC if (cpi->oxcf.pass == 2) { av1_set_target_rate(cpi, cm->width, cm->height); } -#endif alloc_frame_mvs(cm, cm->new_fb_idx); + // Allocate above context buffers + if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || + cm->num_allocated_above_context_mi_col < cm->mi_cols || + cm->num_allocated_above_contexts < cm->tile_rows) { + av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + if (av1_alloc_above_context_buffers(cm, cm->tile_rows)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + // Reset the frame pointers to the current frame size. if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); -#if CONFIG_LOOP_RESTORATION - set_restoration_tilesize( -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, cm->superres_upscaled_height, -#else - cm->width, cm->height, -#endif // CONFIG_FRAME_SUPERRES - cm->subsampling_x, cm->subsampling_y, cm->rst_info); - for (int i = 0; i < MAX_MB_PLANE; ++i) + const int frame_width = cm->superres_upscaled_width; + const int frame_height = cm->superres_upscaled_height; + set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x, + cm->subsampling_y, cm->rst_info); + for (int i = 0; i < num_planes; ++i) cm->rst_info[i].frame_restoration_type = RESTORE_NONE; + av1_alloc_restoration_buffers(cm); - for (int i = 0; i < MAX_MB_PLANE; ++i) { - cpi->rst_search[i].restoration_tilesize = - cm->rst_info[i].restoration_tilesize; - cpi->rst_search[i].procunit_width = cm->rst_info[i].procunit_width; - 
cpi->rst_search[i].procunit_height = cm->rst_info[i].procunit_height; - av1_alloc_restoration_struct(cm, &cpi->rst_search[i], -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, - cm->superres_upscaled_height); -#else - cm->width, cm->height); -#endif // CONFIG_FRAME_SUPERRES - } -#endif // CONFIG_LOOP_RESTORATION alloc_util_frame_buffers(cpi); // TODO(afergs): Remove? Gets called anyways. init_motion_estimation(cpi); @@ -4391,36 +3809,18 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { if (buf_idx != INVALID_IDX) { YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; ref_buf->buf = buf; -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, - cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0); -#else av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, cm->height); -#endif // CONFIG_HIGHBITDEPTH - if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf); + if (av1_is_scaled(&ref_buf->sf)) + aom_extend_frame_borders(buf, num_planes); } else { ref_buf->buf = NULL; } } -#if CONFIG_VAR_REFS - // Check duplicate reference frames - enc_check_valid_ref_frames(cpi); -#endif // CONFIG_VAR_REFS - -#if CONFIG_INTRABC -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, - cm->width, cm->height, - cm->use_highbitdepth); -#else - av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, cm->width, cm->height); -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_INTRABC set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } @@ -4432,6 +3832,7 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { if (oxcf->pass == 1) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; + if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR; 
switch (oxcf->resize_mode) { case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; case RESIZE_FIXED: @@ -4446,15 +3847,19 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { return new_denom; } -#if CONFIG_FRAME_SUPERRES - static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Choose an arbitrary random number static unsigned int seed = 34567; const AV1EncoderConfig *oxcf = &cpi->oxcf; if (oxcf->pass == 1) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; - int bottom_index, top_index, q, qthresh; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE, + cpi->common.seq_params.enable_superres)); + assert(IMPLIES(!cpi->common.seq_params.enable_superres, + oxcf->superres_mode == SUPERRES_NONE)); switch (oxcf->superres_mode) { case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; @@ -4465,21 +3870,35 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { new_denom = oxcf->superres_scale_denominator; break; case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; - case SUPERRES_QTHRESH: - qthresh = (cpi->common.frame_type == KEY_FRAME ? oxcf->superres_kf_qthresh - : oxcf->superres_qthresh); + case SUPERRES_QTHRESH: { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_FACTOR_LEVEL rf_level = gf_group->rf_level[gf_group->index]; + const double rate_factor_delta = rate_factor_deltas[rf_level]; + const int qthresh = (rate_factor_delta <= 1.0) + ? 
oxcf->superres_qthresh + : oxcf->superres_kf_qthresh; av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); - q = av1_rc_pick_q_and_bounds(cpi, cpi->oxcf.width, cpi->oxcf.height, - &bottom_index, &top_index); + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index); if (q < qthresh) { new_denom = SCALE_NUMERATOR; } else { - new_denom = SCALE_NUMERATOR + 1 + ((q - qthresh) >> 3); - new_denom = AOMMIN(SCALE_NUMERATOR << 1, new_denom); - // printf("SUPERRES: q %d, qthresh %d: denom %d\n", q, qthresh, - // new_denom); + const uint8_t min_denom = SCALE_NUMERATOR + 1; + const uint8_t denom_step = (MAXQ - qthresh + 1) >> 3; + + if (q == qthresh) { + new_denom = min_denom; + } else if (denom_step == 0) { + new_denom = SCALE_NUMERATOR << 1; + } else { + const uint8_t additional_denom = (q - qthresh) / denom_step; + new_denom = + AOMMIN(min_denom + additional_denom, SCALE_NUMERATOR << 1); + } } break; + } default: assert(0); } return new_denom; @@ -4489,15 +3908,12 @@ static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); } -// TODO(now): Fix? static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { - return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom) && - (CONFIG_HORZONLY_FRAME_SUPERRES || - dimension_is_ok(oheight, rsz->resize_height, rsz->superres_denom)); + // Only need to check the width, as scaling is horizontal only. 
+ (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); } -#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) - static int validate_size_scales(RESIZE_MODE resize_mode, SUPERRES_MODE superres_mode, int owidth, int oheight, size_params_type *rsz) { @@ -4548,24 +3964,17 @@ static int validate_size_scales(RESIZE_MODE resize_mode, } while (!dimensions_are_ok(owidth, oheight, rsz) && (resize_denom > SCALE_NUMERATOR || rsz->superres_denom > SCALE_NUMERATOR)); - } else { // We are allowed to alter neither resize scale nor superres scale. + } else { // We are allowed to alter neither resize scale nor superres + // scale. return 0; } return dimensions_are_ok(owidth, oheight, rsz); } -#undef DIVIDE_AND_ROUND -#endif // CONFIG_FRAME_SUPERRES // Calculates resize and superres params for next frame size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; - size_params_type rsz = { - oxcf->width, - oxcf->height, -#if CONFIG_FRAME_SUPERRES - SCALE_NUMERATOR -#endif // CONFIG_FRAME_SUPERRES - }; + size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR }; int resize_denom; if (oxcf->pass == 1) return rsz; if (cpi->resize_pending_width && cpi->resize_pending_height) { @@ -4579,12 +3988,10 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, resize_denom); } -#if CONFIG_FRAME_SUPERRES rsz.superres_denom = calculate_next_superres_scale(cpi); if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width, oxcf->height, &rsz)) assert(0 && "Invalid scale parameters"); -#endif // CONFIG_FRAME_SUPERRES return rsz; } @@ -4592,14 +3999,12 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) { int encode_width = rsz->resize_width; int encode_height = rsz->resize_height; -#if CONFIG_FRAME_SUPERRES AV1_COMMON *cm = &cpi->common; cm->superres_upscaled_width = 
encode_width; cm->superres_upscaled_height = encode_height; cm->superres_scale_denominator = rsz->superres_denom; av1_calculate_scaled_superres_size(&encode_width, &encode_height, rsz->superres_denom); -#endif // CONFIG_FRAME_SUPERRES set_frame_size(cpi, encode_width, encode_height); } @@ -4608,67 +4013,63 @@ static void setup_frame_size(AV1_COMP *cpi) { setup_frame_size_from_params(cpi, &rsz); } -#if CONFIG_FRAME_SUPERRES static void superres_post_encode(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); - if (av1_superres_unscaled(cm)) return; + if (!av1_superres_scaled(cm)) return; + + assert(cpi->oxcf.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf)); + assert(!cm->all_lossless); av1_superres_upscale(cm, NULL); // If regular resizing is occurring the source will need to be downscaled to // match the upscaled superres resolution. Otherwise the original source is // used. - if (av1_resize_unscaled(cm)) { + if (!av1_resize_scaled(cm)) { cpi->source = cpi->unscaled_source; if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; } else { assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); - // Do downscale. cm->(width|height) has been updated by av1_superres_upscale + // Do downscale. 
cm->(width|height) has been updated by + // av1_superres_upscale if (aom_realloc_frame_buffer( &cpi->scaled_source, cm->superres_upscaled_width, cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer for superres"); assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); -#if CONFIG_HIGHBITDEPTH av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, - (int)cm->bit_depth); -#else - av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source); -#endif // CONFIG_HIGHBITDEPTH + (int)cm->bit_depth, num_planes); cpi->source = &cpi->scaled_source; } } -#endif // CONFIG_FRAME_SUPERRES static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - struct loopfilter *lf = &cm->lf; - int no_loopfilter = 0; - if (is_lossless_requested(&cpi->oxcf)) no_loopfilter = 1; + assert(IMPLIES(is_lossless_requested(&cpi->oxcf), + cm->coded_lossless && cm->all_lossless)); + + const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile; + const int no_cdef = + !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile; + const int no_restoration = !cm->seq_params.enable_restoration || + cm->all_lossless || cm->large_scale_tile; -#if CONFIG_EXT_TILE - // 0 loopfilter level is only necessary if individual tile - // decoding is required. 
- if (cm->single_tile_decoding) no_loopfilter = 1; -#endif // CONFIG_EXT_TILE + struct loopfilter *lf = &cm->lf; if (no_loopfilter) { -#if CONFIG_LOOPFILTER_LEVEL lf->filter_level[0] = 0; lf->filter_level[1] = 0; -#else - lf->filter_level = 0; -#endif } else { struct aom_usec_timer timer; @@ -4682,79 +4083,60 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); } -#if !CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL - if (lf->filter_level[0] || lf->filter_level[1]) -#else - if (lf->filter_level > 0) -#endif -#endif // CONFIG_LPF_SB - { -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 -#if CONFIG_LPF_SB - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0, 0, - 0); -#else -#if CONFIG_LOOPFILTER_LEVEL - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level[0], - lf->filter_level[1], 0, 0); - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u, - lf->filter_level_u, 1, 0); - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v, - lf->filter_level_v, 2, 0); - -#else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB + if (lf->filter_level[0] || lf->filter_level[1]) { +#if LOOP_FILTER_BITMASK + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); #else if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, - lf->filter_level, 0, 0, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); + av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, + cpi->workers, cpi->num_workers, + &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); #endif } -#if CONFIG_STRIPED_LOOP_RESTORATION - av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm); -#endif + if (!no_restoration) + 
av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0); -#if CONFIG_CDEF - if (is_lossless_requested(&cpi->oxcf)) { + if (no_cdef) { cm->cdef_bits = 0; cm->cdef_strengths[0] = 0; cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; } else { // Find CDEF parameters av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd, - cpi->oxcf.speed > 0); + cpi->sf.fast_cdef_search); // Apply the filter av1_cdef_frame(cm->frame_to_show, cm, xd); } -#endif -#if CONFIG_FRAME_SUPERRES superres_post_encode(cpi); -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_LOOP_RESTORATION - aom_extend_frame_borders(cm->frame_to_show); - av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick); - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL); + if (no_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } else { + av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + if (cpi->num_workers > 1) + av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0, + cpi->workers, cpi->num_workers, + &cpi->lr_row_sync, &cpi->lr_ctxt); + else + av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0, + &cpi->lr_ctxt); + } } -#endif // CONFIG_LOOP_RESTORATION - // TODO(debargha): Fix mv search range on encoder side - // aom_extend_frame_inner_borders(cm->frame_to_show); - aom_extend_frame_borders(cm->frame_to_show); } -static void encode_without_recode_loop(AV1_COMP *cpi) { +static int 
encode_without_recode_loop(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. @@ -4774,10 +4156,7 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { if (cpi->unscaled_last_source != NULL) cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source); -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cpi->source->buf_8bit_valid = 0; -#endif - if (frame_is_intra_only(cm) == 0) { scale_references(cpi); } @@ -4796,6 +4175,16 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { av1_cyclic_refresh_setup(cpi); } apply_active_map(cpi); + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + } else { + calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); @@ -4810,29 +4199,25 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { // seen in the last encoder iteration. 
// update_base_skip_probs(cpi); aom_clear_system_state(); + return AOM_CODEC_OK; } -static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, - uint8_t *dest) { +static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; int loop_count = 0; int loop_at_this_size = 0; int loop = 0; -#if !CONFIG_XIPHRC int overshoot_seen = 0; int undershoot_seen = 0; -#endif int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; set_size_independent_vars(cpi); -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cpi->source->buf_8bit_valid = 0; -#endif aom_clear_system_state(); setup_frame_size(cpi); @@ -4845,32 +4230,27 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); -#if !CONFIG_XIPHRC // Reset the loop state for new frame size. overshoot_seen = 0; undershoot_seen = 0; -#endif q_low = bottom_index; q_high = top_index; loop_at_this_size = 0; - } - // Decide frame size bounds first time through. - if (loop_count == 0) { + // Decide frame size bounds first time through. 
av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, &frame_under_shoot_limit, &frame_over_shoot_limit); } -#if CONFIG_GLOBAL_MOTION - // if frame was scaled calculate global_motion_search again if already done + // if frame was scaled calculate global_motion_search again if already + // done if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) if (cpi->source->y_crop_width != cm->width || cpi->source->y_crop_height != cm->height) cpi->global_motion_search_done = 0; -#endif // CONFIG_GLOBAL_MOTION cpi->source = av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); if (cpi->unscaled_last_source != NULL) @@ -4884,29 +4264,18 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, scale_references(cpi); } av1_set_quantizer(cm, q); + // printf("Frame %d/%d: q = %d, frame_type = %d\n", cm->current_video_frame, + // cm->show_frame, q, cm->frame_type); if (loop_count == 0) setup_frame(cpi); -#if CONFIG_Q_ADAPT_PROBS // Base q-index may have changed, so we need to assign proper default coef // probs before every iteration. - if (frame_is_intra_only(cm) || cm->error_resilient_mode) { - int i; + if (cm->primary_ref_frame == PRIMARY_REF_NONE || + cm->frame_refs[cm->primary_ref_frame].idx < 0) { av1_default_coef_probs(cm); - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) { - for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; - } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (cm->frame_refs[0].idx >= 0) { - cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc; - } -#else - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif - } + av1_setup_frame_contexts(cm); } -#endif // CONFIG_Q_ADAPT_PROBS // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. 
@@ -4915,6 +4284,16 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { av1_setup_in_frame_q_adj(cpi); } + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + } else { + calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); // transform / motion compensation build reconstruction frame save_coding_context(cpi); @@ -4931,7 +4310,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // to recode. if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { restore_coding_context(cpi); - av1_pack_bitstream(cpi, dest, size); + + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; rc->projected_frame_size = (int)(*size) << 3; restore_coding_context(cpi); @@ -4950,16 +4331,11 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, int64_t high_err_target = cpi->ambient_err; int64_t low_err_target = cpi->ambient_err >> 1; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } -#else - kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); -#endif // CONFIG_HIGHBITDEPTH - // Prevent possible divide by zero error below for perfect KF kf_err += !kf_err; @@ -4996,7 +4372,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // Is the projected frame size out of range and are we allowed // to attempt to recode. 
int last_q = q; -#if !CONFIG_XIPHRC int retries = 0; // Frame size out of permitted range: @@ -5062,7 +4437,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, undershoot_seen = 1; } -#endif // Clamp Q to upper and lower limits: q = clamp(q, q_low, q_high); @@ -5078,11 +4452,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; -#if CONFIG_GLOBAL_MOTION if (recode_loop_test_global_motion(cpi)) { loop = 1; } -#endif // CONFIG_GLOBAL_MOTION if (loop) { ++loop_count; @@ -5093,86 +4465,90 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, #endif } } while (loop); + + return AOM_CODEC_OK; } static int get_ref_frame_flags(const AV1_COMP *cpi) { const int *const map = cpi->common.ref_frame_map; -#if CONFIG_EXT_REFS - const int last2_is_last = - map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]]; - const int last3_is_last = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]]; - const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]]; -#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int last3_is_last2 = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; -#else // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS - const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; - - const int last3_is_last2 = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]]; - - const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; - 
const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]]; - - const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx]; -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - - const int alt2_is_last = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int alt2_is_last2 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int alt2_is_last3 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[2]]; - const int alt2_is_gld = map[cpi->alt2_fb_idx] == map[cpi->gld_fb_idx]; - const int alt2_is_bwd = map[cpi->alt2_fb_idx] == map[cpi->bwd_fb_idx]; - - const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx]; - const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx]; - const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx]; - const int alt2_is_alt = map[cpi->alt2_fb_idx] == map[cpi->alt_fb_idx]; -#else // !CONFIG_EXT_REFS - const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; - const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; -#endif // CONFIG_EXT_REFS - - int flags = AOM_REFFRAME_ALL; - - if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG; + // No.1 Priority: LAST_FRAME + const int last2_is_last = map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[0]]; + const int last3_is_last = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[0]]; + const int gld_is_last = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int bwd_is_last = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int alt2_is_last = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int alt_is_last = + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + + // No.2 Priority: ALTREF_FRAME + const int last2_is_alt = + map[cpi->ref_fb_idx[1]] == 
map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int last3_is_alt = + map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int gld_is_alt = map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int bwd_is_alt = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int alt2_is_alt = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + + // No.3 Priority: LAST2_FRAME + const int last3_is_last2 = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[1]]; + const int gld_is_last2 = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + const int bwd_is_last2 = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + const int alt2_is_last2 = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + + // No.4 Priority: LAST3_FRAME + const int gld_is_last3 = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + const int bwd_is_last3 = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + const int alt2_is_last3 = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + + // No.5 Priority: GOLDEN_FRAME + const int bwd_is_gld = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]]; + const int alt2_is_gld = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]]; + + // No.6 Priority: BWDREF_FRAME + const int alt2_is_bwd = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]]; + + // No.7 Priority: ALTREF2_FRAME + + // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be + // adjusted according to external encoder flags. 
+ int flags = cpi->ext_ref_frame_flags; if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG; if (alt_is_last) flags &= ~AOM_ALT_FLAG; -#if CONFIG_EXT_REFS if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG; - if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG; + if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG; - if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG; + if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3) + flags &= ~AOM_GOLD_FLAG; -#if CONFIG_ONE_SIDED_COMPOUND && \ - !CONFIG_EXT_COMP_REFS // Changes LL & HL bitstream - /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */ - if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; -#else // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS - if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld || - bwd_is_alt) && + if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || + bwd_is_gld) && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - if ((alt2_is_last || alt2_is_last2 || alt2_is_last3 || alt2_is_gld || - alt2_is_bwd || alt2_is_alt) && + if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 || + alt2_is_gld || alt2_is_bwd) && (flags & AOM_ALT2_FLAG)) flags &= ~AOM_ALT2_FLAG; -#endif // CONFIG_EXT_REFS return flags; } @@ -5182,6 +4558,9 @@ static void set_ext_overrides(AV1_COMP *cpi) { // av1_update_reference() and av1_update_entropy() calls // Note: The overrides are valid only for the next frame passed // to encode_frame_to_data_rate() function + if (cpi->ext_use_s_frame) cpi->common.frame_type = S_FRAME; + cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none; + if (cpi->ext_refresh_frame_context_pending) { cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; cpi->ext_refresh_frame_context_pending = 0; @@ -5190,54 +4569,23 @@ static void 
set_ext_overrides(AV1_COMP *cpi) { cpi->refresh_last_frame = cpi->ext_refresh_last_frame; cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame; + cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame; cpi->ext_refresh_frame_flags_pending = 0; } + cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs; + cpi->common.error_resilient_mode = cpi->ext_use_error_resilient; } -#if !CONFIG_FRAME_SIGN_BIAS -static void set_arf_sign_bias(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - int arf_sign_bias; -#if CONFIG_EXT_REFS - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - // The arf_sign_bias will be one for internal ARFs' - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE); -#else // !CONFIG_EXT_REFS - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); - } -#endif // CONFIG_EXT_REFS - - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; -#if CONFIG_EXT_REFS - cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME]; - cm->ref_frame_sign_bias[ALTREF2_FRAME] = - cm->ref_frame_sign_bias[ALTREF_FRAME]; -#endif // CONFIG_EXT_REFS -} -#endif // !CONFIG_FRAME_SIGN_BIAS - static int setup_interp_filter_search_mask(AV1_COMP *cpi) { InterpFilter ifilter; - int ref_total[TOTAL_REFS_PER_FRAME] = { 0 }; + int ref_total[REF_FRAMES] = { 0 }; MV_REFERENCE_FRAME ref; int mask = 0; int arf_idx = ALTREF_FRAME; -#if CONFIG_EXT_REFS if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || 
cpi->refresh_alt2_ref_frame) -#else // !CONFIG_EXT_REFS - if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) -#endif // CONFIG_EXT_REFS return mask; for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) @@ -5247,25 +4595,21 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) { for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) { if ((ref_total[LAST_FRAME] && cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && -#if CONFIG_EXT_REFS (ref_total[LAST2_FRAME] == 0 || cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 < ref_total[LAST2_FRAME]) && (ref_total[LAST3_FRAME] == 0 || cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 < ref_total[LAST3_FRAME]) && -#endif // CONFIG_EXT_REFS (ref_total[GOLDEN_FRAME] == 0 || cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < ref_total[GOLDEN_FRAME]) && -#if CONFIG_EXT_REFS (ref_total[BWDREF_FRAME] == 0 || cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 < ref_total[BWDREF_FRAME]) && (ref_total[ALTREF2_FRAME] == 0 || cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 < ref_total[ALTREF2_FRAME]) && -#endif // CONFIG_EXT_REFS (ref_total[ALTREF_FRAME] == 0 || cpi->interp_filter_selected[arf_idx][ifilter] * 50 < ref_total[ALTREF_FRAME])) @@ -5281,16 +4625,50 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) { static void dump_filtered_recon_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show; - int h; - char file_name[256] = "/tmp/enc_filtered_recon.yuv"; - FILE *f_recon = NULL; - if (recon_buf == NULL || !cm->show_frame) { - printf("Frame %d is not ready or no show to dump.\n", + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", cm->current_video_frame); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d 
(frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + cm->current_video_frame, cm->frame_offset, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; + const int ref_offset = + (buf_idx >= 0) + ? (int)cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset + : -1; + printf( + " %d(%c-%d-%4.2f)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N', + (buf_idx >= 0) ? (int)cpi->frame_rf_level[buf_idx] : -1, + (buf_idx >= 0) ? rate_factor_deltas[cpi->frame_rf_level[buf_idx]] : -1); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", cm->current_video_frame); return; } + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + if (cm->current_video_frame == 0) { if ((f_recon = fopen(file_name, "wb")) == NULL) { printf("Unable to open file %s to write.\n", file_name); @@ -5303,13 +4681,14 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } } printf( - "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, " - "source_alt_ref_active=%d, refresh_alt_ref_frame=%d, rf_level=%d, " - "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n", + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, rf_level=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", cm->current_video_frame, cpi->twopass.gf_group.index, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], - cm->show_existing_frame, cpi->rc.source_alt_ref_active, - cpi->refresh_alt_ref_frame, + cm->frame_offset, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame, 
cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index], recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 @@ -5346,49 +4725,44 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } #endif // DUMP_RECON_FRAMES -static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows, - const int tile_cols, - FRAME_CONTEXT *ec_ctxs[]) { - int i; - for (i = 0; i < tile_rows * tile_cols; ++i) - ec_ctxs[i] = &cpi->tile_data[i].tctx; -} - -static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, - uint8_t *dest, int skip_adapt, - unsigned int *frame_flags) { +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, + int skip_adapt, + unsigned int *frame_flags) { AV1_COMMON *const cm = &cpi->common; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; - FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&cpi->tile_data[0].tctx)); - aom_cdf_prob **cdf_ptrs = - aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0])); -#if CONFIG_XIPHRC - int frame_type; - int drop_this_frame = 0; -#endif // CONFIG_XIPHRC + set_ext_overrides(cpi); aom_clear_system_state(); -#if !CONFIG_FRAME_SIGN_BIAS - // Set the arf sign bias for this frame. 
- set_arf_sign_bias(cpi); -#endif // !CONFIG_FRAME_SIGN_BIAS - -#if CONFIG_TEMPMV_SIGNALING // frame type has been decided outside of this function call - cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only; - cm->use_prev_frame_mvs = - !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only; -#endif + cm->cur_frame->intra_only = frame_is_intra_only(cm); + cm->cur_frame->frame_type = cm->frame_type; + + // S_FRAMEs are always error resilient + cm->error_resilient_mode |= frame_is_sframe(cm); + + cm->large_scale_tile = cpi->oxcf.large_scale_tile; + cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; + if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0; + + cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // cm->allow_ref_frame_mvs needs to be written into the frame header while + // cm->large_scale_tile is 1, therefore, "cm->large_scale_tile=1" case is + // separated from frame_might_allow_ref_frame_mvs(). + cm->allow_ref_frame_mvs &= !cm->large_scale_tile; + + cm->allow_warped_motion = + cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); + + // Reset the frame packet stamp index. + if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0; -#if CONFIG_EXT_REFS // NOTE: // (1) Move the setup of the ref_frame_flags upfront as it would be // determined by the current frame properties; - // (2) The setup of the ref_frame_flags applies to both show_existing_frame's + // (2) The setup of the ref_frame_flags applies to both + // show_existing_frame's // and the other cases. 
if (cm->current_video_frame > 0) cpi->ref_frame_flags = get_ref_frame_flags(cpi); @@ -5415,12 +4789,20 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->rc.is_bipred_frame = 0; restore_coding_context(cpi); + // Build the bitstream - av1_pack_bitstream(cpi, dest, size); + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + cpi->seq_params_locked = 1; // Set up frame to show to get ready for stats collection. cm->frame_to_show = get_frame_new_buffer(cm); + // Update current frame offset. + cm->frame_offset = + cm->buffer_pool->frame_bufs[cm->new_fb_idx].cur_frame_offset; + #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. dump_filtered_recon_frames(cpi); @@ -5432,9 +4814,11 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // update has been done previously when handling the LAST_BIPRED_FRAME // right before BWDREF_FRAME (in the display order); // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame - // update will be done when the following is called, which will exchange + // update will be done when the following is called, which will + // exchange // the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that - // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, and + // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, + // and // ALTREF2_FRAME will serve as the new LAST_FRAME. update_reference_frames(cpi); @@ -5452,23 +4836,13 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // to do post-encoding update accordingly. if (cpi->rc.is_src_frame_alt_ref) { av1_set_target_rate(cpi, cm->width, cm->height); -#if CONFIG_XIPHRC - frame_type = cm->frame_type == INTER_FRAME ? 
OD_P_FRAME : OD_I_FRAME; - drop_this_frame = od_enc_rc_update_state( - &cpi->od_rc, *size << 3, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, frame_type, cpi->droppable); -#else av1_rc_postencode_update(cpi, *size); -#endif } ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; + return AOM_CODEC_OK; } -#endif // CONFIG_EXT_REFS // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -5477,7 +4851,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); // Set various flags etc to special state if it is a key frame. - if (frame_is_intra_only(cm)) { + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { // Reset the loop filter deltas and segmentation map. av1_reset_segment_features(cm); @@ -5489,19 +4863,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // The alternate reference frame cannot be active for a key frame. cpi->rc.source_alt_ref_active = 0; - - cm->error_resilient_mode = oxcf->error_resilient_mode; - -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - // By default, encoder assumes decoder can use prev_mi. - if (cm->error_resilient_mode) { - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; - cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD; - } else if (cm->intra_only) { - // Only reset the current context. 
- cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT; - } -#endif } if (cpi->oxcf.mtu == 0) { cm->num_tg = cpi->oxcf.num_tile_groups; @@ -5511,33 +4872,15 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->num_tg = DEFAULT_MAX_NUM_TG; } -#if CONFIG_EXT_TILE - cm->large_scale_tile = cpi->oxcf.large_scale_tile; - cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; -#endif // CONFIG_EXT_TILE - -#if CONFIG_XIPHRC - if (drop_this_frame) { - av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } -#else // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && cm->frame_type != KEY_FRAME) { if (av1_rc_drop_frame(cpi)) { av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; + return AOM_CODEC_OK; } } -#endif aom_clear_system_state(); @@ -5546,46 +4889,59 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { /* Non-normative definition of current_frame_id ("frame counter" with - * wraparound) */ - const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7; + * wraparound) */ + const int frame_id_length = FRAME_ID_LENGTH; if (cm->current_frame_id == -1) { int lsb, msb; -/* quasi-random initialization of current_frame_id for a key frame */ -#if CONFIG_HIGHBITDEPTH + /* quasi-random initialization of current_frame_id for a key frame */ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; } else { -#endif lsb = cpi->source->y_buffer[0] & 0xff; msb = cpi->source->y_buffer[1] & 0xff; -#if CONFIG_HIGHBITDEPTH } -#endif cm->current_frame_id = ((msb << 8) + lsb) % (1 << 
frame_id_length); + + // S_frame is meant for stitching different streams of different + // resolutions together, so current_frame_id must be the + // same across different streams of the same content current_frame_id + // should be the same and not random. 0x37 is a chosen number as start + // point + if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37; } else { cm->current_frame_id = (cm->current_frame_id + 1 + (1 << frame_id_length)) % (1 << frame_id_length); } } -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_EXT_DELTA_Q - cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; - cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; -#if CONFIG_LOOPFILTER_LEVEL - cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif + switch (cpi->oxcf.cdf_update_mode) { + case 0: // No CDF update for any frames(4~6% compression loss). + cm->disable_cdf_update = 1; + break; + case 1: // Enable CDF update for all frames. + cm->disable_cdf_update = 0; + break; + case 2: + // Strategically determine at which frames to do CDF update. + // Currently only enable CDF update for all-intra and no-show frames(1.5% + // compression loss). + // TODO(huisu@google.com): design schemes for various trade-offs between + // compression quality and decoding speed. + cm->disable_cdf_update = + (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; + break; + } + cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr; if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi); + if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR; } else { - encode_with_recode_loop(cpi, size, dest); + if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } cm->last_tile_cols = cm->tile_cols; @@ -5601,72 +4957,86 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } -#else - cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); -#endif // CONFIG_HIGHBITDEPTH } - // If the encoder forced a KEY_FRAME decision - if (cm->frame_type == KEY_FRAME) { + // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME + if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { cpi->refresh_last_frame = 1; } cm->frame_to_show = get_frame_new_buffer(cm); - cm->frame_to_show->color_space = cm->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->frame_to_show->transfer_function = cm->transfer_function; + cm->frame_to_show->color_primaries = cm->color_primaries; + cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics; + cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients; + cm->frame_to_show->monochrome = cm->seq_params.monochrome; cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position; -#endif cm->frame_to_show->color_range = cm->color_range; cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; -#if CONFIG_EXT_REFS -// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned -// off. -#endif // CONFIG_EXT_REFS + // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned + // off. // Pick the loop filter level for the frame. 
- loopfilter_frame(cpi, cm); + if (!cm->allow_intrabc) { + loopfilter_frame(cpi, cm); + } else { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + // TODO(debargha): Fix mv search range on encoder side + // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm)); + aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm)); #ifdef OUTPUT_YUV_REC aom_write_one_yuv_frame(cm, cm->frame_to_show); #endif // Build the bitstream - av1_pack_bitstream(cpi, dest, size); + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; - if (skip_adapt) { - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } + cpi->seq_params_locked = 1; + + if (skip_adapt) return AOM_CODEC_OK; -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { int i; - /* Update reference frame id values based on the value of refresh_mask */ + // Update reference frame id values based on the value of refresh_frame_mask for (i = 0; i < REF_FRAMES; i++) { - if ((cm->refresh_mask >> i) & 1) { + if ((cpi->refresh_frame_mask >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; } } } -#endif // CONFIG_REFERENCE_BUFFER #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. 
- if (cm->show_frame) dump_filtered_recon_frames(cpi); + dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES - if (cm->seg.update_map) update_reference_segmentation_map(cpi); + if (cm->seg.enabled) { + if (cm->seg.update_map) { + update_reference_segmentation_map(cpi); + } else if (cm->last_frame_seg_map) { + memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map, + cm->mi_cols * cm->mi_rows * sizeof(uint8_t)); + } + } if (frame_is_intra_only(cm) == 0) { release_scaled_references(cpi); @@ -5675,39 +5045,12 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, update_reference_frames(cpi); #if CONFIG_ENTROPY_STATS - av1_accumulate_frame_counts(&aggregate_fc, &cm->counts); - assert(cm->frame_context_idx < FRAME_CONTEXTS); - av1_accumulate_frame_counts(&aggregate_fc_per_type[cm->frame_context_idx], - &cm->counts); + av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); #endif // CONFIG_ENTROPY_STATS - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { -#if CONFIG_LV_MAP - av1_adapt_coef_probs(cm); -#endif // CONFIG_LV_MAP - av1_adapt_intra_frame_probs(cm); - make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs); - av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); - av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); -#if CONFIG_PVQ - av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs, - cm->tile_rows * cm->tile_cols); -#endif // CONFIG_PVQ -#if CONFIG_ADAPT_SCAN - av1_adapt_scan_order(cm); -#endif // CONFIG_ADAPT_SCAN - } - if (!frame_is_intra_only(cm)) { - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - av1_adapt_inter_frame_probs(cm); - av1_adapt_mv_probs(cm, cm->allow_high_precision_mv); - av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs, - cdf_ptrs, cm->tile_rows * cm->tile_cols); - av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * 
cm->tile_cols); - } + if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); } if (cpi->refresh_golden_frame == 1) @@ -5720,39 +5063,14 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, else cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; -#if CONFIG_EXT_REFS if (cpi->refresh_bwd_ref_frame == 1) cpi->frame_flags |= FRAMEFLAGS_BWDREF; else cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; -#endif // CONFIG_EXT_REFS - -#if !CONFIG_EXT_REFS - cpi->ref_frame_flags = get_ref_frame_flags(cpi); -#endif // !CONFIG_EXT_REFS cm->last_frame_type = cm->frame_type; -#if CONFIG_XIPHRC - frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME; - - drop_this_frame = - od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, frame_type, 0); - if (drop_this_frame) { - av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } -#else // !CONFIG_XIPHRC av1_rc_postencode_update(cpi, *size); -#endif // CONFIG_XIPHRC - -#if 0 - output_frame_level_debug_stats(cpi); -#endif if (cm->frame_type == KEY_FRAME) { // Tell the caller that the frame was coded as a key frame @@ -5768,90 +5086,79 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->lf.mode_ref_delta_update = 0; if (cm->show_frame) { -#if CONFIG_EXT_REFS -// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are -// being used as reference. -#endif // CONFIG_EXT_REFS + // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that + // are + // being used as reference. swap_mi_and_prev_mi(cm); // Don't increment frame counters if this was an altref buffer // update not a real frame ++cm->current_video_frame; } -#if CONFIG_EXT_REFS // NOTE: Shall not refer to any frame not used as reference. 
if (cm->is_reference_frame) { -#endif // CONFIG_EXT_REFS - cm->prev_frame = cm->cur_frame; // keep track of the last coded dimensions cm->last_width = cm->width; cm->last_height = cm->height; // reset to normal state now that we are done. cm->last_show_frame = cm->show_frame; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS - aom_free(tile_ctxs); - aom_free(cdf_ptrs); + return AOM_CODEC_OK; } -static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - int skip_adapt, unsigned int *frame_flags) { -#if CONFIG_XIPHRC - int64_t ip_count; - int frame_type, is_golden, is_altref; - - /* Not updated during init so update it here */ - if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level; - - frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden, - &is_altref, &ip_count); - - if (frame_type == OD_I_FRAME) { - frame_type = KEY_FRAME; - cpi->frame_flags &= FRAMEFLAGS_KEY; - } else if (frame_type == OD_P_FRAME) { - frame_type = INTER_FRAME; - } - - if (is_altref) { - cpi->refresh_alt_ref_frame = 1; - cpi->rc.source_alt_ref_active = 1; - } - - cpi->refresh_golden_frame = is_golden; - cpi->common.frame_type = frame_type; - if (is_golden) cpi->frame_flags &= FRAMEFLAGS_GOLDEN; -#else +static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + int skip_adapt, unsigned int *frame_flags) { if (cpi->oxcf.rc_mode == AOM_CBR) { av1_rc_get_one_pass_cbr_params(cpi); } else { av1_rc_get_one_pass_vbr_params(cpi); } -#endif - encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags); + if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + check_show_existing_frame(cpi); + return AOM_CODEC_OK; } -#if !CONFIG_XIPHRC -static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { - encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags); +static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + unsigned 
int *frame_flags) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + AV1_COMMON *cm = &cpi->common; + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + + if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + +#if TXCOEFF_COST_TIMER + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); +#endif -#if CONFIG_EXT_REFS - // Do not do post-encoding update for those frames that do not have a spot in - // a gf group, but note that an OVERLAY frame always has a spot in a gf group, + // Do not do post-encoding update for those frames that do not have a spot + // in + // a gf group, but note that an OVERLAY frame always has a spot in a gf + // group, // even when show_existing_frame is used. if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) { av1_twopass_postencode_update(cpi); } check_show_existing_frame(cpi); -#else - av1_twopass_postencode_update(cpi); -#endif // CONFIG_EXT_REFS + return AOM_CODEC_OK; } -#endif int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -5861,37 +5168,34 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; -#if CONFIG_HIGHBITDEPTH const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; -#endif -#if CONFIG_HIGHBITDEPTH check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); -#else - check_initial_width(cpi, subsampling_x, subsampling_y); -#endif // CONFIG_HIGHBITDEPTH aom_usec_timer_start(&timer); if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, -#if CONFIG_HIGHBITDEPTH - 
use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - frame_flags)) + use_highbitdepth, frame_flags)) res = -1; aom_usec_timer_mark(&timer); cpi->time_receive_data += aom_usec_timer_elapsed(&timer); - if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome && (subsampling_x != 1 || subsampling_y != 1)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, - "Non-4:2:0 color format requires profile 1 or 3"); + "Non-4:2:0 color format requires profile 1 or 2"); + res = -1; + } + if ((cm->profile == PROFILE_1) && + !(subsampling_x == 0 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); res = -1; } - if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && - (subsampling_x == 1 && subsampling_y == 1)) { + if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) && + !(subsampling_x == 1 && subsampling_y == 0)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, - "4:2:0 color format requires profile 0 or 2"); + "Profile 2 bit-depth < 10 requires 4:2:2 color format"); res = -1; } @@ -5902,13 +5206,10 @@ static int frame_is_reference(const AV1_COMP *cpi) { const AV1_COMMON *cm = &cpi->common; return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || - cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame || !cm->error_resilient_mode || - cm->lf.mode_ref_delta_update || cm->seg.update_map || - cm->seg.update_data; + cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || + cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame || + !cm->error_resilient_mode || cm->lf.mode_ref_delta_update || + cm->seg.update_map || cm->seg.update_data; } static void adjust_frame_rate(AV1_COMP *cpi, @@ -5968,7 +5269,6 @@ static int get_arf_src_index(AV1_COMP *cpi) { return arf_src_index; } -#if 
CONFIG_EXT_REFS static int get_brf_src_index(AV1_COMP *cpi) { int brf_src_index = 0; const GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -6002,7 +5302,6 @@ static int get_arf2_src_index(AV1_COMP *cpi) { } return arf2_src_index; } -#endif // CONFIG_EXT_REFS static void check_src_altref(AV1_COMP *cpi, const struct lookahead_entry *source) { @@ -6014,14 +5313,10 @@ static void check_src_altref(AV1_COMP *cpi, if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; rc->is_src_frame_alt_ref = -#if CONFIG_EXT_REFS (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) || -#endif // CONFIG_EXT_REFS (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); -#if CONFIG_EXT_REFS rc->is_src_frame_ext_arf = gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; -#endif // CONFIG_EXT_REFS } else { rc->is_src_frame_alt_ref = cpi->alt_ref_source && (source == cpi->alt_ref_source); @@ -6031,20 +5326,16 @@ static void check_src_altref(AV1_COMP *cpi, // Current frame is an ARF overlay frame. cpi->alt_ref_source = NULL; -#if CONFIG_EXT_REFS if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) { // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3, // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST. cpi->refresh_last_frame = 1; } else { -#endif // CONFIG_EXT_REFS // Don't refresh the last buffer for an ARF overlay frame. It will // become the GF so preserve last as an alternative prediction option. 
cpi->refresh_last_frame = 0; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS } } @@ -6055,10 +5346,10 @@ extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { - s->stat[Y] += y; - s->stat[U] += u; - s->stat[V] += v; - s->stat[ALL] += all; + s->stat[STAT_Y] += y; + s->stat[STAT_U] += u; + s->stat[STAT_V] += v; + s->stat[STAT_ALL] += all; s->worst = AOMMIN(s->worst, all); } @@ -6073,12 +5364,10 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #endif cpi->bytes += frame_bytes; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { in_bit_depth = cpi->oxcf.input_bit_depth; bit_depth = cm->bit_depth; } -#endif if (cm->show_frame) { const YV12_BUFFER_CONFIG *orig = cpi->source; const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; @@ -6089,28 +5378,20 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { PSNR_STATS psnr; double frame_ssim2 = 0.0, weight = 0.0; aom_clear_system_state(); -// TODO(yaowu): unify these two versions into one. -#if CONFIG_HIGHBITDEPTH + // TODO(yaowu): unify these two versions into one. aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); -#else - aom_calc_psnr(orig, recon, &psnr); -#endif // CONFIG_HIGHBITDEPTH adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], &cpi->psnr); cpi->total_sq_error += psnr.sse[0]; cpi->total_samples += psnr.samples[0]; samples = psnr.samples[0]; -// TODO(yaowu): unify these two versions into one. -#if CONFIG_HIGHBITDEPTH + // TODO(yaowu): unify these two versions into one. 
if (cm->use_highbitdepth) frame_ssim2 = aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); else frame_ssim2 = aom_calc_ssim(orig, recon, &weight); -#else - frame_ssim2 = aom_calc_ssim(orig, recon, &weight); -#endif // CONFIG_HIGHBITDEPTH cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2); cpi->summed_quality += frame_ssim2 * weight; @@ -6119,18 +5400,19 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #if 0 { FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, + cm->current_video_frame, y2, u2, v2, frame_psnr2, frame_ssim2); fclose(f); } #endif } if (cpi->b_calculate_blockiness) { -#if CONFIG_HIGHBITDEPTH - if (!cm->use_highbitdepth) -#endif - { + if (!cm->use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); @@ -6139,10 +5421,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } if (cpi->b_calculate_consistency) { -#if CONFIG_HIGHBITDEPTH - if (!cm->use_highbitdepth) -#endif - { + if (!cm->use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); @@ -6167,7 +5446,6 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } #endif // CONFIG_INTERNAL_STATS -#if CONFIG_AMVR static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, const YV12_BUFFER_CONFIG *last_picture, hash_table *last_hash_table) { @@ -6203,14 +5481,28 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, p_cur += (y_pos * stride_cur + x_pos); p_ref += (y_pos * stride_ref + x_pos); - for (int tmpY = 0; 
tmpY < block_size && match; tmpY++) { - for (int tmpX = 0; tmpX < block_size && match; tmpX++) { - if (p_cur[tmpX] != p_ref[tmpX]) { - match = 0; + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } } + p_cur += stride_cur; + p_ref += stride_ref; } - p_cur += stride_cur; - p_ref += stride_ref; } if (match) { @@ -6227,10 +5519,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, av1_get_block_hash_value( cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, - block_size, &hash_value_1, &hash_value_2); - - if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { - M++; + block_size, &hash_value_1, &hash_value_2, + (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH)); + // Hashing does not work for highbitdepth currently. + // TODO(Roger): Make it work for highbitdepth. 
+ if (av1_use_hash_me(&cpi->common)) { + if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { + M++; + } } } } @@ -6282,13 +5578,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, return 0; } -#endif int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush) { + int64_t *time_end, int flush, + const aom_rational_t *timebase) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); BufferPool *const pool = cm->buffer_pool; RATE_CONTROL *const rc = &cpi->rc; struct aom_usec_timer cmptimer; @@ -6296,15 +5593,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; -#if CONFIG_EXT_REFS int brf_src_index; -#endif // CONFIG_EXT_REFS int i; -#if CONFIG_XIPHRC - cpi->od_rc.end_of_input = flush; -#endif - #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && "bitstream debug tool does not support multithreading"); @@ -6312,13 +5603,10 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); #endif + cm->showable_frame = 0; aom_usec_timer_start(&cmptimer); -#if CONFIG_AMVR set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0); -#else - set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); -#endif // Is multi-arf enabled. 
// Note that at the moment multi_arf is only configured for 2 pass VBR @@ -6327,24 +5615,36 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, else cpi->multi_arf_allowed = 0; -// Normal defaults -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; -#endif - cm->refresh_frame_context = - (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) - ? REFRESH_FRAME_CONTEXT_FORWARD - : REFRESH_FRAME_CONTEXT_BACKWARD; + // Normal defaults + cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; -#if CONFIG_EXT_REFS && !CONFIG_XIPHRC + // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the + // following flag accordingly. + cm->reset_decoder_state = 0; + + // Don't allow a show_existing_frame to coincide with an error resilient or + // S-Frame + struct lookahead_entry *lookahead_src = NULL; + if (cm->current_video_frame > 0) + lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); + if (lookahead_src != NULL && + ((cpi->oxcf.error_resilient_mode | + ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) || + (cpi->oxcf.s_frame_mode | + ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) { + cm->show_existing_frame = 0; + } + if (oxcf->pass == 2 && cm->show_existing_frame) { // Manage the source buffer and flush out the source frame that has been // coded already; Also get prepared for PSNR calculation if needed. 
@@ -6352,6 +5652,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, *size = 0; return -1; } + av1_apply_encoding_flags(cpi, source->flags); cpi->source = &source->img; // TODO(zoeliu): To track down to determine whether it's needed to adjust // the frame rate. @@ -6361,7 +5662,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, // We need to adjust frame rate for an overlay frame if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source); - // Find a free buffer for the new frame, releasing the reference previously + // Find a free buffer for the new frame, releasing the reference + // previously // held. if (cm->new_fb_idx != INVALID_IDX) { --pool->frame_bufs[cm->new_fb_idx].ref_count; @@ -6379,7 +5681,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, // We need to update the gf_group for show_existing overlay frame if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi); - Pass2Encode(cpi, size, dest, frame_flags); + if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; if (cpi->b_calculate_psnr) generate_psnr_packet(cpi); @@ -6393,7 +5696,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cm->show_existing_frame = 0; return 0; } -#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); @@ -6415,21 +5717,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, assert(arf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cm->showable_frame = 1; cpi->alt_ref_source = source; if (oxcf->arnr_max_frames > 0) { -// Produce the filtered ARF frame. -#if CONFIG_BGSPRITE - int bgsprite_ret = av1_background_sprite(cpi, arf_src_index); - // Do temporal filter if bgsprite not generated. 
- if (bgsprite_ret != 0) -#endif // CONFIG_BGSPRITE - av1_temporal_filter(cpi, -#if CONFIG_BGSPRITE - NULL, &cpi->alt_ref_buffer, -#endif // CONFIG_BGSPRITE - arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer); + // Produce the filtered ARF frame. + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); force_src_buffer = &cpi->alt_ref_buffer; } @@ -6438,16 +5732,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cpi->refresh_alt_ref_frame = 1; cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS rc->is_src_frame_alt_ref = 0; } rc->source_alt_ref_pending = 0; } -#if CONFIG_EXT_REFS // Should we encode an arf2 frame. arf_src_index = get_arf2_src_index(cpi); if (arf_src_index) { @@ -6468,16 +5759,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, assert(arf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cm->showable_frame = 1; cpi->alt_ref_source = source; if (oxcf->arnr_max_frames > 0) { // Produce the filtered ARF frame. 
- av1_temporal_filter(cpi, -#if CONFIG_BGSPRITE - NULL, NULL, -#endif // CONFIG_BGSPRITE - arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer); + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); force_src_buffer = &cpi->alt_ref_buffer; } @@ -6499,6 +5787,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (brf_src_index) { assert(brf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) { + cm->showable_frame = 1; cm->show_frame = 0; cm->intra_only = 0; @@ -6511,7 +5800,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, rc->is_bwd_ref_frame = 1; } } -#endif // CONFIG_EXT_REFS if (!source) { // Get last frame source. @@ -6538,16 +5826,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; + av1_apply_encoding_flags(cpi, source->flags); *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; } else { *size = 0; if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { -#if CONFIG_XIPHRC - od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1); -#else av1_end_first_pass(cpi); /* get last stats packet */ -#endif cpi->twopass.first_pass_done = 1; } return -1; @@ -6573,20 +5858,23 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (cm->new_fb_idx == INVALID_IDX) return -1; + // Retain the RF_LEVEL for the current newly coded frame. 
+ cpi->frame_rf_level[cm->new_fb_idx] = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cm->cur_frame->buf.buf_8bit_valid = 0; -#endif -#if !CONFIG_EXT_REFS - if (cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } + + if (cm->film_grain_table) { + cm->film_grain_params_present = aom_film_grain_table_lookup( + cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */, + &cm->film_grain_params); } -#endif // !CONFIG_EXT_REFS + cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + + // only one operating point supported now + cpi->common.tu_presentation_delay = + ticks_to_timebase_units(timebase, *time_stamp); // Start with a 0 size frame. *size = 0; @@ -6594,87 +5882,62 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; if (oxcf->pass == 2) { -#if CONFIG_XIPHRC - if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1; - } -#else av1_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { setup_frame_size(cpi); } -#endif if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) { - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) - cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } -#if CONFIG_AOM_QM cm->using_qmatrix = cpi->oxcf.using_qm; cm->min_qmlevel = cpi->oxcf.qm_minlevel; cm->max_qmlevel = cpi->oxcf.qm_maxlevel; -#endif -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { if (*time_stamp == 0) { cpi->common.current_frame_id = -1; } } -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_AMVR + cpi->cur_poc++; - if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { - if 
(cpi->common.seq_mv_precision_level == 2) { + if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools && + !frame_is_intra_only(cm)) { + if (cpi->common.seq_params.force_integer_mv == 2) { struct lookahead_entry *previous_entry = - cpi->lookahead->buf + cpi->previsous_index; - cpi->common.cur_frame_mv_precision_level = is_integer_mv( - cpi, cpi->source, &previous_entry->img, cpi->previsou_hash_table); + av1_lookahead_peek(cpi->lookahead, cpi->previous_index); + if (!previous_entry) + cpi->common.cur_frame_force_integer_mv = 0; + else + cpi->common.cur_frame_force_integer_mv = is_integer_mv( + cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table); } else { - cpi->common.cur_frame_mv_precision_level = - cpi->common.seq_mv_precision_level; + cpi->common.cur_frame_force_integer_mv = + cpi->common.seq_params.force_integer_mv; } } else { - cpi->common.cur_frame_mv_precision_level = 0; + cpi->common.cur_frame_force_integer_mv = 0; } -#endif -#if CONFIG_XIPHRC - if (oxcf->pass == 1) { - size_t tmp; - if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags); - cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer; - Pass0Encode(cpi, &tmp, dest, 0, frame_flags); - od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0); - } else if (oxcf->pass == 2) { - Pass0Encode(cpi, size, dest, 0, frame_flags); - } else { - if (cpi->od_rc.cur_frame == 0) { - size_t tmp; - Pass0Encode(cpi, &tmp, dest, 1, frame_flags); - } - Pass0Encode(cpi, size, dest, 0, frame_flags); - } -#else if (oxcf->pass == 1) { cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf); av1_first_pass(cpi, source); } else if (oxcf->pass == 2) { - Pass2Encode(cpi, size, dest, frame_flags); + if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } else { // One pass encode - Pass0Encode(cpi, size, dest, 0, frame_flags); + if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } -#endif -#if 
CONFIG_HASH_ME if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { -#if CONFIG_AMVR - cpi->previsou_hash_table = &cm->cur_frame->hash_table; + cpi->previous_hash_table = &cm->cur_frame->hash_table; { int l; for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) { if ((cpi->lookahead->buf + l) == source) { - cpi->previsous_index = l; + cpi->previous_index = l; break; } } @@ -6684,17 +5947,26 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, "Failed to find last frame original buffer"); } } -#endif } -#endif + if (!cm->large_scale_tile) { + cm->frame_contexts[cm->new_fb_idx] = *cm->fc; + } -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->frame_contexts[cm->new_fb_idx] = *cm->fc; -#else - if (!cm->error_resilient_mode) - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING +#define EXT_TILE_DEBUG 0 +#if EXT_TILE_DEBUG + if (cm->large_scale_tile && oxcf->pass == 2) { + char fn[20] = "./fc"; + fn[4] = cm->current_video_frame / 100 + '0'; + fn[5] = (cm->current_video_frame % 100) / 10 + '0'; + fn[6] = (cm->current_video_frame % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } +#endif // EXT_TILE_DEBUG +#undef EXT_TILE_DEBUG + + cm->showable_frame = !cm->show_frame && cm->showable_frame; // No frame encoded, or frame was dropped, release scaled references. 
if ((*size == 0) && (frame_is_intra_only(cm) == 0)) { @@ -6717,10 +5989,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_INTERNAL_STATS -#if CONFIG_XIPHRC - cpi->od_rc.cur_frame++; -#endif - aom_clear_system_state(); return 0; @@ -6755,6 +6023,29 @@ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { return 0; } +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, AOM_SCALING vert_mode) { int hr = 0, hs = 0, vr = 0, vs = 0; @@ -6773,47 +6064,134 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; } +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 
2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. If only one reference frame is used, it must be + // LAST. 
+ cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL; if (flags & - (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) { - int ref = AOM_REFFRAME_ALL; - + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { if (flags & AOM_EFLAG_NO_REF_LAST) { - ref ^= AOM_LAST_FLAG; -#if CONFIG_EXT_REFS - ref ^= AOM_LAST2_FLAG; - ref ^= AOM_LAST3_FLAG; -#endif // CONFIG_EXT_REFS - } + cpi->ext_ref_frame_flags = 0; + } else { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; - if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; - if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } - av1_use_as_reference(cpi, ref); + av1_use_as_reference(cpi, ref); + } } if (flags & - (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | - AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) { + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { int upd = AOM_REFFRAME_ALL; - if (flags & AOM_EFLAG_NO_UPD_LAST) { - upd ^= AOM_LAST_FLAG; -#if CONFIG_EXT_REFS - upd ^= AOM_LAST2_FLAG; - upd ^= AOM_LAST3_FLAG; -#endif // CONFIG_EXT_REFS - } + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. 
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; - if (flags & AOM_EFLAG_NO_UPD_ARF) upd ^= AOM_ALT_FLAG; + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } av1_update_reference(cpi, upd); } + cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + cpi->ext_use_error_resilient = cpi->oxcf.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + cpi->ext_use_s_frame = + cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { av1_update_entropy(cpi, 0); } } + +int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) { + return n * TICKS_PER_SEC * timebase->num / timebase->den; +} + +int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) { + const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; + return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index eb779a3cd..5212db2b1 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -14,7 +14,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aomcx.h" #include "av1/common/alloccommon.h" @@ -22,11 +23,8 @@ #include "av1/common/thread_common.h" #include "av1/common/onyxc_int.h" #include "av1/common/resize.h" +#include "av1/common/timing.h" #include "av1/encoder/aq_cyclicrefresh.h" -#if CONFIG_ANS -#include "aom_dsp/ans.h" -#include "aom_dsp/buf_ans.h" -#endif #include "av1/encoder/av1_quantize.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encodemb.h" @@ -38,9 +36,6 @@ #include "av1/encoder/rd.h" #include "av1/encoder/speed_features.h" #include 
"av1/encoder/tokenize.h" -#if CONFIG_XIPHRC -#include "av1/encoder/ratectrl_xiph.h" -#endif #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" @@ -54,19 +49,13 @@ extern "C" { #endif typedef struct { - int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; - int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; - int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; - - // 0 = Intra, Last, GF, ARF - int8_t last_ref_lf_deltas[TOTAL_REFS_PER_FRAME]; - // 0 = ZERO_MV, MV - int8_t last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + int nmv_vec_cost[MV_JOINTS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; FRAME_CONTEXT fc; } CODING_CONTEXT; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING typedef enum { // regular inter frame REGULAR_FRAME = 0, @@ -76,14 +65,12 @@ typedef enum { OVERLAY_FRAME = 2, // golden frame GLD_FRAME = 3, -#if CONFIG_EXT_REFS // backward reference frame BRF_FRAME = 4, // extra alternate reference frame - EXT_ARF_FRAME = 5 -#endif + EXT_ARF_FRAME = 5, + FRAME_CONTEXT_INDEXES } FRAME_CONTEXT_INDEX; -#endif typedef enum { NORMAL = 0, @@ -101,13 +88,9 @@ typedef enum { typedef enum { FRAMEFLAGS_KEY = 1 << 0, FRAMEFLAGS_GOLDEN = 1 << 1, -#if CONFIG_EXT_REFS FRAMEFLAGS_BWDREF = 1 << 2, // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME FRAMEFLAGS_ALTREF = 1 << 3, -#else // !CONFIG_EXT_REFS - FRAMEFLAGS_ALTREF = 1 << 2, -#endif // CONFIG_EXT_REFS } FRAMETYPE_FLAGS; typedef enum { @@ -115,26 +98,22 @@ typedef enum { VARIANCE_AQ = 1, COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, -#if !CONFIG_EXT_DELTA_Q - DELTA_AQ = 4, -#endif AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; -#if CONFIG_EXT_DELTA_Q typedef enum { NO_DELTA_Q = 0, DELTA_Q_ONLY = 1, DELTA_Q_LF = 2, DELTAQ_MODE_COUNT // This should always be the last member of the enum } DELTAQ_MODE; -#endif + typedef enum { RESIZE_NONE = 0, // No frame resizing allowed. RESIZE_FIXED = 1, // All frames are coded at the specified scale. 
RESIZE_RANDOM = 2, // All frames are coded at a random scale. RESIZE_MODES } RESIZE_MODE; -#if CONFIG_FRAME_SUPERRES + typedef enum { SUPERRES_NONE = 0, // No frame superres allowed SUPERRES_FIXED = 1, // All frames are coded at the specified scale, @@ -145,13 +124,14 @@ typedef enum { // q_index SUPERRES_MODES } SUPERRES_MODE; -#endif // CONFIG_FRAME_SUPERRES typedef struct AV1EncoderConfig { BITSTREAM_PROFILE profile; aom_bit_depth_t bit_depth; // Codec bit-depth. int width; // width of data passed to the compressor int height; // height of data passed to the compressor + int forced_max_frame_width; // forced maximum width of frame (if != 0) + int forced_max_frame_height; // forced maximum height of frame (if != 0) unsigned int input_bit_depth; // Input bit depth. double init_framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in bits per second @@ -159,6 +139,7 @@ typedef struct AV1EncoderConfig { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: int speed; + int dev_sf; // maximum allowed bitrate for any intra frame in % of bitrate target. unsigned int rc_max_intra_bitrate_pct; // maximum allowed bitrate for any inter frame in % of bitrate target. @@ -172,8 +153,11 @@ typedef struct AV1EncoderConfig { // Key Framing Operations int auto_key; // autodetect cut scenes and set the keyframes int key_freq; // maximum distance to key frame. 
- + int sframe_dist; + int sframe_mode; + int sframe_enabled; int lag_in_frames; // how many frames lag before we start encoding + int fwd_kf_enabled; // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS @@ -199,36 +183,33 @@ typedef struct AV1EncoderConfig { int best_allowed_q; int cq_level; AQ_MODE aq_mode; // Adaptive Quantization mode -#if CONFIG_EXT_DELTA_Q DELTAQ_MODE deltaq_mode; -#endif -#if CONFIG_AOM_QM + int enable_cdef; + int enable_restoration; + int disable_trellis_quant; int using_qm; + int qm_y; + int qm_u; + int qm_v; int qm_minlevel; int qm_maxlevel; -#endif #if CONFIG_DIST_8X8 int using_dist_8x8; #endif unsigned int num_tile_groups; unsigned int mtu; -#if CONFIG_TEMPMV_SIGNALING - unsigned int disable_tempmv; -#endif // Internal frame size scaling. RESIZE_MODE resize_mode; uint8_t resize_scale_denominator; uint8_t resize_kf_scale_denominator; -#if CONFIG_FRAME_SUPERRES // Frame Super-Resolution size scaling. SUPERRES_MODE superres_mode; uint8_t superres_scale_denominator; uint8_t superres_kf_scale_denominator; int superres_qthresh; int superres_kf_qthresh; -#endif // CONFIG_FRAME_SUPERRES // Enable feature to reduce the frame quantization every x frames. int frame_periodic_boost; @@ -241,9 +222,7 @@ typedef struct AV1EncoderConfig { // ---------------------------------------------------------------- int enable_auto_arf; -#if CONFIG_EXT_REFS int enable_auto_brf; // (b)ackward (r)ef (f)rame -#endif // CONFIG_EXT_REFS /* Bitfield defining the error resiliency features to enable. * Can provide decodable frames after losses in previous @@ -251,12 +230,16 @@ typedef struct AV1EncoderConfig { */ unsigned int error_resilient_mode; + unsigned int s_frame_mode; + /* Bitfield defining the parallel decoding mode where the * decoding in successive frames may be conducted in parallel * just by decoding the frame headers. 
*/ unsigned int frame_parallel_decoding_mode; + unsigned int limit; + int arnr_max_frames; int arnr_strength; @@ -265,18 +248,10 @@ typedef struct AV1EncoderConfig { int tile_columns; int tile_rows; -#if CONFIG_MAX_TILE int tile_width_count; int tile_height_count; int tile_widths[MAX_TILE_COLS]; int tile_heights[MAX_TILE_ROWS]; -#endif -#if CONFIG_DEPENDENT_HORZTILES - int dependent_horz_tiles; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - int loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES int max_threads; @@ -289,34 +264,135 @@ typedef struct AV1EncoderConfig { aom_tune_metric tuning; aom_tune_content content; -#if CONFIG_HIGHBITDEPTH int use_highbitdepth; -#endif - aom_color_space_t color_space; - aom_transfer_function_t transfer_function; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; aom_chroma_sample_position_t chroma_sample_position; int color_range; int render_width; int render_height; - -#if CONFIG_EXT_PARTITION + aom_timing_info_type_t timing_info_type; + int timing_info_present; + aom_timing_info_t timing_info; + int decoder_model_info_present_flag; + int display_model_info_present_flag; + int buffer_removal_delay_present; + aom_dec_model_info_t buffer_model; + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; + aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; + int film_grain_test_vector; + const char *film_grain_table_filename; + + uint8_t cdf_update_mode; aom_superblock_size_t superblock_size; -#endif // CONFIG_EXT_PARTITION -#if CONFIG_ANS && ANS_MAX_SYMBOLS - int ans_window_size_log2; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS -#if CONFIG_EXT_TILE unsigned int large_scale_tile; unsigned int single_tile_decoding; -#endif // CONFIG_EXT_TILE - + int monochrome; + unsigned int full_still_picture_hdr; + int enable_dual_filter; unsigned int motion_vector_unit_test; + const 
cfg_options_t *cfg; + int enable_order_hint; + int enable_jnt_comp; + int enable_ref_frame_mvs; + unsigned int allow_ref_frame_mvs; + int enable_warped_motion; + int allow_warped_motion; + int enable_superres; + unsigned int save_as_annexb; } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int 
eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11]; + unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [LEVEL_CONTEXTS][BR_CDF_SIZE]; + unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2]; + unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1]; + unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; + unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2]; + unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; + unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; + unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + unsigned int wedge_idx[BLOCK_SIZES_ALL][16]; + unsigned int interintra[BLOCK_SIZE_GROUPS][2]; + unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; + unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; + unsigned int obmc[BLOCK_SIZES_ALL][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; + unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; + unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; + unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; + unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; + unsigned int intrabc[2]; + + unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2]; + unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1]; + unsigned int 
skip_mode[SKIP_MODE_CONTEXTS][2]; + unsigned int skip[SKIP_CONTEXTS][2]; + unsigned int compound_index[COMP_INDEX_CONTEXTS][2]; + unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2]; + unsigned int delta_q[DELTA_Q_PROBS][2]; + unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2]; + unsigned int delta_lf[DELTA_LF_PROBS][2]; + + unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + unsigned int filter_intra_mode[FILTER_INTRA_MODES]; + unsigned int filter_intra[BLOCK_SIZES_ALL][2]; + unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES]; + unsigned int wiener_restore[2]; + unsigned int sgrproj_restore[2]; +#endif // CONFIG_ENTROPY_STATS + + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FRAME_COUNTS; + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. typedef struct TileDataEnc { TileInfo tile_info; @@ -324,42 +400,31 @@ typedef struct TileDataEnc { int mode_map[BLOCK_SIZES_ALL][MAX_MODES]; int m_search_count; int ex_search_count; -#if CONFIG_PVQ - PVQ_QUEUE pvq_q; -#endif -#if CONFIG_CFL CFL_CTX cfl; -#endif DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + uint8_t allow_update_cdf; } TileDataEnc; typedef struct RD_COUNTS { int64_t comp_pred_diff[REFERENCE_MODES]; -#if CONFIG_GLOBAL_MOTION // Stores number of 4x4 blocks using global motion per reference frame. 
- int global_motion_used[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_GLOBAL_MOTION - int single_ref_used_flag; + int global_motion_used[REF_FRAMES]; int compound_ref_used_flag; + int skip_mode_used_flag; } RD_COUNTS; typedef struct ThreadData { MACROBLOCK mb; RD_COUNTS rd_counts; FRAME_COUNTS *counts; -#if !CONFIG_CB4X4 - PICK_MODE_CONTEXT *leaf_tree; -#endif PC_TREE *pc_tree; PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; -#if CONFIG_MOTION_VAR int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; -#endif - PALETTE_BUFFER *palette_buffer; + int intrabc_used_this_tile; } ThreadData; struct EncWorkerData; @@ -370,14 +435,21 @@ typedef struct ActiveMap { unsigned char *map; } ActiveMap; -#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL +#if CONFIG_INTERNAL_STATS +// types of stats +typedef enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} StatType; typedef struct IMAGE_STAT { double stat[NUM_STAT_TYPES]; double worst; } ImageStat; - -#undef NUM_STAT_TYPES +#endif // CONFIG_INTERNAL_STATS typedef struct { int ref_count; @@ -392,16 +464,18 @@ typedef struct TileBufferEnc { typedef struct AV1_COMP { QUANTS quants; ThreadData td; + FRAME_COUNTS counts; MB_MODE_INFO_EXT *mbmi_ext_base; -#if CONFIG_LV_MAP CB_COEFF_BUFFER *coeff_buffer_base; -#endif Dequants dequants; AV1_COMMON common; AV1EncoderConfig oxcf; struct lookahead_ctx *lookahead; struct lookahead_entry *alt_ref_source; + int optimize_speed_feature; + int optimize_seg_arr[MAX_SEGMENTS]; + YV12_BUFFER_CONFIG *source; YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames YV12_BUFFER_CONFIG *unscaled_source; @@ -411,58 +485,42 @@ typedef struct AV1_COMP { // For a still frame, this flag is set to 1 to skip partition search. 
int partition_search_skippable_frame; -#if CONFIG_AMVR double csm_rate_array[32]; double m_rate_array[32]; int rate_size; int rate_index; - hash_table *previsou_hash_table; - int previsous_index; + hash_table *previous_hash_table; + int previous_index; int cur_poc; // DebugInfo -#endif - int scaled_ref_idx[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_REFS - int lst_fb_idxes[LAST_REF_FRAMES]; -#else - int lst_fb_idx; -#endif // CONFIG_EXT_REFS - int gld_fb_idx; -#if CONFIG_EXT_REFS - int bwd_fb_idx; // BWDREF_FRAME - int alt2_fb_idx; // ALTREF2_FRAME -#endif // CONFIG_EXT_REFS - int alt_fb_idx; -#if CONFIG_EXT_REFS - int ext_fb_idx; // extra ref frame buffer index + int scaled_ref_idx[REF_FRAMES]; + int ref_fb_idx[REF_FRAMES]; int refresh_fb_idx; // ref frame buffer index to refresh -#endif // CONFIG_EXT_REFS int last_show_frame_buf_idx; // last show frame buffer index int refresh_last_frame; int refresh_golden_frame; -#if CONFIG_EXT_REFS int refresh_bwd_ref_frame; int refresh_alt2_ref_frame; -#endif // CONFIG_EXT_REFS int refresh_alt_ref_frame; int ext_refresh_frame_flags_pending; int ext_refresh_last_frame; int ext_refresh_golden_frame; + int ext_refresh_bwd_ref_frame; + int ext_refresh_alt2_ref_frame; int ext_refresh_alt_ref_frame; int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int ext_use_ref_frame_mvs; + int ext_use_error_resilient; + int ext_use_s_frame; + int ext_use_primary_ref_none; YV12_BUFFER_CONFIG last_frame_uf; -#if CONFIG_LOOP_RESTORATION - YV12_BUFFER_CONFIG last_frame_db; YV12_BUFFER_CONFIG trial_frame_rst; - uint8_t *extra_rstbuf; // Extra buffers used in restoration search - RestorationInfo rst_search[MAX_MB_PLANE]; // Used for encoder side search -#endif // CONFIG_LOOP_RESTORATION // Ambient reconstruction err target for force key frames int64_t ambient_err; @@ -471,22 +529,17 @@ typedef struct AV1_COMP { CODING_CONTEXT coding_context; -#if CONFIG_GLOBAL_MOTION int gmtype_cost[TRANS_TYPES]; - int 
gmparams_cost[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_GLOBAL_MOTION + int gmparams_cost[REF_FRAMES]; - int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; - int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; RATE_CONTROL rc; -#if CONFIG_XIPHRC - od_rc_state od_rc; -#endif double framerate; // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter @@ -500,6 +553,8 @@ typedef struct AV1_COMP { int mbgraph_n_frames; // number of frames filled in the above int static_mb_pct; // % forced skip mbs by segmentation int ref_frame_flags; + int ext_ref_frame_flags; + RATE_FACTOR_LEVEL frame_rf_level[FRAME_BUFFERS]; SPEED_FEATURES sf; @@ -507,6 +562,7 @@ typedef struct AV1_COMP { int mv_step_param; int allow_comp_inter_inter; + int all_one_sided_refs; uint8_t *segmentation_map; @@ -514,7 +570,6 @@ typedef struct AV1_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - av1_full_search_fn_t full_search_sad; // It is currently unused. av1_diamond_search_fn_t diamond_search_sad; aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; uint64_t time_receive_data; @@ -581,8 +636,6 @@ typedef struct AV1_COMP { search_site_config ss_cfg; int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. @@ -597,6 +650,11 @@ typedef struct AV1_COMP { int resize_buffer_underflow; int resize_count; + // Sequence parameters have been transmitted already and locked + // or not. Once locked av1_change_config cannot change the seq + // parameters. 
+ int seq_params_locked; + // VARIANCE_AQ segment map refresh int vaq_refresh; @@ -604,11 +662,6 @@ typedef struct AV1_COMP { int num_workers; AVxWorker *workers; struct EncWorkerData *tile_thr_data; - AV1LfSync lf_row_sync; -#if CONFIG_ANS - struct BufAnsCoder buf_ans; -#endif -#if CONFIG_EXT_REFS int refresh_frame_mask; int existing_fb_idx_to_show; int is_arf_filter_off[MAX_EXT_ARFS + 1]; @@ -616,22 +669,24 @@ typedef struct AV1_COMP { int arf_map[MAX_EXT_ARFS + 1]; int arf_pos_in_gf[MAX_EXT_ARFS + 1]; int arf_pos_for_ovrly[MAX_EXT_ARFS + 1]; -#endif // CONFIG_EXT_REFS -#if CONFIG_GLOBAL_MOTION int global_motion_search_done; -#endif -#if CONFIG_LV_MAP tran_low_t *tcoeff_buf[MAX_MB_PLANE]; -#endif - -#if CONFIG_EXT_REFS int extra_arf_allowed; - int bwd_ref_allowed; -#endif // CONFIG_EXT_REFS + // A flag to indicate if intrabc is ever used in current frame. + int intrabc_used; + int dv_cost[2][MV_VALS]; + // TODO(huisu@google.com): we can update dv_joint_cost per SB. + int dv_joint_cost[MV_JOINTS]; + int has_lossless_segment; + + // For frame refs short signaling: + // A mapping of each reference frame from its encoder side value to the + // decoder side value obtained following the short signaling procedure. 
+ int ref_conv[REF_FRAMES]; -#if CONFIG_BGSPRITE - int bgsprite_allowed; -#endif // CONFIG_BGSPRITE + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; } AV1_COMP; void av1_initialize_enc(void); @@ -650,12 +705,17 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush); + int64_t *time_end, int flush, + const aom_rational_t *timebase); int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags); void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags); @@ -675,6 +735,11 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, int av1_get_quantizer(struct AV1_COMP *cpi); +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size); + +int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n); +int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n); + static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); @@ -682,22 +747,7 @@ static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { -#if CONFIG_EXT_REFS - if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME) - return cpi->lst_fb_idxes[ref_frame - 1]; -#else - if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx; -#endif // CONFIG_EXT_REFS - else if (ref_frame == GOLDEN_FRAME) - return cpi->gld_fb_idx; -#if CONFIG_EXT_REFS - else if (ref_frame == BWDREF_FRAME) - return 
cpi->bwd_fb_idx; - else if (ref_frame == ALTREF2_FRAME) - return cpi->alt2_fb_idx; -#endif // CONFIG_EXT_REFS - else - return cpi->alt_fb_idx; + return (ref_frame >= 1) ? cpi->ref_fb_idx[ref_frame - 1] : INVALID_IDX; } static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, @@ -707,16 +757,19 @@ static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } -#if CONFIG_HASH_ME -static INLINE hash_table *get_ref_frame_hash_map(const AV1_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { +// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. +static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) { + return cm->allow_screen_content_tools; +} + +static INLINE hash_table *av1_get_ref_frame_hash_map( + const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { const AV1_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table : NULL; } -#endif static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { @@ -726,7 +779,6 @@ static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( : NULL; } -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { MV_REFERENCE_FRAME ref_frame; AV1_COMMON *const cm = &cpi->common; @@ -737,48 +789,42 @@ static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { } return (ref_frame <= ALTREF_FRAME); } -#endif // CONFIG_EXT_REFS -static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) { - // We assume 3 planes all at full resolution. We assume up to 1 token per - // pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane, - // plus EOSB_TOKEN per plane. - return mb_rows * mb_cols * (16 * 16 + 17) * 3; +// Token buffer is only used for palette tokens. 
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, + int sb_size_log2, + const int num_planes) { + // Calculate the maximum number of max superblocks in the image. + const int shift = sb_size_log2 - 4; + const int sb_size = 1 << sb_size_log2; + const int sb_size_square = sb_size * sb_size; + const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift; + const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift; + + // One palette token for each pixel. There can be palettes on two planes. + const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square; + + return sb_rows * sb_cols * sb_palette_toks; } // Get the allocated token size for a tile. It does the same calculation as in // the frame token allocation. -static INLINE unsigned int allocated_tokens(TileInfo tile) { -#if CONFIG_CB4X4 +static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2, + int num_planes) { int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2; int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2; -#else - int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1; - int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1; -#endif - return get_token_alloc(tile_mb_rows, tile_mb_cols); + return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } -#if CONFIG_TEMPMV_SIGNALING -void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction); -#endif - void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); +#define ALT_MIN_LAG 3 static INLINE int is_altref_enabled(const AV1_COMP *const cpi) { - return cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.enable_auto_arf; + return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf; } // TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf -#if 0 && CONFIG_EXT_REFS -static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) { - // NOTE(zoeliu): The enabling of bi-predictive frames depends on the 
use of - // alt_ref, and now will be off when the alt_ref interval is - // not sufficiently large. - return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf; -} -#endif // CONFIG_EXT_REFS static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, @@ -813,22 +859,14 @@ static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, ubufs[new_uidx].ref_count++; } -// Returns 1 if a frame is unscaled and 0 otherwise. -static INLINE int av1_resize_unscaled(const AV1_COMMON *cm) { -#if CONFIG_FRAME_SUPERRES - return cm->superres_upscaled_width == cm->render_width && - cm->superres_upscaled_height == cm->render_height; -#else - return cm->width == cm->render_width && cm->height == cm->render_height; -#endif // CONFIG_FRAME_SUPERRES +// Returns 1 if a frame is scaled and 0 otherwise. +static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { + return !(cm->superres_upscaled_width == cm->render_width && + cm->superres_upscaled_height == cm->render_height); } -static INLINE int av1_frame_unscaled(const AV1_COMMON *cm) { -#if CONFIG_FRAME_SUPERRES - return av1_superres_unscaled(cm) && av1_resize_unscaled(cm); -#else - return av1_resize_unscaled(cm); -#endif // CONFIG_FRAME_SUPERRES +static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { + return !av1_superres_scaled(cm) && av1_resize_scaled(cm); } #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 6209d6fa4..4d4802b46 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -9,65 +9,81 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/scan.h" +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/idct.h" #include "av1/common/pred_common.h" +#include "av1/common/scan.h" #include "av1/encoder/bitstream.h" -#include "av1/encoder/encodeframe.h" #include "av1/encoder/cost.h" -#include "av1/encoder/encodetxb.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" #include "av1/encoder/rdopt.h" -#include "av1/encoder/subexp.h" #include "av1/encoder/tokenize.h" -#define TEST_OPTIMIZE_TXB 0 +static int hbt_needs_init = 1; +static CRC32C crc_calculator; +static const int HBT_EOB = 16; // also the length in opt_qcoeff +static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays' +static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries +// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type +static const int HBT_KICKOUT = 3; + +typedef struct OptTxbQcoeff { + // Use larger type if larger/no kickout value is used in hbt_create_hashes + int8_t deltas[16]; + uint32_t hbt_qc_hash; + uint32_t hbt_ctx_hash; + int init; + int rate_cost; +} OptTxbQcoeff; + +OptTxbQcoeff *hbt_hash_table; + +typedef struct LevelDownStats { + int update; + tran_low_t low_qc; + tran_low_t low_dqc; + int64_t dist0; + int rate; + int rate_low; + int64_t dist; + int64_t dist_low; + int64_t rd; + int64_t rd_low; + int64_t nz_rd; + int64_t rd_diff; + int cost_diff; + int64_t dist_diff; + int new_eob; +} LevelDownStats; void av1_alloc_txb_buf(AV1_COMP *cpi) { -#if 0 - AV1_COMMON *cm = &cpi->common; - int mi_block_size = 1 << MI_SIZE_LOG2; - // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then - // use precise buffer size according to cm->subsampling_x/y - int pixel_stride = mi_block_size * cm->mi_cols; - int pixel_height = mi_block_size * cm->mi_rows; - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - CHECK_MEM_ERROR( - cm, cpi->tcoeff_buf[i], - 
aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height)); - } -#else AV1_COMMON *cm = &cpi->common; - int size = ((cm->mi_rows >> MAX_MIB_SIZE_LOG2) + 1) * - ((cm->mi_cols >> MAX_MIB_SIZE_LOG2) + 1); + int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1); av1_free_txb_buf(cpi); // TODO(jingning): This should be further reduced. CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, - aom_malloc(sizeof(*cpi->coeff_buffer_base) * size)); -#endif + aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size)); } -void av1_free_txb_buf(AV1_COMP *cpi) { -#if 0 - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - aom_free(cpi->tcoeff_buf[i]); - } -#else - aom_free(cpi->coeff_buffer_base); -#endif -} +void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); } void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col) { - int stride = (cpi->common.mi_cols >> MAX_MIB_SIZE_LOG2) + 1; - int offset = - (mi_row >> MAX_MIB_SIZE_LOG2) * stride + (mi_col >> MAX_MIB_SIZE_LOG2); + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + int mib_size_log2 = cm->seq_params.mib_size_log2; + int stride = (cm->mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset]; const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size])); + for (int plane = 0; plane < num_planes; ++plane) { x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset; x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset; x->mbmi_ext->txb_skip_ctx[plane] = @@ -93,435 +109,147 @@ static void write_golomb(aom_writer *w, int level) { for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); } 
-static INLINE void write_nz_map(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, const int16_t *scan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; -#if CONFIG_CTX1D - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int seg_eob = - (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; - aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type]; -#endif - - for (int c = 0; c < eob; ++c) { - int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx, tx_type); - - tran_low_t v = tcoeff[scan[c]]; - int is_nz = (v != 0); - - if (c == seg_eob - 1) break; - -#if LV_MAP_PROB - aom_write_bin(w, is_nz, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - - if (is_nz) { -#if LV_MAP_PROB - aom_write_bin(w, c == (eob - 1), - fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2); -#else - aom_write(w, c == (eob - 1), eob_flag[eob_ctx]); -#endif - } +static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { + if (qc == 0) { + return 0; } + return qc > 0 ? 
qc - 1 : qc + 1; } -#if CONFIG_CTX1D -static INLINE void write_nz_map_vert(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, - const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, tcoeff, width, height); -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); -#if LV_MAP_PROB - aom_write_bin(w, veob == 0, - fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2); -#else - aom_write(w, veob == 0, - fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]); -#endif - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - aom_write_bin(w, is_nz, - fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); -#if LV_MAP_PROB - aom_write_bin( - w, r == veob - 1, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2); -#else - aom_write(w, r == veob - 1, - fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]); -#endif - } - } - } - } - } +static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx, + int dqv, int shift, + const qm_val_t *iqmatrix) { + int sign = qc < 0 ? 
-1 : 1; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return sign * ((abs(qc) * dqv) >> shift); } -static INLINE void write_nz_map_horiz(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, - const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - (void)scan; - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, tcoeff, width, height); -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - int el_ctx = get_empty_line_ctx(r, eob_ls); -#if LV_MAP_PROB - aom_write_bin(w, heob == 0, - fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2); -#else - aom_write(w, heob == 0, - fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]); -#endif - if (heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - aom_write_bin(w, is_nz, - fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(r, c, eob_ls); -#if LV_MAP_PROB - aom_write_bin( - w, c == heob - 1, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2); -#else - aom_write(w, c == heob - 1, - fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]); -#endif - } - } - } - } - } +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int 
shift) { + const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + const int64_t error = diff * diff; + return error; } -#endif - -void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int blk_row, int blk_col, int block, - int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, - uint16_t eob, TXB_CTX *txb_ctx) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; - int c; - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - uint16_t update_eob = 0; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - - (void)blk_row; - (void)blk_col; -#if LV_MAP_PROB - aom_write_bin(w, eob == 0, - ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); +#if CONFIG_ENTROPY_STATS +void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { #else - aom_write(w, eob == 0, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]); +void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { #endif + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); - if (eob == 0) return; -#if CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane, - get_min_tx_size(tx_size), w); -#endif + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); - } else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; -#if LV_MAP_PROB - aom_write_bin(w, eob_mode, - ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2); -#else - aom_write(w, eob_mode, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class]); + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - if (eob_mode == 0) { - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); - } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - write_nz_map_vert(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type, - ec_ctx); - else - write_nz_map_horiz(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type, - ec_ctx); - } - } -#else - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); -#endif // CONFIG_CTX1D - - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { -#if !LV_MAP_PROB - aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i]; + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - update_eob = 0; - for (c = eob - 1; c >= 0; --c) { - tran_low_t v = tcoeff[scan[c]]; - tran_low_t level = abs(v); - int sign = (v < 0) ? 
1 : 0; - int ctx; - - if (level <= i) continue; - - ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); - - if (level == i + 1) { -#if LV_MAP_PROB - aom_write_bin(w, 1, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], - 2); -#else - aom_write(w, 1, coeff_base[ctx]); + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - if (c == 0) { -#if LV_MAP_PROB - aom_write_bin(w, sign, - ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], - 2); -#else - aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - } else { - aom_write_bit(w, sign); - } - continue; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); } - -#if LV_MAP_PROB - aom_write_bin(w, 0, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], - 2); -#else - aom_write(w, 0, coeff_base[ctx]); -#endif - update_eob = AOMMAX(update_eob, c); - } - } - - for (c = update_eob; c >= 0; --c) { - tran_low_t v = tcoeff[scan[c]]; - tran_low_t level = abs(v); - int sign = (v < 0) ? 1 : 0; - int idx; - int ctx; - - if (level <= NUM_BASE_LEVELS) continue; - - if (c == 0) { -#if LV_MAP_PROB - aom_write_bin(w, sign, - ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2); -#else - aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - } else { - aom_write_bit(w, sign); - } - - // level is above 1. 
- ctx = get_br_ctx(tcoeff, scan[c], bwl, height); - -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - int br_set_idx = 0; - int br_base = 0; - int br_offset = 0; - - if (base_range >= COEFF_BASE_RANGE) - br_set_idx = BASE_RANGE_SETS; - else - br_set_idx = coeff_to_br_index[base_range]; - - for (idx = 0; idx < BASE_RANGE_SETS; ++idx) { - aom_write_bin(w, idx == br_set_idx, - ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2); - if (idx == br_set_idx) { - br_base = br_index_to_coeff[br_set_idx]; - br_offset = base_range - br_base; - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (tok == br_offset) { - aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], - 2); - break; - } - aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], - 2); - } - // aom_write_literal(w, br_offset, br_extra_bits[idx]); - break; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); } - } - - if (br_set_idx < BASE_RANGE_SETS) continue; -#else // BR_NODE - for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { -#if LV_MAP_PROB - aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2); -#else - aom_write(w, 1, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]); + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - break; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); } -#if LV_MAP_PROB - aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2); -#else - aom_write(w, 0, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]); -#endif - } - if (idx < COEFF_BASE_RANGE) continue; -#endif // BR_NODE - - // use 0-th order Golomb code to handle the residual level. 
- write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; } -} -void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, - aom_writer *w, int plane) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; - struct macroblockd_plane *pd = &xd->plane[plane]; - -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); -#endif - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const int bkw = tx_size_wide_unit[tx_size]; - const int bkh = tx_size_high_unit[tx_size]; - const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size]; - int row, col; - int block = 0; - for (row = 0; row < max_blocks_high; row += bkh) { - for (col = 0; col < max_blocks_wide; col += bkw) { - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - uint16_t eob = x->mbmi_ext->eobs[plane][block]; - TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], - x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, row, col, block, plane, tx_size, tcoeff, - eob, &txb_ctx); - block += step; - } + if (k_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = k_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); } } -static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, - int c, // raster order - const int bwl, const int height, - int ctx_set[NUM_BASE_LEVELS]) { - const int row = c >> bwl; - const int col = c - (row << bwl); - const int stride = 1 << bwl; - int mag[NUM_BASE_LEVELS] = { 0 }; - int idx; - tran_low_t abs_coeff; - int i; - - for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { - int ref_row = row + base_ref_offset[idx][0]; - int ref_col = col + base_ref_offset[idx][1]; - int pos = (ref_row << bwl) + ref_col; - - if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride) - continue; - - abs_coeff = abs(tcoeffs[pos]); - - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - ctx_set[i] += abs_coeff > i; - if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) - mag[i] |= abs_coeff > (i + 1); - } - } +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = k_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = k_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - ctx_set[i] = get_base_ctx_from_count_mag(row, col, ctx_set[i], mag[i]); +static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, + const int (*dc_sign_cost)[2], + int dc_sign_ctx) { + if (coeff_idx == 0) { + const int sign = (qc < 0) ? 1 : 0; + return dc_sign_cost[dc_sign_ctx][sign]; } - return; + return av1_cost_literal(1); } static INLINE int get_br_cost(tran_low_t abs_qc, int ctx, @@ -530,1440 +258,1522 @@ static INLINE int get_br_cost(tran_low_t abs_qc, int ctx, const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE; (void)ctx; if (abs_qc >= min_level) { -#if BR_NODE - if (abs_qc >= max_level) + if (abs_qc >= max_level) { return coeff_lps[COEFF_BASE_RANGE]; // COEFF_BASE_RANGE * cost0; - else + } else { return coeff_lps[(abs_qc - min_level)]; // * cost0 + cost1; -#else - const int cost0 = coeff_lps[0]; - const int cost1 = coeff_lps[1]; - if (abs_qc >= max_level) - return COEFF_BASE_RANGE * cost0; - else - return (abs_qc - min_level) * cost0 + cost1; -#endif - } else { - return 0; + } } + return 0; } -static INLINE int get_base_cost(tran_low_t abs_qc, int ctx, - const int coeff_base[2], int base_idx) { - const int level = base_idx + 1; - (void)ctx; - if (abs_qc < level) - return 0; - else - return coeff_base[abs_qc == level]; -} - -int get_nz_eob_map_cost(const LV_MAP_COEFF_COST *coeff_costs, - const tran_low_t *qcoeff, uint16_t eob, int plane, - const int16_t *scan, TX_SIZE tx_size, TX_TYPE tx_type) { - (void)plane; - TX_SIZE txs_ctx = get_txsize_context(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; -#if CONFIG_CTX1D - const TX_CLASS tx_class = get_tx_class(tx_type); - const int width = tx_size_wide[tx_size]; - 
const int eob_offset = width + height; - const int seg_eob = - (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif - int cost = 0; - for (int c = 0; c < eob; ++c) { - tran_low_t v = qcoeff[scan[c]]; - int is_nz = (v != 0); - if (c + 1 != seg_eob) { - int coeff_ctx = get_nz_map_ctx(qcoeff, c, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx, tx_type); - cost += coeff_costs->eob_cost[eob_ctx][c == (eob - 1)]; - } - } +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); } - return cost; + return 0; } -#if CONFIG_CTX1D -static INLINE int get_nz_eob_map_cost_vert(const LV_MAP_COEFF_COST *coeff_costs, - const tran_low_t *qcoeff, - uint16_t eob, int plane, - const int16_t *scan, - const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type) { - (void)tx_size; - (void)scan; - (void)eob; - (void)plane; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, qcoeff, width, height); +static int get_coeff_cost(const tran_low_t qc, const int scan_idx, + const int is_eob, const TxbInfo *const txb_info, + const LV_MAP_COEFF_COST *const txb_costs, + const int coeff_ctx, const TX_CLASS tx_class) { + const TXB_CTX *const txb_ctx = txb_info->txb_ctx; + const int is_nz = (qc != 0); + const tran_low_t abs_qc = abs(qc); int cost = 0; - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); - cost += coeff_costs->empty_line_cost[tx_class][el_ctx][veob 
== 0]; - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = qcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); - cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][r == veob - 1]; - } - } - } - } + const int16_t *const scan = txb_info->scan_order->scan; + const int pos = scan[scan_idx]; + + if (is_eob) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; } - return cost; -} + if (is_nz) { + cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, + txb_ctx->dc_sign_ctx); -static INLINE int get_nz_eob_map_cost_horiz( - const LV_MAP_COEFF_COST *coeff_costs, const tran_low_t *qcoeff, - uint16_t eob, int plane, const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type) { - (void)tx_size; - (void)scan; - (void)eob; - (void)plane; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, qcoeff, width, height); - int cost = 0; - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - assert(heob <= width); - int el_ctx = get_empty_line_ctx(r, eob_ls); - cost += coeff_costs->empty_line_cost[tx_class][el_ctx][heob == 0]; - if (heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = qcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = 
get_hv_eob_ctx(r, c, eob_ls); - cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][c == heob - 1]; - } - } - } + if (abs_qc > NUM_BASE_LEVELS) { + const int ctx = + get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class); + cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); + cost += get_golomb_cost(abs_qc); } } return cost; } -#endif -int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx) { - MACROBLOCKD *const xd = &x->e_mbd; - TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblock_plane *p = &x->plane[plane]; - const int eob = p->eobs[block]; - const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int c, cost; - int txb_skip_ctx = txb_ctx->txb_skip_ctx; +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bwl, + const int height, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; +static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx, + const int is_eob, + const LV_MAP_COEFF_COST *const txb_costs, + const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = 
txb_info->qcoeff[coeff_idx]; + const uint8_t *const levels = txb_info->levels; + stats->new_eob = -1; + stats->update = 0; + stats->rd_low = 0; + stats->rd = 0; + stats->nz_rd = 0; + stats->dist_low = 0; + stats->rate_low = 0; + stats->low_qc = 0; - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; + const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; + const int dqv = txb_info->dequant[coeff_idx != 0]; + const int coeff_ctx = + get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height, + scan_idx, is_eob, txb_info->tx_size, tx_class); + const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + assert(qc != 0); + const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift, + txb_info->iqmatrix); + const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); - LV_MAP_COEFF_COST *coeff_costs = &x->coeff_costs[txs_ctx][plane_type]; + // distortion difference when coefficient is quantized to 0 + const tran_low_t dqc0 = + qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); - cost = 0; + stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift); + stats->dist = dqc_dist - stats->dist0; + stats->rate = qc_cost; - if (eob == 0) { - cost = coeff_costs->txb_skip_cost[txb_skip_ctx][1]; - return cost; - } - cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist); -#if CONFIG_TXK_SEL - cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type); -#endif + stats->low_qc = get_lower_coeff(qc); -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size, - tx_type); + if (is_eob && stats->low_qc == 0) { + stats->rd_low = stats->rd; // disable selection of low_qc in this case. 
} else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; - cost += coeff_costs->eob_mode_cost[tx_class][eob_mode]; - if (eob_mode == 0) { - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, - tx_size, tx_type); + if (stats->low_qc == 0) { + stats->dist_low = 0; } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - cost += get_nz_eob_map_cost_vert(coeff_costs, qcoeff, eob, plane, scan, - iscan, tx_size, tx_type); - else - cost += get_nz_eob_map_cost_horiz(coeff_costs, qcoeff, eob, plane, scan, - iscan, tx_size, tx_type); + stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv, + txb_info->shift, txb_info->iqmatrix); + const int64_t low_dqc_dist = + get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); + stats->dist_low = low_dqc_dist - stats->dist0; } + const int low_qc_cost = + get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + stats->rate_low = low_qc_cost; + stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low); } -#else // CONFIG_CTX1D - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size, - tx_type); -#endif // CONFIG_CTX1D - - for (c = 0; c < eob; ++c) { - tran_low_t v = qcoeff[scan[c]]; - int is_nz = (v != 0); - int level = abs(v); - - if (is_nz) { - int ctx_ls[NUM_BASE_LEVELS] = { 0 }; - int sign = (v < 0) ? 
1 : 0; - - // sign bit cost - if (c == 0) { - int dc_sign_ctx = txb_ctx->dc_sign_ctx; - cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign]; - } else { - cost += av1_cost_bit(128, sign); - } +} - get_base_ctx_set(qcoeff, scan[c], bwl, height, ctx_ls); +static void get_dist_cost_stats_with_eob( + LevelDownStats *const stats, const int scan_idx, + const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int is_eob = 0; + get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class); - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - if (level <= i) continue; + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int coeff_ctx_temp = get_nz_map_ctx( + txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1, + txb_info->tx_size, tx_class); + const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist); + if (stats->low_qc != 0) { + const int low_qc_eob_cost = + get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob_low = + RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low); + rd_eob = (rd_eob > rd_eob_low) ? 
rd_eob_low : rd_eob; + } + + stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob; +} - if (level == i + 1) { - cost += coeff_costs->base_cost[i][ctx_ls[i]][1]; - continue; - } - cost += coeff_costs->base_cost[i][ctx_ls[i]][0]; - } +static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + txb_info->qcoeff[coeff_idx] = qc; + txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] = + (uint8_t)clamp(abs(qc), 0, INT8_MAX); +} - if (level > NUM_BASE_LEVELS) { - int ctx; - ctx = get_br_ctx(qcoeff, scan[c], bwl, height); -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - if (base_range < COEFF_BASE_RANGE) { - cost += coeff_costs->lps_cost[ctx][base_range]; - } else { - cost += coeff_costs->lps_cost[ctx][COEFF_BASE_RANGE]; - } +static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + update_qcoeff(coeff_idx, qc, txb_info); + const int dqv = txb_info->dequant[coeff_idx != 0]; + txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff( + qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); +} -#else - for (int idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { - cost += coeff_costs->lps_cost[ctx][1]; - break; - } - cost += coeff_costs->lps_cost[ctx][0]; - } -#endif - if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - // residual cost - int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; - int ri = r; - int length = 0; - - while (ri) { - ri >>= 1; - ++length; - } +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + uint8_t *ls = levels; - for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0); + memset(levels - TX_PAD_TOP * stride, 0, + sizeof(*levels) * TX_PAD_TOP * stride); + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); - for (ri = length 
- 1; ri >= 0; --ri) - cost += av1_cost_bit(128, (r >> ri) & 0x01); - } - } + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; } } - - return cost; } -static INLINE int has_base(tran_low_t qc, int base_idx) { - const int level = base_idx + 1; - return abs(qc) >= level; +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bwl = get_txb_bwl(tx_size); + const int height = get_txb_high(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i, + i == eob - 1, tx_size, tx_class); + } } -static INLINE int has_br(tran_low_t qc) { - return abs(qc) >= 1 + NUM_BASE_LEVELS; -} +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, + aom_writer *w, int blk_row, int blk_col, int plane, + TX_SIZE tx_size, const tran_low_t *tcoeff, + uint16_t eob, TXB_CTX *txb_ctx) { + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int c; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); -static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, - const int (*dc_sign_cost)[2], - int 
dc_sign_ctx) { - const int sign = (qc < 0) ? 1 : 0; - // sign bit cost - if (coeff_idx == 0) { - return dc_sign_cost[dc_sign_ctx][sign]; - } else { - return av1_cost_bit(128, sign); + aom_write_symbol(w, eob == 0, + ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); + if (plane == 0 && eob == 0) { + assert(tx_type == DCT_DCT); } -} -static INLINE int get_golomb_cost(int abs_qc) { - if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - // residual cost - int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; - int ri = r; - int length = 0; - - while (ri) { - ri >>= 1; - ++length; - } + if (eob == 0) return; - return av1_cost_literal(2 * length - 1); - } else { - return 0; - } -} + av1_txb_init_levels(tcoeff, width, height, levels); -void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) { - // gen_nz_count_arr - const int16_t *scan = txb_info->scan_order->scan; - const int bwl = txb_info->bwl; - const int height = txb_info->height; - tran_low_t *qcoeff = txb_info->qcoeff; - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - for (int c = 0; c < txb_info->eob; ++c) { - const int coeff_idx = scan[c]; // raster order - const int row = coeff_idx >> bwl; - const int col = coeff_idx - (row << bwl); -#if REDUCE_CONTEXT_DEPENDENCY - int prev_coeff_idx; - int prev_row; - int prev_col; - if (c > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) { - prev_coeff_idx = scan[c - 1]; // raster order - prev_row = prev_coeff_idx >> bwl; - prev_col = prev_coeff_idx - (prev_row << bwl); - } else { - prev_coeff_idx = -1; - prev_row = -1; - prev_col = -1; - } - txb_cache->nz_count_arr[coeff_idx] = - get_nz_count(qcoeff, bwl, height, row, col, prev_row, prev_col); -#else - txb_cache->nz_count_arr[coeff_idx] = - get_nz_count(qcoeff, bwl, height, row, col); -#endif - const int nz_count = txb_cache->nz_count_arr[coeff_idx]; - txb_cache->nz_ctx_arr[coeff_idx] = - get_nz_map_ctx_from_count(nz_count, coeff_idx, bwl, txb_info->tx_type); - - // 
gen_base_count_mag_arr - if (!has_base(qcoeff[coeff_idx], 0)) continue; - int *base_mag = txb_cache->base_mag_arr[coeff_idx]; - int count[NUM_BASE_LEVELS]; - get_base_count_mag(base_mag, count, qcoeff, bwl, height, row, col); - - for (int i = 0; i < NUM_BASE_LEVELS; ++i) { - if (!has_base(qcoeff[coeff_idx], i)) break; - txb_cache->base_count_arr[i][coeff_idx] = count[i]; - const int level = i + 1; - txb_cache->base_ctx_arr[i][coeff_idx] = - base_ctx_table[row != 0][col != 0][base_mag[0] > level][count[i]]; - } + av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w); - // gen_br_count_mag_arr - if (!has_br(qcoeff[coeff_idx])) continue; - int *br_count = txb_cache->br_count_arr + coeff_idx; - int *br_mag = txb_cache->br_mag_arr[coeff_idx]; - *br_count = get_br_count_mag(br_mag, qcoeff, bwl, height, row, col, - NUM_BASE_LEVELS); - txb_cache->br_ctx_arr[coeff_idx] = - get_br_ctx_from_count_mag(row, col, *br_count, br_mag[0]); + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; } -} -static INLINE const int *get_level_prob(int level, int coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs) { - if (level == 0) { - const int ctx = txb_cache->nz_ctx_arr[coeff_idx]; - return txb_costs->nz_map_cost[ctx]; - } else if (level >= 1 && level < 1 + NUM_BASE_LEVELS) { - const int idx = level - 1; - const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx]; - return txb_costs->base_cost[idx][ctx]; - } else if (level >= 1 + NUM_BASE_LEVELS && - level < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int ctx = txb_cache->br_ctx_arr[coeff_idx]; - return txb_costs->lps_cost[ctx]; - } else if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - printf("get_level_prob does not support golomb\n"); - assert(0); - return 0; - } else { - assert(0); - return 0; + if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = k_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { + eob_shift = k_eob_offset_bits[eob_pt] - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); + } } -} -static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { - if (qc == 0) { - return 0; - } - return qc > 0 ? qc - 1 : qc + 1; -} + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); -static INLINE void update_mag_arr(int *mag_arr, int abs_qc) { - if (mag_arr[0] == abs_qc) { - mag_arr[1] -= 1; - assert(mag_arr[1] >= 0); - } -} + for (c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); -static INLINE int get_mag_from_mag_arr(const int *mag_arr) { - int mag; - if (mag_arr[1] > 0) { - mag = mag_arr[0]; - } else if (mag_arr[0] > 0) { - mag = mag_arr[0] - 1; - } else { - // no neighbor - assert(mag_arr[0] == 0 && mag_arr[1] == 0); - mag = 0; + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); + } else { + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); + } + if (level > NUM_BASE_LEVELS) { + // level is above 1. 
+ const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol( + w, k, + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx], + BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } + } } - return mag; -} -static int neighbor_level_down_update(int *new_count, int *new_mag, int count, - const int *mag, int coeff_idx, - tran_low_t abs_nb_coeff, int nb_coeff_idx, - int level, const TxbInfo *txb_info) { - *new_count = count; - *new_mag = get_mag_from_mag_arr(mag); - - int update = 0; - // check if br_count changes - if (abs_nb_coeff == level) { - update = 1; - *new_count -= 1; - assert(*new_count >= 0); - } - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int nb_row = nb_coeff_idx >> txb_info->bwl; - const int nb_col = nb_coeff_idx - (nb_row << txb_info->bwl); - - // check if mag changes - if (nb_row >= row && nb_col >= col) { - if (abs_nb_coeff == mag[0]) { - assert(mag[1] > 0); - if (mag[1] == 1) { - // the nb is the only qc with max mag - *new_mag -= 1; - assert(*new_mag >= 0); - update = 1; + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + aom_write_symbol( + w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2); + } else { + aom_write_bit(w, sign); } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); } } - return update; } -static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const int level = NUM_BASE_LEVELS + 1; - if (abs_qc < level) return 0; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - const int count = txb_cache->br_count_arr[coeff_idx]; - const int *mag = txb_cache->br_mag_arr[coeff_idx]; - int new_count; - int new_mag; - const int update = - neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, - abs_nb_coeff, nb_coeff_idx, level, txb_info); - if (update) { - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int ctx = txb_cache->br_ctx_arr[coeff_idx]; - const int org_cost = get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); - - const int new_ctx = get_br_ctx_from_count_mag(row, col, new_count, new_mag); - const int new_cost = - get_br_cost(abs_qc, new_ctx, txb_costs->lps_cost[new_ctx]); - const int cost_diff = -org_cost + new_cost; - return cost_diff; - } else { - return 0; - } +typedef struct encode_txb_args { + const AV1_COMMON *cm; + MACROBLOCK *x; + aom_writer *w; +} ENCODE_TXB_ARGS; + +static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x, + aom_writer *w, int plane, int block, + int blk_row, int blk_col, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + uint16_t eob = x->mbmi_ext->eobs[plane][block]; + TXB_CTX txb_ctx = { 
x->mbmi_ext->txb_skip_ctx[plane][block], + x->mbmi_ext->dc_sign_ctx[plane][block] }; + av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob, + &txb_ctx); } -static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - - int cost_diff = 0; - for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { - const int level = base_idx + 1; - if (abs_qc < level) continue; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - - const int count = txb_cache->base_count_arr[base_idx][coeff_idx]; - const int *mag = txb_cache->base_mag_arr[coeff_idx]; - int new_count; - int new_mag; - const int update = - neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, - abs_nb_coeff, nb_coeff_idx, level, txb_info); - if (update) { - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx]; - const int org_cost = get_base_cost( - abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], base_idx); - - const int new_ctx = - base_ctx_table[row != 0][col != 0][new_mag > level][new_count]; - const int new_cost = get_base_cost( - abs_qc, new_ctx, txb_costs->base_cost[base_idx][new_ctx], base_idx); - cost_diff += -org_cost + new_cost; +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row, + int mi_col, aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int 
max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row, + blk_col, tx_size); + block[plane] += step; + } + } + } } } - return cost_diff; } -static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - // assume eob doesn't change - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - if (abs_nb_coeff != 1) return 0; - const int16_t *iscan = 
txb_info->scan_order->iscan; - const int scan_idx = iscan[coeff_idx]; - if (scan_idx == txb_info->seg_eob) return 0; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < scan_idx) { - const int count = txb_cache->nz_count_arr[coeff_idx]; - assert(count > 0); - txb_info->qcoeff[nb_coeff_idx] = get_lower_coeff(nb_coeff); - const int new_ctx = get_nz_map_ctx_from_count( - count - 1, coeff_idx, txb_info->bwl, txb_info->tx_type); - txb_info->qcoeff[nb_coeff_idx] = nb_coeff; - const int ctx = txb_cache->nz_ctx_arr[coeff_idx]; - const int is_nz = abs_qc > 0; - const int org_cost = txb_costs->nz_map_cost[ctx][is_nz]; - const int new_cost = txb_costs->nz_map_cost[new_ctx][is_nz]; - const int cost_diff = new_cost - org_cost; - return cost_diff; - } else { - return 0; +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type) { + if (plane > 0) return 0; + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; + } else { + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir] + [tx_type]; + } + } } + return 0; } -static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - const tran_low_t 
qc = txb_info->qcoeff[coeff_idx]; - if (qc == 0) { - *low_coeff = 0; - return 0; +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, + const TX_CLASS tx_class) { + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = v >> 31; + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, 
COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; + } + } } - const tran_low_t abs_qc = abs(qc); - *low_coeff = get_lower_coeff(qc); - int cost_diff; - if (*low_coeff == 0) { - const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - if (scan_idx < txb_info->seg_eob) { - // When level-0, we code the binary of abs_qc > level - // but when level-k k > 0 we code the binary of abs_qc == level - // That's why wee need this special treatment for level-0 map - // TODO(angiebird): make leve-0 consistent to other levels - cost_diff = -level_cost[1] + low_level_cost[0] - low_level_cost[1]; - } else { - cost_diff = -level_cost[1]; + const int(*base_cost)[4] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); + } } + cost += cost0; + } + if (c == 0) { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int coeff_ctx = coeff_contexts[pos]; + const int sign = v >> 31; + const int level = (v ^ sign) - sign; + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; - if (scan_idx < txb_info->seg_eob) { - const int eob_ctx = 
get_eob_ctx(txb_info->qcoeff, coeff_idx, - txb_info->txs_ctx, txb_info->tx_type); - cost_diff -= - txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; + if (v) { + // sign bit cost + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); + } } - - const int sign_cost = get_sign_bit_cost( - qc, coeff_idx, txb_costs->dc_sign_cost, txb_info->txb_ctx->dc_sign_ctx); - cost_diff -= sign_cost; - } else if (abs_qc <= NUM_BASE_LEVELS) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; - } else if (abs_qc == NUM_BASE_LEVELS + 1) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); -#if BR_NODE - cost_diff = -level_cost[0] + low_level_cost[1] - low_level_cost[0]; -#else - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; -#endif - } else if (abs_qc < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - -#if BR_NODE - cost_diff = -level_cost[abs_qc - 1 - NUM_BASE_LEVELS] + - low_level_cost[abs(*low_coeff) - 1 - NUM_BASE_LEVELS]; -#else - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; -#endif - } else if (abs_qc == 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int *low_level_cost = - 
get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); -#if BR_NODE - cost_diff = -get_golomb_cost(abs_qc) - low_level_cost[COEFF_BASE_RANGE] + - low_level_cost[COEFF_BASE_RANGE - 1]; -#else - cost_diff = - -get_golomb_cost(abs_qc) + low_level_cost[1] - low_level_cost[0]; -#endif - } else { - assert(abs_qc > 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE); - const tran_low_t abs_low_coeff = abs(*low_coeff); - cost_diff = -get_golomb_cost(abs_qc) + get_golomb_cost(abs_low_coeff); } - return cost_diff; -} - -#define COST_MAP_SIZE 5 -#define COST_MAP_OFFSET 2 - -static INLINE int check_nz_neighbor(tran_low_t qc) { return abs(qc) == 1; } - -static INLINE int check_base_neighbor(tran_low_t qc) { - return abs(qc) <= 1 + NUM_BASE_LEVELS; + return cost; } -static INLINE int check_br_neighbor(tran_low_t qc) { - return abs(qc) > BR_MAG_OFFSET; +int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, + const int plane, const int blk_row, const int blk_col, + const int block, const TX_SIZE tx_size, + const TXB_CTX *const txb_ctx) { + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + +#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal) \ + case tx_class_literal: \ + return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p, \ + eob, plane_type, coeff_costs, xd, tx_type, \ + tx_class_literal); + switch (tx_class) { + WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D); + WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ); + 
WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT); +#undef WAREHOUSE_EFFICIENTS_TXB_CASE + default: assert(false); return 0; + } } -#define FAST_OPTIMIZE_TXB 1 +static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) { + int update = 0; + if (txb_info->eob == 0) return update; + const int16_t *const scan = txb_info->scan_order->scan; + // forward optimize the nz_map` + const int init_eob = txb_info->eob; + const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type]; + const int eob_cost = + get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class); -#if FAST_OPTIMIZE_TXB -#define ALNB_REF_OFFSET_NUM 2 -static int alnb_ref_offset[ALNB_REF_OFFSET_NUM][2] = { - { -1, 0 }, { 0, -1 }, -}; -#define NB_REF_OFFSET_NUM 4 -static int nb_ref_offset[NB_REF_OFFSET_NUM][2] = { - { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 }, -}; -#endif // FAST_OPTIMIZE_TXB + // backward optimize the level-k map + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int64_t prev_eob_rd_cost = INT64_MAX; + int64_t cur_eob_rd_cost = 0; -// TODO(angiebird): add static to this function once it's called -int try_level_down(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info, - int (*cost_map)[COST_MAP_SIZE], int fast_mode) { -#if !FAST_OPTIMIZE_TXB - (void)fast_mode; -#endif - if (cost_map) { - for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); - } - - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - tran_low_t low_coeff; - if (qc == 0) return 0; - int accu_cost_diff = 0; - - const int16_t *iscan = txb_info->scan_order->iscan; - const int eob = txb_info->eob; - const int scan_idx = iscan[coeff_idx]; - if (scan_idx < eob) { - const int cost_diff = try_self_level_down(&low_coeff, coeff_idx, txb_cache, - txb_costs, txb_info); - if (cost_map) - cost_map[0 + COST_MAP_OFFSET][0 + COST_MAP_OFFSET] = cost_diff; - accu_cost_diff += cost_diff; - } - - const int row = coeff_idx >> 
txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - if (check_nz_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = alnb_ref_offset; - ref_num = ALNB_REF_OFFSET_NUM; + { + const int si = init_eob - 1; + const int coeff_idx = scan[si]; + LevelDownStats stats; + get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info, + tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; } else { - ref_offset = sig_ref_offset; - ref_num = SIG_REF_OFFSET_NUM; - } -#else - int(*ref_offset)[2] = sig_ref_offset; - const int ref_num = SIG_REF_OFFSET_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_nz( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; - } + accu_rate += stats.rate; + accu_dist += stats.dist; } } - if (check_base_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = nb_ref_offset; - ref_num = NB_REF_OFFSET_NUM; + int si = init_eob - 2; + int8_t has_nz_tail = 0; + // eob is not fixed + for (; si >= 0 && has_nz_tail < 2; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + 
txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { - ref_offset = base_ref_offset; - ref_num = BASE_CONTEXT_POSITION_NUM; - } -#else - int(*ref_offset)[2] = base_ref_offset; - int ref_num = BASE_CONTEXT_POSITION_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_base( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; + LevelDownStats stats; + get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class); + // check if it is better to make this the last significant coefficient + int cur_eob_rate = + get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class); + cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0); + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd; + if (cur_eob_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = si + 1; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = si + 1; + + // rerun cost calculation due to change of eob + accu_rate = cur_eob_rate; + accu_dist = 0; + get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + + // reset non zero tail when new eob is found + has_nz_tail = 0; + } 
else { + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } else { + ++has_nz_tail; + } + + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } } } - } + } // for (si) + + // eob is fixed + for (; si >= 0; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (check_br_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = nb_ref_offset; - ref_num = NB_REF_OFFSET_NUM; + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { - ref_offset = br_ref_offset; - ref_num = BR_CONTEXT_POSITION_NUM; - } -#else - int(*ref_offset)[2] = br_ref_offset; - const int ref_num = BR_CONTEXT_POSITION_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_br( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; + LevelDownStats stats; + get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class); + + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, 
txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; } } - } + } // for (si) - return accu_cost_diff; -} + int non_zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0]; + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist); -static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - assert(abs_qc <= 1); - int cost = 0; - const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; - if (scan_idx < txb_info->seg_eob) { - const int *level_cost = get_level_prob(0, coeff_idx, txb_cache, txb_costs); - cost += level_cost[qc != 0]; - } - - if (qc != 0) { - const int base_idx = 0; - const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx]; - cost += get_base_cost(abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], - base_idx); - if (scan_idx < txb_info->seg_eob) { - const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx, - txb_info->txs_ctx, txb_info->tx_type); - cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; + int zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1]; + int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0); + if (zero_blk_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = 0; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); } - cost += get_sign_bit_cost(qc, coeff_idx, txb_costs->dc_sign_cost, - txb_info->txb_ctx->dc_sign_ctx); + txb_info->eob = 0; } - return cost; + + // record total rate cost + *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost + ? 
zero_blk_rate + : accu_rate + non_zero_blk_rate; + + if (txb_info->eob > 0) { + *rate_cost += txb_info->tx_type_cost; + } + + return update; } -static INLINE void set_eob(TxbInfo *txb_info, int eob) { - txb_info->eob = eob; - txb_info->seg_eob = AOMMIN(eob, tx_size_2d[txb_info->tx_size] - 1); +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; + +void hbt_init() { + hbt_hash_table = + aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + memset(hbt_hash_table, 0, + sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + av1_crc32c_calculator_init(&crc_calculator); // 31 bit: qc & ctx + + hbt_needs_init = 0; } -// TODO(angiebird): add static to this function once it's called -int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info, - int fast_mode) { - assert(txb_info->eob > 0); - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - if (abs_qc != 1) { - *new_eob = -1; - return 0; - } - const int16_t *iscan = txb_info->scan_order->iscan; +void hbt_destroy() { aom_free(hbt_hash_table); } + +int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, int fast_mode, + int *rate_cost) { + (void)fast_mode; const int16_t *scan = txb_info->scan_order->scan; - const int scan_idx = iscan[coeff_idx]; - *new_eob = 0; - int cost_diff = 0; - cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_costs, txb_info); - // int coeff_cost = - // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); - // if (-cost_diff != coeff_cost) { - // printf("-cost_diff %d coeff_cost %d\n", -cost_diff, coeff_cost); - // get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info); - // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); - // } - 
for (int si = scan_idx - 1; si >= 0; --si) { - const int ci = scan[si]; - if (txb_info->qcoeff[ci] != 0) { - *new_eob = si + 1; - break; - } else { - cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_costs, txb_info); - } + int prev_eob = txb_info->eob; + assert(HBT_EOB <= 16); // Lengthen array if allowing longer eob. + int32_t prev_coeff[16]; + for (int i = 0; i < prev_eob; i++) { + prev_coeff[i] = txb_info->qcoeff[scan[i]]; } - - const int org_eob = txb_info->eob; - set_eob(txb_info, *new_eob); - cost_diff += try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, NULL, - fast_mode); - set_eob(txb_info, org_eob); - - if (*new_eob > 0) { - // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't - // need to lower down the qcoeff here - const int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], - txb_info->txs_ctx, txb_info->tx_type); - cost_diff -= txb_costs->eob_cost[eob_ctx][0]; - cost_diff += txb_costs->eob_cost[eob_ctx][1]; - } else { - const int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; - cost_diff -= txb_costs->txb_skip_cost[txb_skip_ctx][0]; - cost_diff += txb_costs->txb_skip_cost[txb_skip_ctx][1]; + for (int i = prev_eob; i < HBT_EOB; i++) { + prev_coeff[i] = 0; // For compiler piece of mind. } - return cost_diff; -} -static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int dqv, int shift) { - int sgn = qc < 0 ? 
-1 : 1; - return sgn * ((abs(qc) * dqv) >> shift); -} + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); -// TODO(angiebird): add static to this function it's called -void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - if (qc == 0) return; - const tran_low_t low_coeff = get_lower_coeff(qc); - txb_info->qcoeff[coeff_idx] = low_coeff; - const int dqv = txb_info->dequant[coeff_idx != 0]; - txb_info->dqcoeff[coeff_idx] = - qcoeff_to_dqcoeff(low_coeff, dqv, txb_info->shift); - - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int eob = txb_info->eob; - const int16_t *iscan = txb_info->scan_order->iscan; - for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) { - const int nb_row = row - sig_ref_offset[i][0]; - const int nb_col = col - sig_ref_offset[i][1]; - - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; - - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int scan_idx = iscan[coeff_idx]; - if (scan_idx < nb_scan_idx) { - const int level = 1; - if (abs_qc == level) { - txb_cache->nz_count_arr[nb_coeff_idx] -= 1; - assert(txb_cache->nz_count_arr[nb_coeff_idx] >= 0); - } - const int count = txb_cache->nz_count_arr[nb_coeff_idx]; - txb_cache->nz_ctx_arr[nb_coeff_idx] = get_nz_map_ctx_from_count( - count, nb_coeff_idx, txb_info->bwl, txb_info->tx_type); - // int ref_ctx = get_nz_map_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl, tx_type); - // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx]) - // printf("nz ctx %d ref_ctx %d\n", - // txb_cache->nz_ctx_arr[nb_coeff_idx], ref_ctx); - } - } + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + // Overwrite old entry + 
uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost = *rate_cost; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash = hbt_qc_hash; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash = hbt_ctx_hash; + assert(prev_eob >= txb_info->eob); // eob can't get longer + for (int i = 0; i < txb_info->eob; i++) { + // Record how coeff changed. Convention: towards zero is negative. + if (txb_info->qcoeff[scan[i]] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]]; + } + for (int i = txb_info->eob; i < prev_eob; i++) { + // If eob got shorter, record that all after it changed to zero. + if (prev_coeff[i] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = -prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + // Record 'no change' after optimized coefficients run out. 
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = 0; } - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) { - const int nb_row = row - base_ref_offset[i][0]; - const int nb_col = col - base_ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - if (!has_base(nb_coeff, 0)) continue; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - if (row >= nb_row && col >= nb_col) - update_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx], abs_qc); - const int mag = - get_mag_from_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx]); - for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { - if (!has_base(nb_coeff, base_idx)) continue; - const int level = base_idx + 1; - if (abs_qc == level) { - txb_cache->base_count_arr[base_idx][nb_coeff_idx] -= 1; - assert(txb_cache->base_count_arr[base_idx][nb_coeff_idx] >= 0); - } - const int count = txb_cache->base_count_arr[base_idx][nb_coeff_idx]; - txb_cache->base_ctx_arr[base_idx][nb_coeff_idx] = - base_ctx_table[nb_row != 0][nb_col != 0][mag > level][count]; - // int ref_ctx = get_base_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl, level); - // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx]) { - // printf("base ctx %d ref_ctx %d\n", - // txb_cache->base_ctx_arr[base_idx][nb_coeff_idx], ref_ctx); - // } - } - } + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); } + return txb_info->eob; +} - for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) { - const int nb_row = row - br_ref_offset[i][0]; - const int nb_col = col - br_ref_offset[i][1]; - const int 
nb_coeff_idx = nb_row * txb_info->stride + nb_col; +int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index, + TxbInfo *txb_info, const struct macroblock_plane *p, int block, + int *rate_cost) { + const int16_t *scan = txb_info->scan_order->scan; + int new_eob = 0; + int update = 0; - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; + for (int i = 0; i < txb_info->eob; i++) { + // Delta convention is negatives go towards zero, so only apply those ones. + if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] < 0) { + if (txb_info->qcoeff[scan[i]] > 0) + txb_info->qcoeff[scan[i]] += + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + else + txb_info->qcoeff[scan[i]] -= + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; - const int nb_scan_idx = iscan[nb_coeff_idx]; - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - if (!has_br(nb_coeff)) continue; - if (nb_scan_idx < eob) { - const int level = 1 + NUM_BASE_LEVELS; - if (abs_qc == level) { - txb_cache->br_count_arr[nb_coeff_idx] -= 1; - assert(txb_cache->br_count_arr[nb_coeff_idx] >= 0); - } - if (row >= nb_row && col >= nb_col) - update_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx], abs_qc); - const int count = txb_cache->br_count_arr[nb_coeff_idx]; - const int mag = get_mag_from_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx]); - txb_cache->br_ctx_arr[nb_coeff_idx] = - get_br_ctx_from_count_mag(nb_row, nb_col, count, mag); - // int ref_ctx = get_level_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl); - // if (ref_ctx != txb_cache->br_ctx_arr[nb_coeff_idx]) { - // printf("base ctx %d ref_ctx %d\n", - // txb_cache->br_ctx_arr[nb_coeff_idx], ref_ctx); - // } + update = 1; + update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info); } + if (txb_info->qcoeff[scan[i]]) new_eob = i + 1; } -} -static int get_coeff_cost(tran_low_t qc, int 
scan_idx, TxbInfo *txb_info, - const LV_MAP_COEFF_COST *txb_costs) { - const TXB_CTX *txb_ctx = txb_info->txb_ctx; - const int is_nz = (qc != 0); - const tran_low_t abs_qc = abs(qc); - int cost = 0; - const int16_t *scan = txb_info->scan_order->scan; + // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but + // it is expensive and gives little benefit as long as qc_hash is high bit + *rate_cost = + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost; - if (scan_idx < txb_info->seg_eob) { - int coeff_ctx = - get_nz_map_ctx(txb_info->qcoeff, scan_idx, scan, txb_info->bwl, - txb_info->height, txb_info->tx_type); - cost += txb_costs->nz_map_cost[coeff_ctx][is_nz]; + if (update) { + txb_info->eob = new_eob; + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); } - if (is_nz) { - cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, - txb_ctx->dc_sign_ctx); + return txb_info->eob; +} - int ctx_ls[NUM_BASE_LEVELS] = { 0 }; - get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, - txb_info->height, ctx_ls); +int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, int fast_mode, + int *rate_cost) { + // Check for qcoeff match + int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + + if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash == hbt_qc_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash == hbt_ctx_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .init) { + return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block, + rate_cost); + } else { + return 
hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); + } +} - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - cost += get_base_cost(abs_qc, ctx_ls[i], - txb_costs->base_cost[i][ctx_ls[i]], i); - } +int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Initialize hash table if needed. + if (hbt_needs_init) { + hbt_init(); + } - if (abs_qc > NUM_BASE_LEVELS) { - int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, - txb_info->height); - cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); - cost += get_golomb_cost(abs_qc); + //// Hash creation + uint8_t txb_hash_data[256]; // Asserts below to ensure enough space. + const int16_t *scan = txb_info->scan_order->scan; + uint8_t chunk = 0; + int hash_data_index = 0; + + // Make qc_hash. + int packing_index = 0; // needed for packing. + for (int i = 0; i < txb_info->eob; i++) { + tran_low_t prechunk = txb_info->qcoeff[scan[i]]; + + // Softening: Improves speed. Aligns with signed deltas. + if (prechunk < 0) prechunk *= -1; + + // Early kick out: Don't apply feature if there are large coeffs: + // If this kickout value is removed or raised beyond int8_t, + // widen deltas type in OptTxbQcoeff struct. + assert((int8_t)HBT_KICKOUT == HBT_KICKOUT); // If not, widen types. 
+ if (prechunk > HBT_KICKOUT) { + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; } - if (scan_idx < txb_info->seg_eob) { - int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[scan_idx], - txb_info->txs_ctx, txb_info->tx_type); - cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; + // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes + if (packing_index == 0) txb_hash_data[hash_data_index] = 0; + chunk = prechunk << packing_index; + packing_index += 2; + txb_hash_data[hash_data_index] |= chunk; + + // Full byte: + if (packing_index == 8) { + packing_index = 0; + hash_data_index++; } } - return cost; -} - -#if TEST_OPTIMIZE_TXB -#define ALL_REF_OFFSET_NUM 17 -static int all_ref_offset[ALL_REF_OFFSET_NUM][2] = { - { 0, 0 }, { -2, -1 }, { -2, 0 }, { -2, 1 }, { -1, -2 }, { -1, -1 }, - { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 1, -2 }, { 1, -1 }, - { 1, 0 }, { 2, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, -}; - -static int try_level_down_ref(int coeff_idx, const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info, - int (*cost_map)[COST_MAP_SIZE]) { - if (cost_map) { - for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); - } - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (qc == 0) return 0; - int row = coeff_idx >> txb_info->bwl; - int col = coeff_idx - (row << txb_info->bwl); - int org_cost = 0; - for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { - int nb_row = row - all_ref_offset[i][0]; - int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; - if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 
0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { - tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] -= cost; - org_cost += cost; + // Needed when packing_index != 0, to include final byte. + hash_data_index++; + assert(hash_data_index <= 64); + // 31 bit qc_hash: index to array + uint32_t hbt_qc_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + + // Make ctx_hash. + hash_data_index = 0; + tran_low_t prechunk; + + for (int i = 0; i < txb_info->eob; i++) { + // Save as magnitudes towards or away from zero. + if (txb_info->tcoeff[scan[i]] >= 0) + prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]]; + else + prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]]; + + chunk = prechunk & 0xff; + txb_hash_data[hash_data_index++] = chunk; + } + + // Extra ctx data: + // Include dequants. 
+ txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff; + txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff; + chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // eob + chunk = txb_info->eob & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // rdmult (int64) + chunk = txb_info->rdmult & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // tx_type + chunk = txb_info->tx_type & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // base_eob_cost + for (int i = 1; i < 3; i++) { // i = 0 are softened away + for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) { + chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; } } - txb_info->qcoeff[coeff_idx] = get_lower_coeff(qc); - int new_cost = 0; - for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { - int nb_row = row - all_ref_offset[i][0]; - int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; - if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { - tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost; - new_cost += cost; + // eob_cost + for (int i = 0; i < 11; i++) { + for (int j = 0; j < 2; j++) { + chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; } } - txb_info->qcoeff[coeff_idx] = qc; - return new_cost - org_cost; -} - -static void test_level_down(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - int cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; - int 
ref_cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; - const int cost_diff = - try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, cost_map, 0); - const int cost_diff_ref = - try_level_down_ref(coeff_idx, txb_costs, txb_info, ref_cost_map); - if (cost_diff != cost_diff_ref) { - printf("qc %d cost_diff %d cost_diff_ref %d\n", txb_info->qcoeff[coeff_idx], - cost_diff, cost_diff_ref); - for (int r = 0; r < COST_MAP_SIZE; ++r) { - for (int c = 0; c < COST_MAP_SIZE; ++c) { - printf("%d:%d ", cost_map[r][c], ref_cost_map[r][c]); - } - printf("\n"); + // dc_sign_cost + for (int i = 0; i < 2; i++) { + for (int j = 0; j < DC_SIGN_CONTEXTS; j++) { + chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; } } + + assert(hash_data_index <= 256); + // 31 bit ctx_hash: used to index table + uint32_t hbt_ctx_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + //// End hash creation + + return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); } -#endif -// TODO(angiebird): make this static once it's called -int get_txb_cost(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs) { - int cost = 0; - int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; - const int16_t *scan = txb_info->scan_order->scan; - if (txb_info->eob == 0) { - cost = txb_costs->txb_skip_cost[txb_skip_ctx][1]; - return cost; - } - cost = txb_costs->txb_skip_cost[txb_skip_ctx][0]; - for (int c = 0; c < txb_info->eob; ++c) { - tran_low_t qc = txb_info->qcoeff[scan[c]]; - int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_costs); - cost += coeff_cost; +static AOM_FORCE_INLINE int get_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int 
cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); + cost += get_golomb_cost(abs_qc); + } } return cost; } -#if TEST_OPTIMIZE_TXB -void test_try_change_eob(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, - TxbCache *txb_cache) { - int eob = txb_info->eob; - const int16_t *scan = txb_info->scan_order->scan; - if (eob > 0) { - int last_si = eob - 1; - int last_ci = scan[last_si]; - int last_coeff = txb_info->qcoeff[last_ci]; - if (abs(last_coeff) == 1) { - int new_eob; - int cost_diff = - try_change_eob(&new_eob, last_ci, txb_cache, txb_costs, txb_info, 0); - int org_eob = txb_info->eob; - int cost = get_txb_cost(txb_info, txb_costs); - - txb_info->qcoeff[last_ci] = get_lower_coeff(last_coeff); - set_eob(txb_info, new_eob); - int new_cost = get_txb_cost(txb_info, txb_costs); - set_eob(txb_info, org_eob); - txb_info->qcoeff[last_ci] = last_coeff; - - int ref_cost_diff = -cost + new_cost; - if (cost_diff != ref_cost_diff) - printf("org_eob %d new_eob %d cost_diff %d ref_cost_diff %d\n", org_eob, - new_eob, cost_diff, ref_cost_diff); +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); + cost += 
get_golomb_cost(abs_qc); } } + return cost; } -#endif -static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, - int shift) { - const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); - const int64_t error = diff * diff; - return error; +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low); } -typedef struct LevelDownStats { - int update; - tran_low_t low_qc; - tran_low_t low_dqc; - int64_t rd_diff; - int cost_diff; - int64_t dist_diff; - int new_eob; -} LevelDownStats; - -void try_level_down_facade(LevelDownStats *stats, int scan_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info, int fast_mode) { - const int16_t *scan = txb_info->scan_order->scan; - const int coeff_idx = scan[scan_idx]; - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - stats->new_eob = -1; - stats->update = 0; +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) { + const int dqv = dequant[si != 0]; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bwl, height, levels, ci, tx_size, tx_class); if (qc == 0) { - return; + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 
1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); + const int rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } } +} - const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; - const int dqv = txb_info->dequant[coeff_idx != 0]; - - const tran_low_t dqc = qcoeff_to_dqcoeff(qc, dqv, txb_info->shift); - const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); - - stats->low_qc = get_lower_coeff(qc); - stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, dqv, txb_info->shift); - const int64_t low_dqc_dist = - get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); - - stats->dist_diff = -dqc_dist + low_dqc_dist; - stats->cost_diff = 0; - stats->new_eob = txb_info->eob; - if (scan_idx == txb_info->eob - 1 && abs(qc) == 1) { - stats->cost_diff = try_change_eob(&stats->new_eob, coeff_idx, txb_cache, - txb_costs, txb_info, fast_mode); +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bwl, int64_t rdmult, int shift, const int16_t *dequant, + const 
int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels) { + const int dqv = dequant[1]; + (void)eob; + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { - stats->cost_diff = try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, - NULL, fast_mode); -#if TEST_OPTIMIZE_TXB - test_level_down(coeff_idx, txb_cache, txb_costs, txb_info); -#endif + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs, + bwl, tx_class, levels); + if (abs(dqc) < abs(tqc)) { + *accu_rate += rate; + return; + } + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const int sign = (qc < 0) ? 
1 : 0; + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); + const int rate_low = get_coeff_cost_simple( + ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } } - stats->rd_diff = RDCOST(txb_info->rdmult, stats->cost_diff, stats->dist_diff); - if (stats->rd_diff < 0) stats->update = 1; - return; } -static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, - TxbCache *txb_cache, int dry_run, int fast_mode) { - int update = 0; - if (txb_info->eob == 0) return update; - int cost_diff = 0; - int64_t dist_diff = 0; - int64_t rd_diff = 0; - const int max_eob = tx_size_2d[txb_info->tx_size]; - -#if TEST_OPTIMIZE_TXB - int64_t sse; - int64_t org_dist = - av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * - (1 << (2 * txb_info->shift)); - int org_cost = get_txb_cost(txb_info, txb_costs); -#endif - - tran_low_t *org_qcoeff = txb_info->qcoeff; - tran_low_t *org_dqcoeff = txb_info->dqcoeff; +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) { + const int dqv = dequant[si != 0]; + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, 
bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; + const int rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + const int64_t rd_low = + RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + uint8_t tmp_levels[3]; + for (int ni = 0; ni < *nz_num; ++ni) { + const int last_ci = nz_ci[ni]; + tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)]; + levels[get_padded_idx(last_ci, bwl)] = 0; + } - tran_low_t tmp_qcoeff[MAX_TX_SQUARE]; - tran_low_t tmp_dqcoeff[MAX_TX_SQUARE]; - const int org_eob = txb_info->eob; - if (dry_run) { - memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob); - memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob); - txb_info->qcoeff = tmp_qcoeff; - txb_info->dqcoeff = tmp_dqcoeff; - } + const int coeff_ctx_new_eob = get_lower_levels_ctx_general( + 1, si, bwl, height, levels, ci, tx_size, tx_class); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t 
dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + + get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } - const int16_t *scan = txb_info->scan_order->scan; + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } - // forward optimize the nz_map - const int cur_eob = txb_info->eob; - for (int si = 0; si < cur_eob; ++si) { - const int coeff_idx = scan[si]; - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (abs(qc) == 1) { - LevelDownStats stats; - try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info, - fast_mode); - if (stats.update) { - update = 1; - cost_diff += stats.cost_diff; - dist_diff += stats.dist_diff; - rd_diff += stats.rd_diff; - update_level_down(coeff_idx, txb_cache, txb_info); - set_eob(txb_info, stats.new_eob); + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + // levels[get_padded_idx(last_ci, bwl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } else { + for (int ni = 0; ni < *nz_num; ++ni) { + const int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni]; } + *accu_rate += rate; + *accu_dist += dist; } - } - // backward optimize the level-k map - int eob_fix = 0; - for (int si = txb_info->eob - 1; si >= 0; --si) { - const int coeff_idx = scan[si]; - if (eob_fix == 1 && 
txb_info->qcoeff[coeff_idx] == 1) { - // when eob is fixed, there is not need to optimize again when - // abs(qc) == 1 - continue; + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); } - LevelDownStats stats; - try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info, - fast_mode); - if (stats.update) { -#if TEST_OPTIMIZE_TXB -// printf("si %d low_qc %d cost_diff %d dist_diff %ld rd_diff %ld eob %d new_eob -// %d\n", si, stats.low_qc, stats.cost_diff, stats.dist_diff, stats.rd_diff, -// txb_info->eob, stats.new_eob); -#endif - update = 1; - cost_diff += stats.cost_diff; - dist_diff += stats.dist_diff; - rd_diff += stats.rd_diff; - update_level_down(coeff_idx, txb_cache, txb_info); - set_eob(txb_info, stats.new_eob); + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; } - if (eob_fix == 0 && txb_info->qcoeff[coeff_idx] != 0) eob_fix = 1; - if (si > txb_info->eob) si = txb_info->eob; - } -#if TEST_OPTIMIZE_TXB - int64_t new_dist = - av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * - (1 << (2 * txb_info->shift)); - int new_cost = get_txb_cost(txb_info, txb_costs); - int64_t ref_dist_diff = new_dist - org_dist; - int ref_cost_diff = new_cost - org_cost; - if (cost_diff != ref_cost_diff || dist_diff != ref_dist_diff) - printf( - "overall rd_diff %ld\ncost_diff %d ref_cost_diff%d\ndist_diff %ld " - "ref_dist_diff %ld\neob %d new_eob %d\n\n", - rd_diff, cost_diff, ref_cost_diff, dist_diff, ref_dist_diff, org_eob, - txb_info->eob); -#endif - if (dry_run) { - txb_info->qcoeff = org_qcoeff; - txb_info->dqcoeff = org_dqcoeff; - set_eob(txb_info, org_eob); } - return update; } -// These numbers are empirically obtained. 
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 17, 13 }, { 16, 10 }, -}; +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + int sharpness) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (sharpness == 0 && rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bwl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); + const int16_t *dequant = p->dequant_QTX; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + assert(width == (1 << bwl)); + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST 
*txb_eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + + const int shift = av1_get_tx_scale(tx_size); + const int64_t rdmult = + ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + + 2) >> + (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 + ? 7 - mbmi->segment_id + : 2)); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + + av1_txb_init_levels(qcoeff, width, height, levels); + + // TODO(angirbird): check iqmatrix + + const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; + const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + int eob = p->eobs[block]; + const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int si = eob - 1; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const tran_low_t abs_qc = abs(qc); + const int sign = qc < 0; + const int max_nz_num = 2; + int nz_num = 1; + int nz_ci[3] = { ci, 0, 0 }; + if (abs_qc >= 2) { + update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels); + --si; + } else { + assert(abs_qc == 1); + const int coeff_ctx = get_lower_levels_ctx_general( + 1, si, bwl, height, levels, ci, tx_size, tx_class); + accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx, + txb_ctx->dc_sign_ctx, txb_costs, bwl, + tx_class, levels); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + accu_dist += dist - dist0; + --si; + } + +#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 0 && nz_num <= max_nz_num; --si) { \ + update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ + tx_size, 
tx_class_literal, bwl, height, \ + txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ + txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ + levels, sharpness); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); + UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_EOB_CASE + default: assert(false); + } + + if (si == -1 && nz_num <= max_nz_num) { + update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, + non_skip_cost, qcoeff, dqcoeff, sharpness); + } + +#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 1; --si) { \ + update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \ + rdmult, shift, dequant, scan, txb_costs, tcoeff, \ + qcoeff, dqcoeff, levels); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels); + } + + const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, + *rate_cost = accu_rate; + return eob; +} + +// This function is deprecated, but we keep it here because hash trellis +// is not integrated with av1_optimize_txb_new yet +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int 
plane, int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx, int fast_mode) { + TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) { + const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const MB_MODE_INFO *mbmi = xd->mi[0]; const struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int eob = p->eobs[block]; tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); - const int16_t *dequant = pd->dequant; - const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int stride = 1 << bwl; - const int height = tx_size_high[tx_size]; + const int16_t *dequant = p->dequant_QTX; + const int seg_eob = av1_get_max_eob(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); const int is_inter = is_inter_block(mbmi); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const LV_MAP_COEFF_COST txb_costs = x->coeff_costs[txs_ctx][plane_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST txb_eob_costs = + x->eob_costs[eob_multi_size][plane_type]; const int shift = av1_get_tx_scale(tx_size); const int64_t rdmult = 
- (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2; - - TxbInfo txb_info = { qcoeff, - dqcoeff, - tcoeff, - dequant, - shift, - tx_size, - txs_ctx, - tx_type, - bwl, - stride, - height, - eob, - seg_eob, - scan_order, - txb_ctx, - rdmult, - &cm->coeff_ctx_table }; - - TxbCache txb_cache; - gen_txb_cache(&txb_cache, &txb_info); + ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + + 2) >> + 2; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + const qm_val_t *iqmatrix = + IS_2D_TRANSFORM(tx_type) + ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; + assert(width == (1 << bwl)); + const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + TxbInfo txb_info = { + qcoeff, levels, dqcoeff, tcoeff, dequant, shift, + tx_size, txs_ctx, tx_type, bwl, width, height, + eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table, + iqmatrix, tx_type_cost, + }; + + // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls + // by storing the coefficient deltas in a hash table. 
+ // Currently disabled in speedfeatures.c + if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) { + return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block, + fast_mode, rate_cost); + } + + av1_txb_init_levels(qcoeff, width, height, levels); const int update = - optimize_txb(&txb_info, &txb_costs, &txb_cache, 0, fast_mode); - if (update) p->eobs[block] = txb_info.eob; + optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info.eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob); + } return txb_info.eob; } + int av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob) { - const int16_t *scan = scan_order->scan; + const int16_t *const scan = scan_order->scan; int cul_level = 0; int c; if (eob == 0) return 0; for (c = 0; c < eob; ++c) { cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; } cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); @@ -1981,167 +1791,72 @@ void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, ThreadData *const td = args->td; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const uint16_t eob = p->eobs[block]; const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const PLANE_TYPE plane_type = pd->plane_type; - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - (void)plane_bsize; - - int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); - av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); -} - -static INLINE void av1_update_nz_eob_counts(FRAME_CONTEXT *fc, - FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, 
int plane, - TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan) { - const PLANE_TYPE plane_type = get_plane_type(plane); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - TX_SIZE txsize_ctx = get_txsize_context(tx_size); -#if CONFIG_CTX1D - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int seg_eob = - (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txsize_ctx][plane_type]; - for (int c = 0; c < eob; ++c) { - tran_low_t v = tcoeff[scan[c]]; - int is_nz = (v != 0); - int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx, tx_type); - - if (c == seg_eob - 1) break; - - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txsize_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - - if (is_nz) { - ++counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)]; -#if LV_MAP_PROB - update_bin(fc->eob_flag_cdf[txsize_ctx][plane_type][eob_ctx], - c == (eob - 1), 2); -#endif - } - } + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, + blk_row); } -#if CONFIG_CTX1D -static INLINE void av1_update_nz_eob_counts_vert( - FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan, const int16_t *iscan) { - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = 
get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, tcoeff, width, height); - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txs_ctx][plane_type]; - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][veob == 0]; -#if LV_MAP_PROB - update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], - veob == 0, 2); -#endif - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); - ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx] - [r == veob - 1]; -#if LV_MAP_PROB - update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], - r == veob - 1, 2); -#endif - } +static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, + int blk_row, int blk_col, int plane, + TX_SIZE tx_size, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 
&& + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + if (eset > 0) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (allow_update_cdf) { + update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); } - } - } - } -} - -static INLINE void av1_update_nz_eob_counts_horiz( - FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan, const int16_t *iscan) { - (void)eob; - (void)scan; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, tcoeff, width, height); - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txs_ctx][plane_type]; - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - int el_ctx = get_empty_line_ctx(r, eob_ls); - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][heob == 0]; -#if LV_MAP_PROB - update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], - heob == 0, 2); -#endif - if (heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - if (is_nz) { - int eob_ctx = 
get_hv_eob_ctx(r, c, eob_ls); - ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx] - [c == heob - 1]; -#if LV_MAP_PROB - update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], - c == heob - 1, 2); -#endif - } +#if CONFIG_ENTROPY_STATS + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; +#if CONFIG_ENTROPY_STATS + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf( + fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); } } } } } -#endif // CONFIG_CTX1D void av1_update_and_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, @@ -2154,461 +1869,164 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int eob = p->eobs[block], update_eob = 0; - const PLANE_TYPE plane_type = pd->plane_type; - const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - const int segment_id = mbmi->segment_id; - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; - const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); - int c, i; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int eob = p->eobs[block]; 
TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col, pd->left_context + blk_row, &txb_ctx); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - int cul_level = 0; - - TX_SIZE txsize_ctx = get_txsize_context(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const uint8_t allow_update_cdf = args->allow_update_cdf; + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if CONFIG_ENTROPY_STATS + int cdf_idx = cm->coef_cdf_category; +#endif // CONFIG_ENTROPY_STATS + +#if CONFIG_ENTROPY_STATS + ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0, + 2); + } - memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); - - ++td->counts->txb_skip[txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; -#if LV_MAP_PROB - update_bin(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0, - 2); -#endif x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx; - x->mbmi_ext->eobs[plane][block] = eob; if (eob == 0) { - av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row); return; } -#if CONFIG_TXK_SEL - av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane, - mbmi->sb_type, get_min_tx_size(tx_size), td->counts); -#endif - -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); - } else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - 
++td->counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode]; -#if LV_MAP_PROB - update_bin(ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], eob_mode, - 2); -#endif - if (eob_mode == 0) { - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); - } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - av1_update_nz_eob_counts_vert(ec_ctx, td->counts, eob, tcoeff, plane, - tx_size, tx_type, scan, iscan); - else - av1_update_nz_eob_counts_horiz(ec_ctx, td->counts, eob, tcoeff, plane, - tx_size, tx_type, scan, iscan); - } - } -#else // CONFIG_CTX1D - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); -#endif // CONFIG_CTX1D - - // Reverse process order to handle coefficient level and sign. - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - update_eob = 0; - for (c = eob - 1; c >= 0; --c) { - tran_low_t v = qcoeff[scan[c]]; - tran_low_t level = abs(v); - int ctx; - - if (level <= i) continue; - - ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); - if (level == i + 1) { - ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 1, - 2); -#endif - if (c == 0) { - int dc_sign_ctx = txb_ctx.dc_sign_ctx; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts, + allow_update_cdf); - ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; -#if 
LV_MAP_PROB - update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2); -#endif - x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; - } - cul_level += level; - continue; - } - ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 0, 2); + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); #endif - update_eob = AOMMAX(update_eob, c); - } - } - - for (c = update_eob; c >= 0; --c) { - tran_low_t v = qcoeff[scan[c]]; - tran_low_t level = abs(v); - int idx; - int ctx; - if (level <= NUM_BASE_LEVELS) continue; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); - cul_level += level; - if (c == 0) { - int dc_sign_ctx = txb_ctx.dc_sign_ctx; + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); - ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; -#if LV_MAP_PROB - update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2); -#endif - x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + 
update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } } - - // level is above 1. - ctx = get_br_ctx(tcoeff, scan[c], bwl, height); - -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - int br_set_idx = base_range < COEFF_BASE_RANGE - ? coeff_to_br_index[base_range] - : BASE_RANGE_SETS; - - for (idx = 0; idx < BASE_RANGE_SETS; ++idx) { - if (idx == br_set_idx) { - int br_base = br_index_to_coeff[br_set_idx]; - int br_offset = base_range - br_base; - ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 1, - 2); -#endif - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (br_offset == tok) { - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, - 2); -#endif - break; - } - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2); + { + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; #endif - } - break; } - ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 0, 2); -#endif } -#else // BR_NODE - for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, 2); + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, 
tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps] + [br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; #endif - break; + if (k < BR_CDF_SIZE - 1) break; } - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2); -#endif } - if (idx < COEFF_BASE_RANGE) continue; -#endif // BR_NODE - // use 0-th order Golomb code to handle the residual level. } - cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + } - // DC value - set_dc_sign(&cul_level, tcoeff[0]); - av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); - -#if CONFIG_ADAPT_SCAN - // Since dqcoeff is not available here, we pass qcoeff into - // av1_update_scan_count_facade(). The update behavior should be the same - // because av1_update_scan_count_facade() only cares if coefficients are zero - // or not. 
- av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, - qcoeff, eob); -#endif + const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, + blk_row); } void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - int mi_row, int mi_col) { + int mi_row, int mi_col, uint8_t allow_update_cdf) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, NULL, 0 }; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf }; (void)rate; (void)mi_row; (void)mi_col; if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); return; } if (!dry_run) { - td->counts->skip[ctx][0] += skip_inc; av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, - av1_update_and_record_txb_context, &arg); + av1_update_and_record_txb_context, &arg, + num_planes); } else if (dry_run == DRY_RUN_NORMAL) { av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, - av1_update_txb_context_b, &arg); + av1_update_txb_context_b, &arg, num_planes); } else { printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); assert(0); } } - -static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp, - int *savings, int *update, aom_writer *const bc) { - const aom_prob upd = DIFF_UPDATE_PROB; - int u = 0; - aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]); - int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1); - - 
if (s > 0 && newp != *oldp) u = 1; - - if (u) - *savings += s - (int)(av1_cost_zero(upd)); // TODO(jingning): 1? - else - *savings -= (int)(av1_cost_zero(upd)); - - if (update) { - ++update[u]; - return; - } - - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } -} - -static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi, - TX_SIZE tx_size) { - FRAME_CONTEXT *fc = cpi->common.fc; - FRAME_COUNTS *counts = cpi->td.counts; - int savings = 0; - int update[2] = { 0, 0 }; - int plane, ctx, level; - - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { - find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx], - &savings, update, bc); - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->nz_map[tx_size][plane][ctx], - &fc->nz_map[tx_size][plane][ctx], &savings, update, bc); - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->eob_flag[tx_size][plane][ctx], - &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc); - } - } - - for (level = 0; level < NUM_BASE_LEVELS; ++level) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], - &fc->coeff_base[tx_size][plane][level][ctx], &savings, - update, bc); - } - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_lps[tx_size][plane][ctx], - &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc); - } - } - - // Decide if to update the model for this tx_size - if (update[1] == 0 || savings < 0) { - aom_write_bit(bc, 0); - return; - } - aom_write_bit(bc, 1); - - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { - find_new_prob(counts->txb_skip[tx_size][ctx], 
&fc->txb_skip[tx_size][ctx], - &savings, NULL, bc); - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->nz_map[tx_size][plane][ctx], - &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc); - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->eob_flag[tx_size][plane][ctx], - &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc); - } - } - - for (level = 0; level < NUM_BASE_LEVELS; ++level) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], - &fc->coeff_base[tx_size][plane][level][ctx], &savings, - NULL, bc); - } - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_lps[tx_size][plane][ctx], - &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc); - } - } -} - -void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) { - const TX_MODE tx_mode = cpi->common.tx_mode; - const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - TX_SIZE tx_size; - int ctx, plane; - -#if LV_MAP_PROB - return; -#endif - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx], - cpi->td.counts->dc_sign[plane][ctx], 1); - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - write_txb_probs(w, cpi, tx_size); -} - -#if CONFIG_TXK_SEL -int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing, RD_STATS *rd_stats) { - const AV1_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - TX_TYPE 
txk_start = DCT_DCT; - TX_TYPE txk_end = TX_TYPES - 1; - TX_TYPE best_tx_type = txk_start; - int64_t best_rd = INT64_MAX; - uint8_t best_eob = 0; - const int coeff_ctx = combine_entropy_contexts(*a, *l); - RD_STATS best_rd_stats; - TX_TYPE tx_type; - - av1_invalid_rd_stats(&best_rd_stats); - - for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) { - if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = tx_type; - TX_TYPE ref_tx_type = av1_get_tx_type(get_plane_type(plane), xd, blk_row, - blk_col, block, tx_size); - if (tx_type != ref_tx_type) { - // use av1_get_tx_type() to check if the tx_type is valid for the current - // mode if it's not, we skip it here. - continue; - } - -#if CONFIG_EXT_TX - const int is_inter = is_inter_block(mbmi); - const TxSetType tx_set_type = - get_ext_tx_set_type(get_min_tx_size(tx_size), mbmi->sb_type, is_inter, - cm->reduced_tx_set_used); - if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; -#endif // CONFIG_EXT_TX - - RD_STATS this_rd_stats; - av1_invalid_rd_stats(&this_rd_stats); - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, - &this_rd_stats.dist, &this_rd_stats.sse, - OUTPUT_HAS_PREDICTED_PIXELS); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); - this_rd_stats.rate = - av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size, - scan_order, a, l, use_fast_coef_costing); - int rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); - - if (rd < best_rd) { - best_rd = rd; - best_rd_stats = this_rd_stats; - best_tx_type = tx_type; - best_eob = x->plane[plane].txb_entropy_ctx[block]; - } - } - - av1_merge_rd_stats(rd_stats, &best_rd_stats); - - if (best_eob == 0 && is_inter_block(mbmi)) best_tx_type = DCT_DCT; - - if (plane == 0) mbmi->txk_type[(blk_row << 
4) + blk_col] = best_tx_type; - x->plane[plane].txb_entropy_ctx[block] = best_eob; - - if (!is_inter_block(mbmi)) { - // intra mode needs decoded result such that the next transform block - // can use it for prediction. - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - - av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, - x->plane[plane].eobs[block]); - } - return best_rd; -} -#endif // CONFIG_TXK_SEL diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 76a04bb41..aa847ad62 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -12,7 +12,8 @@ #ifndef ENCODETXB_H_ #define ENCODETXB_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" @@ -25,6 +26,7 @@ extern "C" { typedef struct TxbInfo { tran_low_t *qcoeff; + uint8_t *levels; // absolute values and clamped to 255. 
tran_low_t *dqcoeff; const tran_low_t *tcoeff; const int16_t *dequant; @@ -33,7 +35,7 @@ typedef struct TxbInfo { TX_SIZE txs_ctx; TX_TYPE tx_type; int bwl; - int stride; + int width; int height; int eob; int seg_eob; @@ -41,51 +43,27 @@ typedef struct TxbInfo { TXB_CTX *txb_ctx; int64_t rdmult; const LV_MAP_CTX_TABLE *coeff_ctx_table; + const qm_val_t *iqmatrix; + int tx_type_cost; } TxbInfo; -typedef struct TxbCache { - int nz_count_arr[MAX_TX_SQUARE]; - int nz_ctx_arr[MAX_TX_SQUARE]; - int base_count_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE]; - int base_mag_arr[MAX_TX_SQUARE] - [2]; // [0]: max magnitude [1]: num of max magnitude - int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE]; - - int br_count_arr[MAX_TX_SQUARE]; - int br_mag_arr[MAX_TX_SQUARE] - [2]; // [0]: max magnitude [1]: num of max magnitude - int br_ctx_arr[MAX_TX_SQUARE]; -} TxbCache; - -typedef struct TxbProbs { - const aom_prob *dc_sign_prob; - const aom_prob *nz_map; - aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS]; - const aom_prob *coeff_lps; - const aom_prob *eob_flag; - const aom_prob *txb_skip; -#if BR_NODE - const aom_prob *coeff_br; -#endif -} TxbProbs; - void av1_alloc_txb_buf(AV1_COMP *cpi); void av1_free_txb_buf(AV1_COMP *cpi); -int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx); +int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, + const int plane, const int blk_row, const int blk_col, + const int block, const TX_SIZE tx_size, + const TXB_CTX *const txb_ctx); void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int blk_row, int blk_col, int block, - int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, + aom_writer *w, int blk_row, int blk_col, int plane, + TX_SIZE tx_size, const tran_low_t *tcoeff, uint16_t eob, TXB_CTX *txb_ctx); -void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, - aom_writer *w, int plane); +void 
av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row, + int mi_col, aom_writer *w, BLOCK_SIZE bsize); int av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob); void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - const int mi_row, const int mi_col); -void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w); + int mi_row, int mi_col, uint8_t allow_update_cdf); void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, @@ -98,16 +76,10 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col); -#if CONFIG_TXK_SEL -int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing, RD_STATS *rd_stats); -#endif -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx, int fast_mode); +void hbt_destroy(); +int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *txb_ctx, int *rate_cost, int sharpness); #ifdef __cplusplus } #endif diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index edc9b1d61..404af2e7c 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -18,15 +18,13 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (int i = 0; i < REFERENCE_MODES; i++) td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; -#if CONFIG_GLOBAL_MOTION - for (int i = 0; i < TOTAL_REFS_PER_FRAME; i++) + for (int i = 0; i < REF_FRAMES; i++) 
td->rd_counts.global_motion_used[i] += td_t->rd_counts.global_motion_used[i]; -#endif // CONFIG_GLOBAL_MOTION td->rd_counts.compound_ref_used_flag |= td_t->rd_counts.compound_ref_used_flag; - td->rd_counts.single_ref_used_flag |= td_t->rd_counts.single_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; } static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { @@ -53,7 +51,7 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tile_cols; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); int i; av1_init_tile_data(cpi); @@ -81,29 +79,19 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { aom_memalign(32, sizeof(*thread_data->td))); av1_zero(*thread_data->td); -// Set up pc_tree. -#if !CONFIG_CB4X4 - thread_data->td->leaf_tree = NULL; -#endif + // Set up pc_tree. 
thread_data->td->pc_tree = NULL; av1_setup_pc_tree(cm, thread_data->td); -#if CONFIG_MOTION_VAR -#if CONFIG_HIGHBITDEPTH - int buf_scaler = 2; -#else - int buf_scaler = 1; -#endif CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, (uint8_t *)aom_memalign( - 16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->above_pred_buf))); + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, (uint8_t *)aom_memalign( - 16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->left_pred_buf))); + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + CHECK_MEM_ERROR( cm, thread_data->td->wsrc_buf, (int32_t *)aom_memalign( @@ -112,7 +100,6 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { cm, thread_data->td->mask_buf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); -#endif // Allocate frame counters in thread data. CHECK_MEM_ERROR(cm, thread_data->td->counts, aom_calloc(1, sizeof(*thread_data->td->counts))); @@ -133,6 +120,8 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { winterface->sync(worker); } + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); } for (i = 0; i < num_workers; i++) { @@ -148,16 +137,13 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; thread_data->td->rd_counts = cpi->td.rd_counts; -#if CONFIG_MOTION_VAR thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; thread_data->td->mb.mask_buf = thread_data->td->mask_buf; -#endif } - if (thread_data->td->counts != &cpi->common.counts) { - memcpy(thread_data->td->counts, &cpi->common.counts, - sizeof(cpi->common.counts)); + if (thread_data->td->counts != &cpi->counts) { + memcpy(thread_data->td->counts, &cpi->counts, 
sizeof(cpi->counts)); } if (i < num_workers - 1) @@ -187,14 +173,24 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { for (i = 0; i < num_workers; i++) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - + cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile; // Accumulate counters. if (i < cpi->num_workers - 1) { - av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts); + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); accumulate_rd_opt(&cpi->td, thread_data->td); -#if CONFIG_VAR_TX cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; -#endif } } } + +// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' +// members, so we treat it as an array, and sum over the whole length. +void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h index 6c30a3e5c..b6b1fed4e 100644 --- a/third_party/aom/av1/encoder/ethread.h +++ b/third_party/aom/av1/encoder/ethread.h @@ -27,6 +27,9 @@ typedef struct EncWorkerData { void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c index 007694a38..e9621a574 100644 --- a/third_party/aom/av1/encoder/extend.c +++ b/third_party/aom/av1/encoder/extend.c @@ -57,7 +57,6 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, } } -#if CONFIG_HIGHBITDEPTH static void 
highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, uint8_t *dst8, int dst_pitch, int w, int h, int extend_top, int extend_left, @@ -100,7 +99,6 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, dst_ptr2 += dst_pitch; } } -#endif // CONFIG_HIGHBITDEPTH void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { @@ -124,7 +122,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, @@ -139,7 +136,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); return; } -#endif // CONFIG_HIGHBITDEPTH copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, src->y_crop_height, diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index 2a4200887..113c068c1 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -13,8 +13,8 @@ #include #include -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -27,9 +27,7 @@ #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" // av1_setup_dst_planes() -#if CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" @@ -41,6 +39,7 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rd.h" +#include "av1/encoder/dwt.h" #define OUTPUT_FPF 0 #define 
ARF_STATS_OUTPUT 0 @@ -143,6 +142,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; section->intra_error = 0.0; + section->frame_avg_wavelet_energy = 0.0; section->coded_error = 0.0; section->sr_coded_error = 0.0; section->pcnt_inter = 0.0; @@ -169,6 +169,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->frame += frame->frame; section->weight += frame->weight; section->intra_error += frame->intra_error; + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; section->pcnt_inter += frame->pcnt_inter; @@ -195,6 +196,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->frame -= frame->frame; section->weight -= frame->weight; section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; section->pcnt_inter -= frame->pcnt_inter; @@ -305,7 +307,6 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize, return sse; } -#if CONFIG_HIGHBITDEPTH static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, int bd) { switch (bd) { @@ -345,7 +346,6 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } -#endif // CONFIG_HIGHBITDEPTH // Refine the motion search range according to the frame dimension // for first pass test. 
@@ -361,10 +361,10 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, MV *best_mv, int *best_motion_err) { MACROBLOCKD *const xd = &x->e_mbd; - MV tmp_mv = { 0, 0 }; + MV tmp_mv = kZeroMv; MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; int num00, tmp_err, n; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; @@ -376,11 +376,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); } -#endif // CONFIG_HIGHBITDEPTH // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, @@ -459,7 +457,6 @@ static void set_first_pass_params(AV1_COMP *cpi) { cpi->rc.frames_to_key = INT_MAX; } -#if CONFIG_EXT_REFS static double raw_motion_error_stdev(int *raw_motion_err_list, int raw_motion_err_counts) { int64_t sum_raw_err = 0; @@ -482,7 +479,6 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); return raw_err_stdev; } -#endif // CONFIG_EXT_REFS #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 @@ -490,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile; struct macroblock_plane *const p = x->plane; @@ -500,6 +497,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int recon_yoffset, recon_uvoffset; int64_t intra_error = 0; + int64_t frame_avg_wavelet_energy 
= 0; int64_t coded_error = 0; int64_t sr_coded_error = 0; @@ -515,9 +513,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int image_data_start_row = INVALID_ROW; int new_mv_count = 0; int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; + MV lastmv = kZeroMv; TWO_PASS *twopass = &cpi->twopass; - const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); @@ -529,18 +526,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { BufferPool *const pool = cm->buffer_pool; const int qindex = find_fp_qindex(cm->bit_depth); const int mb_scale = mi_size_wide[BLOCK_16X16]; -#if CONFIG_PVQ - PVQ_QUEUE pvq_q; - od_adapt_ctx pvq_context; -#endif -#if CONFIG_EXT_REFS int *raw_motion_err_list; int raw_motion_err_counts = 0; CHECK_MEM_ERROR( cm, raw_motion_err_list, aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list))); -#endif // CONFIG_EXT_REFS // First pass code requires valid last and new frame buffers. 
assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -555,7 +546,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16; + x->e_mbd.mi[0]->sb_type = BLOCK_16X16; intra_factor = 0.0; brightness_factor = 0.0; @@ -564,80 +555,34 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { set_first_pass_params(cpi); av1_set_quantizer(cm, qindex); - av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y, + num_planes); - av1_setup_src_planes(x, cpi->source, 0, 0); - av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0); + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes); + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0, + num_planes); if (!frame_is_intra_only(cm)) { - av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); + av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes); } xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; -#if CONFIG_CFL // Don't store luma on the fist pass since chroma is not computed - xd->cfl->store_y = 0; -#endif // CONFIG_CFL + xd->cfl.store_y = 0; av1_frame_init_quantizer(cpi); -#if CONFIG_PVQ - // For pass 1 of 2-pass encoding, init here for PVQ for now. - { - pvq_q.buf_len = 5000; - CHECK_MEM_ERROR(cm, pvq_q.buf, - aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO))); - pvq_q.curr_pos = 0; - x->pvq_coded = 0; - - x->pvq_q = &pvq_q; - - // TODO(yushin): Since this init step is also called in 2nd pass, - // or 1-pass encoding, consider factoring out it as a function. - // TODO(yushin) - // If activity masking is enabled, change below to OD_HVS_QM - x->daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync. 
- x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; - x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA; - - od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv, - x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); -#if !CONFIG_ANS - od_ec_enc_init(&x->daala_enc.w.ec, 65025); - od_ec_enc_reset(&x->daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - } -#endif - - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; pd[i].dqcoeff = ctx->dqcoeff[i]; -#if CONFIG_PVQ - pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; -#endif p[i].eobs = ctx->eobs[i]; -#if CONFIG_LV_MAP p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; -#endif } av1_init_mv_probs(cm); -#if CONFIG_LV_MAP av1_init_lv_map(cm); -#endif -#if CONFIG_ADAPT_SCAN - av1_init_scan_order(cm); - av1_deliver_eob_threshold(cm, xd); -#endif - av1_convolve_init(cm); -#if CONFIG_PVQ - od_adapt_ctx_reset(&pvq_context, 0); - x->daala_enc.state.adapt = &pvq_context; -#endif // CONFIG_PVQ av1_initialize_rd_consts(cpi); // Tiling is ignored in the first pass. @@ -648,7 +593,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - MV best_ref_mv = { 0, 0 }; + MV best_ref_mv = kZeroMv; // Reset above block coeffs. 
xd->up_available = (mb_row != 0); @@ -674,31 +619,28 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_clear_system_state(); + const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; xd->left_available = (mb_col != 0); - xd->mi[0]->mbmi.sb_type = bsize; - xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize], - mb_col * mb_scale, mi_size_wide[bsize], -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows, + cm->mi_cols); - set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); // Do intra 16x16 prediction. - xd->mi[0]->mbmi.segment_id = 0; -#if CONFIG_SUPERTX - xd->mi[0]->mbmi.segment_id_supertx = 0; -#endif // CONFIG_SUPERTX - xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0); - xd->mi[0]->mbmi.mode = DC_PRED; - xd->mi[0]->mbmi.tx_size = + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = use_dc_pred ? (bsize >= BLOCK_16X16 ? 
TX_16X16 : TX_8X8) : TX_4X4; - av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2); + av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2); this_error = aom_get_mb_ss(x->plane[0].src_diff); // Keep a record of blocks that have almost no intra error residual @@ -712,7 +654,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { image_data_start_row = mb_row; } -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { switch (cm->bit_depth) { case AOM_BITS_8: break; @@ -725,7 +666,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { return; } } -#endif // CONFIG_HIGHBITDEPTH aom_clear_system_state(); log_intra = log(this_error + 1.0); @@ -734,14 +674,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { else intra_factor += 1.0; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; else level_sample = x->plane[0].src.buf[0]; -#else - level_sample = x->plane[0].src.buf[0]; -#endif if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); else @@ -759,6 +695,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // Accumulate the intra error. intra_error += (int64_t)this_error; + int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + for (int r8 = 0; r8 < 2; ++r8) + for (int c8 = 0; c8 < 2; ++c8) { + int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( + buf + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { // initialization @@ -775,11 +720,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { if (!frame_is_intra_only(cm)) { // Do a motion search int tmp_err, motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. 
- MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + MV mv = kZeroMv, tmp_mv = kZeroMv; struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); @@ -787,10 +731,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); } -#else - motion_error = - get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); -#endif // CONFIG_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on @@ -799,7 +739,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { raw_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); @@ -807,10 +746,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); } -#else - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); -#endif // CONFIG_HIGHBITDEPTH // TODO(pengchong): Replace the hard-coded threshold if (raw_motion_error > 25) { @@ -822,7 +757,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // 0,0 based search as well. 
if (!is_zero_mv(&best_ref_mv)) { tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); if (tmp_err < motion_error) { motion_error = tmp_err; @@ -836,7 +771,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { gf_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); @@ -844,12 +778,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); } -#else - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); -#endif // CONFIG_HIGHBITDEPTH - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); if (gf_motion_error < motion_error && gf_motion_error < this_error) @@ -913,11 +843,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { mv.row *= 8; mv.col *= 8; this_error = motion_error; - xd->mi[0]->mbmi.mode = NEWMV; - xd->mi[0]->mbmi.mv[0].as_mv = mv; - xd->mi[0]->mbmi.tx_size = TX_4X4; - xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale, mb_col * mb_scale, NULL, bsize); av1_encode_sby_pass1(cm, x, bsize); @@ -1006,9 +936,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { } } } -#if CONFIG_EXT_REFS raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error; -#endif // CONFIG_EXT_REFS } else { sr_coded_error += 
(int64_t)this_error; } @@ -1031,25 +959,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_clear_system_state(); } -#if CONFIG_EXT_REFS const double raw_err_stdev = raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts); aom_free(raw_motion_err_list); -#endif // CONFIG_EXT_REFS - -#if CONFIG_PVQ -#if !CONFIG_ANS - od_ec_enc_clear(&x->daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - - x->pvq_q->last_pos = x->pvq_q->curr_pos; - x->pvq_q->curr_pos = 0; - x->pvq_q = NULL; - - aom_free(pvq_q.buf); -#endif // Clamp the image start to rows/2. This number of rows is discarded top // and bottom as dead data so rows / 2 means the frame is blank. @@ -1083,6 +995,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { fps.coded_error = (double)(coded_error >> 8) + min_err; fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; fps.intra_error = (double)(intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy; fps.count = 1.0; fps.pcnt_inter = (double)intercount / num_mbs; fps.pcnt_second_ref = (double)second_ref_count / num_mbs; @@ -1090,9 +1003,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { fps.intra_skip_pct = (double)intra_skip_count / num_mbs; fps.inactive_zone_rows = (double)image_data_start_row; fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix -#if CONFIG_EXT_REFS fps.raw_error_stdev = raw_err_stdev; -#endif // CONFIG_EXT_REFS if (mvcount > 0) { fps.MVr = (double)sum_mvr / mvcount; @@ -1144,41 +1055,29 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { ((twopass->this_frame_stats.intra_error / DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { if (gld_yv12 != NULL) { -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); -#else - 
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idx]); -#endif // CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], + cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]); } twopass->sr_update_lag = 1; } else { ++twopass->sr_update_lag; } - aom_extend_frame_borders(new_yv12); + aom_extend_frame_borders(new_yv12, num_planes); -// The frame we just compressed now becomes the last frame. -#if CONFIG_EXT_REFS + // The frame we just compressed now becomes the last frame. ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]], - cm->new_fb_idx); -#else - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + &cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]], cm->new_fb_idx); -#endif // CONFIG_EXT_REFS // Special case for the first frame. Copy into the GF buffer as a second // reference. - if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); -#else - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idx]); -#endif // CONFIG_EXT_REFS + if (cm->current_video_frame == 0 && + cpi->ref_fb_idx[GOLDEN_FRAME - 1] != INVALID_IDX) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], + cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]); } // Use this to see what the first pass reconstruction looks like. 
@@ -1234,7 +1133,7 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi, : cpi->common.MBs; const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = section_err / active_mbs; - const double speed_term = 1.0 + 0.04 * oxcf->speed; + const double speed_term = 1.0; double ediv_size_correction; const int target_norm_bits_per_mb = (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / @@ -1662,21 +1561,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -#if !CONFIG_EXT_REFS -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} -#endif // !CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER // === GF Group of 16 === #define GF_INTERVAL_16 16 @@ -2146,10 +2030,8 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) { gf_group->bidir_pred_enabled[frame_index] = 0; for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx; - gf_group->refresh_idx[frame_index] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; - gf_group->refresh_flag[frame_index] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; + gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; + gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; continue; } @@ -2247,19 +2129,16 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) { } } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static void define_gf_group_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if 
CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (rc->baseline_gf_interval == 16) { define_gf_group_structure_16(cpi); return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -2267,7 +2146,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) { int frame_index = 0; const int key_frame = cpi->common.frame_type == KEY_FRAME; -#if CONFIG_EXT_REFS // The use of bi-predictive frames are only enabled when following 3 // conditions are met: // (1) ALTREF is enabled; @@ -2275,7 +2153,7 @@ static void define_gf_group_structure(AV1_COMP *cpi) { // (3) The bi-predictive group interval is strictly smaller than the // golden group interval. const int is_bipred_enabled = - cpi->bwd_ref_allowed && rc->source_alt_ref_pending && + cpi->extra_arf_allowed && rc->source_alt_ref_pending && rc->bipred_group_interval && rc->bipred_group_interval <= (rc->baseline_gf_interval - rc->source_alt_ref_pending); @@ -2288,14 +2166,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) { int subgroup_interval[MAX_EXT_ARFS + 1]; int is_sg_bipred_enabled = is_bipred_enabled; int accumulative_subgroup_interval = 0; -#else - int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; -#endif // CONFIG_EXT_REFS - -#if !CONFIG_EXT_REFS - get_arf_buffer_indices(arf_buffer_indices); -#endif // !CONFIG_EXT_REFS // For key frames the frame target rate is already set and it // is also the golden frame. 
@@ -2308,25 +2178,16 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->update_type[frame_index] = GF_UPDATE; gf_group->rf_level[frame_index] = GF_ARF_STD; } -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; -#endif // CONFIG_EXT_REFS } -#if CONFIG_EXT_REFS gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -#endif // CONFIG_EXT_REFS frame_index++; -#if CONFIG_EXT_REFS bipred_frame_index++; -#endif // CONFIG_EXT_REFS // === [frame_index == 1] === if (rc->source_alt_ref_pending) { @@ -2335,21 +2196,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->arf_src_offset[frame_index] = (unsigned char)(rc->baseline_gf_interval - 1); -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -// NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS + // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. + // Work out the ARFs' positions in this gf group // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display // order (except for the original ARF). In the example of three ALT_REF's, @@ -2370,11 +2223,9 @@ static void define_gf_group_structure(AV1_COMP *cpi) { subgroup_interval[cpi->num_extra_arfs] = cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index - (cpi->num_extra_arfs == 0 ? 
1 : 2); -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Insert an extra ARF // === [frame_index == 2] === if (cpi->num_extra_arfs) { @@ -2387,43 +2238,12 @@ static void define_gf_group_structure(AV1_COMP *cpi) { ++frame_index; } accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs]; -#else // !CONFIG_EXT_ARFS - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } -#endif // CONFIG_EXT_ARFS } -#if !CONFIG_EXT_REFS - // Define middle frame - mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; -#endif // !CONFIG_EXT_REFS - for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { -#if !CONFIG_EXT_REFS - int arf_idx = 0; - - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - if (frame_index <= mid_frame_idx) arf_idx = 1; - } -#endif // !CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = which_arf; gf_group->arf_ref_idx[frame_index] = which_arf; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; -#endif // CONFIG_EXT_REFS -#if CONFIG_EXT_REFS // If we are going to have ARFs, check whether we can have BWDREF in this // subgroup, and further, whether we can have ARF subgroup which contains // the BWDREF subgroup but contained within the GF group: @@ -2472,18 +2292,14 @@ static void define_gf_group_structure(AV1_COMP *cpi) { bipred_group_end = 1; } } else { -#endif // CONFIG_EXT_REFS gf_group->update_type[frame_index] = LF_UPDATE; gf_group->rf_level[frame_index] = INTER_NORMAL; -#if CONFIG_EXT_REFS 
gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; } -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Check if we need to update the ARF. if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 && frame_index > cpi->arf_pos_for_ovrly[which_arf]) { @@ -2503,25 +2319,19 @@ static void define_gf_group_structure(AV1_COMP *cpi) { ++frame_index; } } -#endif // CONFIG_EXT_REFS } -// NOTE: We need to configure the frame at the end of the sequence + 1 that will -// be the start frame for the next group. Otherwise prior to the call to -// av1_rc_get_second_pass_params() the data will be undefined. -#if CONFIG_EXT_REFS + // NOTE: We need to configure the frame at the end of the sequence + 1 that + // will + // be the start frame for the next group. Otherwise prior to the call to + // av1_rc_get_second_pass_params() the data will be undefined. gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; -#endif // CONFIG_EXT_REFS if (rc->source_alt_ref_pending) { gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->rf_level[frame_index] = INTER_NORMAL; -#if CONFIG_EXT_REFS cpi->arf_pos_in_gf[0] = 1; if (cpi->num_extra_arfs) { // Overwrite the update_type for extra-ARF's corresponding internal @@ -2534,21 +2344,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL; } } -#else - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - } -#endif // CONFIG_EXT_REFS } else { gf_group->update_type[frame_index] = GF_UPDATE; gf_group->rf_level[frame_index] = GF_ARF_STD; } -#if CONFIG_EXT_REFS gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -#endif // CONFIG_EXT_REFS } static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, @@ -2566,18 +2368,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, int64_t total_group_bits = gf_group_bits; double modified_err = 0.0; double err_fraction; - int mid_boost_bits = 0; -#if CONFIG_EXT_REFS int ext_arf_boost[MAX_EXT_ARFS]; -#else - int mid_frame_idx; -#endif // CONFIG_EXT_REFS define_gf_group_structure(cpi); -#if CONFIG_EXT_REFS av1_zero_array(ext_arf_boost, MAX_EXT_ARFS); -#endif // CONFIG_EXT_REFS key_frame = cpi->common.frame_type == KEY_FRAME; @@ -2607,24 +2402,14 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, ++frame_index; -#if CONFIG_EXT_REFS // Skip all the extra-ARF's right after ARF at the starting segment of // the current GF group. if (cpi->num_extra_arfs) { while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) ++frame_index; } -#else // !CONFIG_EXT_ARFS - // Set aside a slot for a level 1 arf. - if (cpi->multi_arf_enabled) ++frame_index; -#endif // CONFIG_EXT_ARFS } -#if !CONFIG_EXT_REFS - // Define middle frame - mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; -#endif // !CONFIG_EXT_REFS - // Allocate bits to the other frames in the group. 
for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { if (EOF == input_stats(twopass, &frame_stats)) break; @@ -2638,15 +2423,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, target_frame_size = (int)((double)total_group_bits * err_fraction); - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - } - target_frame_size = clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits)); -#if CONFIG_EXT_REFS if (gf_group->update_type[frame_index] == BRF_UPDATE) { // Boost up the allocated bits on BWDREF_FRAME gf_group->bit_allocation[frame_index] = @@ -2662,28 +2441,22 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, } else { assert(gf_group->update_type[frame_index] == LF_UPDATE || gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); -#endif // CONFIG_EXT_REFS gf_group->bit_allocation[frame_index] = target_frame_size; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Skip all the extra-ARF's. if (cpi->num_extra_arfs) { while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) ++frame_index; } -#endif // CONFIG_EXT_REFS } // NOTE: We need to configure the frame at the end of the sequence + 1 that // will be the start frame for the next group. Otherwise prior to the // call to av1_rc_get_second_pass_params() the data will be undefined. if (rc->source_alt_ref_pending) { -#if CONFIG_EXT_REFS if (cpi->num_extra_arfs) { // NOTE: For bit allocation, move the allocated bits associated with // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE. @@ -2702,18 +2475,7 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0; } } -#else - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->bit_allocation[mid_frame_idx] = 0; - } -#endif // CONFIG_EXT_REFS } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Analyse and define a gf/arf group. @@ -2761,10 +2523,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; -#if CONFIG_EXT_REFS cpi->extra_arf_allowed = 1; - cpi->bwd_ref_allowed = 1; -#endif // CONFIG_EXT_REFS // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2826,15 +2585,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE double avg_sr_coded_error = 0; double avg_raw_err_stdev = 0; int non_zero_stdev_count = 0; -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE -#if CONFIG_BGSPRITE - double avg_pcnt_second_ref = 0; - int non_zero_pcnt_second_ref_count = 0; -#endif i = 0; while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { @@ -2859,20 +2612,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { accumulate_frame_motion_stats( &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE // sum up the metric values of current gf group avg_sr_coded_error += next_frame.sr_coded_error; if (fabs(next_frame.raw_error_stdev) > 0.000001) { non_zero_stdev_count++; avg_raw_err_stdev += next_frame.raw_error_stdev; } -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE -#if CONFIG_BGSPRITE - if (this_frame->pcnt_second_ref) { - avg_pcnt_second_ref += this_frame->pcnt_second_ref; - } - non_zero_pcnt_second_ref_count++; -#endif // CONFIG_BGSPRITE 
// Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -2912,18 +2657,14 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { (abs_mv_in_out_accumulator > 3.0) || (mv_in_out_accumulator < -2.0) || ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { -#if CONFIG_EXT_REFS // If GF group interval is < 12, we force it to be 8. Otherwise, // if it is >= 12, we keep it as is. // NOTE: 'i' is 1 more than the GF group interval candidate that is being // checked. if (i == (8 + 1) || i >= (12 + 1)) { -#endif // CONFIG_EXT_REFS boost_score = old_boost_score; break; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS } *this_frame = next_frame; @@ -2934,12 +2675,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; assert(num_mbs > 0); if (i) avg_sr_coded_error /= i; -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && @@ -2948,24 +2687,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 
1 - : 0; -#if CONFIG_BGSPRITE - if (non_zero_pcnt_second_ref_count) { - avg_pcnt_second_ref /= non_zero_pcnt_second_ref_count; - } - - cpi->bgsprite_allowed = 1; - if (abs_mv_in_out_accumulator > 0.30 || decay_accumulator < 0.90 || - avg_sr_coded_error / num_mbs < 20 || avg_pcnt_second_ref < 0.30) { - cpi->bgsprite_allowed = 0; - } -#endif // CONFIG_BGSPRITE } else { rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); rc->source_alt_ref_pending = 0; @@ -2973,7 +2694,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Set the interval until the next gf. rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); -#if CONFIG_EXT_REFS if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; // Disable extra altrefs and backward refs for "still" gf group: @@ -2981,13 +2701,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // avg_sr_coded_error: average of the SSE per pixel of each frame; // avg_raw_err_stdev: average of the standard deviation of (0,0) // motion error per block of each frame. - assert(num_mbs > 0); const int disable_bwd_extarf = (zero_motion_accumulator > MIN_ZERO_MOTION && avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && avg_raw_err_stdev < MAX_RAW_ERR_VAR); - if (disable_bwd_extarf) cpi->extra_arf_allowed = cpi->bwd_ref_allowed = 0; + if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; if (!cpi->extra_arf_allowed) { cpi->num_extra_arfs = 0; @@ -2998,15 +2717,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Currently at maximum two extra ARFs' are allowed assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); -#endif // CONFIG_EXT_REFS rc->frames_till_gf_update_due = rc->baseline_gf_interval; -#if CONFIG_EXT_REFS rc->bipred_group_interval = BFG_INTERVAL; // The minimum bi-predictive frame group interval is 2. if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0; -#endif // CONFIG_EXT_REFS // Reset the file position. 
reset_fpf_position(twopass, start_pos); @@ -3226,7 +2942,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -3397,6 +3112,8 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, + // kf_bits, twopass->kf_zeromotion_pct); // Work out the fraction of the kf group bits reserved for the inter frames // within the group after discounting the bits for the kf itself. @@ -3433,17 +3150,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) { int ref_fb_idx_prev[REF_FRAMES]; int ref_fb_idx_curr[REF_FRAMES]; - ref_fb_idx_prev[LAST_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; - ref_fb_idx_prev[LAST2_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]; - ref_fb_idx_prev[LAST3_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]; - ref_fb_idx_prev[GOLDEN_FRAME - LAST_FRAME] = cpi->gld_fb_idx; - ref_fb_idx_prev[BWDREF_FRAME - LAST_FRAME] = cpi->bwd_fb_idx; - ref_fb_idx_prev[ALTREF2_FRAME - LAST_FRAME] = cpi->alt2_fb_idx; - ref_fb_idx_prev[ALTREF_FRAME - LAST_FRAME] = cpi->alt_fb_idx; - ref_fb_idx_prev[REF_FRAMES - LAST_FRAME] = cpi->ext_fb_idx; + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame]; + } // Update map index for each reference frame for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { @@ -3451,17 +3160,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, 
int gf_frame_index) { ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME]; } - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST_FRAME - LAST_FRAME]; - cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST2_FRAME - LAST_FRAME]; - cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST3_FRAME - LAST_FRAME]; - cpi->gld_fb_idx = ref_fb_idx_curr[GOLDEN_FRAME - LAST_FRAME]; - cpi->bwd_fb_idx = ref_fb_idx_curr[BWDREF_FRAME - LAST_FRAME]; - cpi->alt2_fb_idx = ref_fb_idx_curr[ALTREF2_FRAME - LAST_FRAME]; - cpi->alt_fb_idx = ref_fb_idx_curr[ALTREF_FRAME - LAST_FRAME]; - cpi->ext_fb_idx = ref_fb_idx_curr[REF_FRAMES - LAST_FRAME]; + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame]; + } } // Define the reference buffers that will be updated post encode. @@ -3487,26 +3188,36 @@ static void configure_buffer_updates_16(AV1_COMP *cpi) { // Update refresh index switch (gf_group->refresh_idx[gf_group->index]) { case LAST_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME]; break; case LAST2_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME]; break; case LAST3_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME]; break; - case GOLDEN_FRAME: cpi->refresh_fb_idx = cpi->gld_fb_idx; break; + case GOLDEN_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + break; - case BWDREF_FRAME: cpi->refresh_fb_idx = cpi->bwd_fb_idx; break; + case BWDREF_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1]; + break; - case ALTREF2_FRAME: cpi->refresh_fb_idx = cpi->alt2_fb_idx; break; + case ALTREF2_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; + break; - case 
ALTREF_FRAME: cpi->refresh_fb_idx = cpi->alt_fb_idx; break; + case ALTREF_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + break; - case REF_FRAMES: cpi->refresh_fb_idx = cpi->ext_fb_idx; break; + case REF_FRAMES: + cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1]; + break; default: assert(0); break; } @@ -3579,7 +3290,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { // cpi->rc.is_$Source_Type to make this function as it is in the comment? cpi->rc.is_src_frame_alt_ref = 0; -#if CONFIG_EXT_REFS cpi->rc.is_bwd_ref_frame = 0; cpi->rc.is_last_bipred_frame = 0; cpi->rc.is_bipred_frame = 0; @@ -3592,22 +3302,21 @@ static void configure_buffer_updates(AV1_COMP *cpi) { return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_bwd_ref_frame = 1; cpi->refresh_alt2_ref_frame = 1; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 1; break; - case LF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; break; @@ -3616,35 +3325,30 @@ static void configure_buffer_updates(AV1_COMP *cpi) { // needed. 
cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; break; case OVERLAY_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; cpi->rc.is_src_frame_alt_ref = 1; break; - case ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS + case ARF_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; // NOTE: BWDREF does not get updated along with ALTREF_FRAME. cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 1; break; -#if CONFIG_EXT_REFS case BRF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; @@ -3693,7 +3397,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { cpi->refresh_alt2_ref_frame = 1; cpi->refresh_alt_ref_frame = 0; break; -#endif // CONFIG_EXT_REFS default: assert(0); break; } @@ -3734,11 +3437,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. - if (gf_group->update_type[gf_group->index] == ARF_UPDATE -#if CONFIG_EXT_REFS - || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE -#endif // CONFIG_EXT_REFS - ) { + if (gf_group->update_type[gf_group->index] == ARF_UPDATE || + gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { configure_buffer_updates(cpi); target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); @@ -3850,6 +3550,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { // applied when combining MB error values for the frame. 
twopass->mb_av_energy = log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); + twopass->frame_avg_haar_energy = + log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); } // Update the total stats remaining structure. diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index 9ac542bf3..4ff0f73b0 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -42,7 +42,6 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif -#if CONFIG_EXT_REFS // Length of the bi-predictive frame group (BFG) // NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain // number of bi-predictive frames. @@ -64,7 +63,6 @@ typedef struct { #define MAX_SR_CODED_ERROR 40 #define MAX_RAW_ERR_VAR 2000 #define MIN_MV_IN_OUT 0.4 -#endif // CONFIG_EXT_REFS #define VLOW_MOTION_THRESHOLD 950 @@ -72,6 +70,7 @@ typedef struct { double frame; double weight; double intra_error; + double frame_avg_wavelet_energy; double coded_error; double sr_coded_error; double pcnt_inter; @@ -91,10 +90,8 @@ typedef struct { double new_mv_count; double duration; double count; -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE // standard deviation for (0, 0) motion prediction error double raw_error_stdev; -#endif // CONFIG_EXT_REFS } FIRSTPASS_STATS; typedef enum { @@ -103,16 +100,12 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, -#if CONFIG_EXT_REFS BRF_UPDATE = 5, // Backward Reference Frame LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2) FRAME_UPDATE_TYPES = 10 -#else // !CONFIG_EXT_REFS - FRAME_UPDATE_TYPES = 5 -#endif // CONFIG_EXT_REFS } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -129,13 +122,11 @@ typedef struct { unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char 
arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; -#if CONFIG_EXT_REFS unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES]; unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1]; -#endif // CONFIG_EXT_REFS int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; } GF_GROUP; @@ -153,6 +144,7 @@ typedef struct { double modified_error_max; double modified_error_left; double mb_av_energy; + double frame_avg_haar_energy; #if CONFIG_FP_MB_STATS uint8_t *frame_mb_stats_buf; @@ -198,7 +190,6 @@ void av1_rc_get_second_pass_params(struct AV1_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index); #endif // USE_GF16_MULTI_LAYER @@ -213,7 +204,6 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { else return 0; } -#endif // CONFIG_EXT_REFS #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c deleted file mode 100644 index a31bb9ef6..000000000 --- a/third_party/aom/av1/encoder/generic_encoder.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include - -#include "aom_dsp/bitwriter.h" -#include "av1/common/generic_code.h" -#include "av1/common/odintrin.h" -#include "pvq_encoder.h" - -/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts - * the cdf accordingly. - * - * @param [in,out] w multi-symbol entropy encoder - * @param [in] val variable being encoded - * @param [in,out] cdf CDF of the variable (Q15) - * @param [in] n number of values possible - * @param [in,out] count number of symbols encoded with that cdf so far - * @param [in] rate adaptation rate shift (smaller is faster) - */ -void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n, - int *count, int rate) { - int i; - if (*count == 0) { - /* On the first call, we normalize the cdf to (32768 - n). This should - eventually be moved to the state init, but for now it makes it much - easier to experiment and convert symbols to the Q15 adaptation.*/ - int ft; - ft = cdf[n - 1]; - for (i = 0; i < n; i++) { - cdf[i] = AOM_ICDF(cdf[i]*32768/ft); - } - } - aom_write_cdf(w, val, cdf, n); - aom_cdf_adapt_q15(val, cdf, n, count, rate); -} - -/** Encodes a random variable using a "generic" model, assuming that the - * distribution is one-sided (zero and up), has a single mode, and decays - * exponentially past the model. 
- * - * @param [in,out] w multi-symbol entropy encoder - * @param [in,out] model generic probability model - * @param [in] x variable being encoded - * @param [in,out] ExQ16 expectation of x (adapted) - * @param [in] integration integration period of ExQ16 (leaky average over - * 1<> 1); - /* Choose the cdf to use: we have two per "octave" of ExQ16 */ - id = OD_MINI(GENERIC_TABLES - 1, lg_q1); - cdf = model->cdf[id]; - xs = (x + (1 << shift >> 1)) >> shift; - aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16); - if (xs >= 15) { - int e; - unsigned decay; - /* Estimate decay based on the assumption that the distribution is close - to Laplacian for large values. We should probably have an adaptive - estimate instead. Note: The 2* is a kludge that's not fully understood - yet. */ - OD_ASSERT(*ex_q16 < INT_MAX >> 1); - e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift; - decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256))); - /* Encode the tail of the distribution assuming exponential decay. */ - aom_laplace_encode_special(w, xs - 15, decay); - } - if (shift != 0) { - int special; - /* Because of the rounding, there's only half the number of possibilities - for xs=0. */ - special = xs == 0; - if (shift - special > 0) { - aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)), - shift - special); - } - } - generic_model_update(ex_q16, x, integration); - OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG, - "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng)); -} - -/** Estimates the cost of encoding a value with generic_encode(). 
- * - * @param [in,out] model generic probability model - * @param [in] x variable being encoded - * @param [in,out] ExQ16 expectation of x (adapted) - * @return number of bits (approximation) - */ -double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) { - int lg_q1; - int shift; - int id; - uint16_t *cdf; - int xs; - int extra; - lg_q1 = log_ex(*ex_q16); - /* If expectation is too large, shift x to ensure that - all we have past xs=15 is the exponentially decaying tail - of the distribution */ - shift = OD_MAXI(0, (lg_q1 - 5) >> 1); - /* Choose the cdf to use: we have two per "octave" of ExQ16 */ - id = OD_MINI(GENERIC_TABLES - 1, lg_q1); - cdf = model->cdf[id]; - xs = (x + (1 << shift >> 1)) >> shift; - extra = 0; - if (shift) extra = shift - (xs == 0); - xs = OD_MINI(15, xs); - /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */ - if (xs == 15) extra += 2; - return - extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/cdf[15]); -} - -/*Estimates the cost of encoding a value with a given CDF.*/ -double od_encode_cdf_cost(int val, uint16_t *cdf, int n) { - int total_prob; - int prev_prob; - double val_prob; - OD_ASSERT(n > 0); - total_prob = cdf[n - 1]; - if (val == 0) { - prev_prob = 0; - } - else { - prev_prob = cdf[val - 1]; - } - val_prob = (cdf[val] - prev_prob) / (double)total_prob; - return -OD_LOG2(val_prob); -} diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index 4d44e9a6f..f07d1bc00 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -32,12 +32,14 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -#define ERRORADV_MAX_THRESH 0.995 -#define ERRORADV_COST_PRODUCT_THRESH 26000 +static const double erroradv_tr[] = { 0.75, 0.70, 0.65 }; +static const double erroradv_prod_tr[] = { 22000, 20000, 18000 }; -int is_enough_erroradvantage(double best_erroradvantage, int 
params_cost) { - return best_erroradvantage < ERRORADV_MAX_THRESH && - best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH; +int is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type) { + assert(erroradv_type < GM_ERRORADV_TR_TYPES); + return best_erroradvantage < erroradv_tr[erroradv_type] && + best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type]; } static void convert_to_params(const double *params, int32_t *model) { @@ -76,6 +78,7 @@ static void convert_to_params(const double *params, int32_t *model) { void convert_model_to_params(const double *params, WarpedMotionParams *model) { convert_to_params(params, model->wmmat); model->wmtype = get_gmtype(model); + model->invalid = 0; } // Adds some offset to a global motion parameter and handles @@ -110,32 +113,31 @@ static int32_t add_param_offset(int param_index, int32_t param_value, static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { switch (wmtype) { - case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0; + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; case TRANSLATION: wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; wm->wmmat[3] = 0; - case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break; - case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break; - case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break; - case HOMOGRAPHY: break; default: assert(0); } wm->wmtype = wmtype; } int64_t refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH + TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int 
d_stride, int n_refinements, int64_t best_frame_error) { - static const int max_trans_model_params[TRANS_TYPES] = { - 0, 2, 4, 6, 8, 8, 8 - }; + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; const int border = ERRORADV_BORDER; int i = 0, p; int n_params = max_trans_model_params[wmtype]; @@ -147,35 +149,26 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, int32_t best_param; force_wmtype(wm, wmtype); - best_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, 0, - 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_frame_error); + best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, best_frame_error); best_error = AOMMIN(best_error, best_frame_error); step = 1 << (n_refinements - 1); for (i = 0; i < n_refinements; i++, step >>= 1) { for (p = 0; p < n_params; ++p) { int step_dir = 0; // Skip searches for parameters that are forced to be 0 - if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue; - if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue; param = param_mat + p; curr_param = *param; best_param = curr_param; // look to the left *param = add_param_offset(p, curr_param, -step); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, 
best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -184,14 +177,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, // look to the right *param = add_param_offset(p, curr_param, step); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -203,15 +193,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, // for the biggest step size while (step_dir) { *param = add_param_offset(p, best_param, step * step_dir); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, - d_stride, 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, - best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -229,9 +215,6 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, static INLINE RansacFunc get_ransac_type(TransformationType type) { switch (type) { - case HOMOGRAPHY: return ransac_homography; - case HORTRAPEZOID: return ransac_hortrapezoid; - case VERTRAPEZOID: return ransac_vertrapezoid; case AFFINE: return ransac_affine; case ROTZOOM: return 
ransac_rotzoom; case TRANSLATION: return ransac_translation; @@ -239,7 +222,6 @@ static INLINE RansacFunc get_ransac_type(TransformationType type) { } } -#if CONFIG_HIGHBITDEPTH static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) { int i, j; @@ -257,14 +239,13 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, } return buf_8bit; } -#endif -int compute_global_motion_feature_based( - TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, -#if CONFIG_HIGHBITDEPTH - int bit_depth, -#endif - int *num_inliers_by_motion, double *params_by_motion, int num_motions) { +int compute_global_motion_feature_based(TransformationType type, + YV12_BUFFER_CONFIG *frm, + YV12_BUFFER_CONFIG *ref, int bit_depth, + int *num_inliers_by_motion, + double *params_by_motion, + int num_motions) { int i; int num_frm_corners, num_ref_corners; int num_correspondences; @@ -274,7 +255,6 @@ int compute_global_motion_feature_based( unsigned char *ref_buffer = ref->y_buffer; RansacFunc ransac = get_ransac_type(type); -#if CONFIG_HIGHBITDEPTH if (frm->flags & YV12_FLAG_HIGHBITDEPTH) { // The frame buffer is 16-bit, so we need to convert to 8 bits for the // following code. We cache the result until the frame is released. 
@@ -283,7 +263,6 @@ int compute_global_motion_feature_based( if (ref->flags & YV12_FLAG_HIGHBITDEPTH) { ref_buffer = downconvert_frame(ref, bit_depth); } -#endif // compute interest points in images using FAST features num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height, diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 7fca5327f..2c15753fd 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -24,16 +24,14 @@ extern "C" { void convert_model_to_params(const double *params, WarpedMotionParams *model); -int is_enough_erroradvantage(double erroradv, int params_cost); +int is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type); // Returns the av1_warp_error between "dst" and the result of applying the // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is // modified in place. int64_t refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH + TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int d_stride, int n_refinements, @@ -54,12 +52,12 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, number of inlier feature points for each motion. Params for which the num_inliers entry is 0 should be ignored by the caller. 
*/ -int compute_global_motion_feature_based( - TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, -#if CONFIG_HIGHBITDEPTH - int bit_depth, -#endif - int *num_inliers_by_motion, double *params_by_motion, int num_motions); +int compute_global_motion_feature_based(TransformationType type, + YV12_BUFFER_CONFIG *frm, + YV12_BUFFER_CONFIG *ref, int bit_depth, + int *num_inliers_by_motion, + double *params_by_motion, + int num_motions); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h new file mode 100644 index 000000000..45632da9b --- /dev/null +++ b/third_party/aom/av1/encoder/grain_test_vectors.h @@ -0,0 +1,781 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_GRAIN_TEST_VECTORS_H_ +#define AV1_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. 
+ */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 
0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + { 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 }, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, 
-68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, 
+ { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 
0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 
/* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift 
*/, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, 
-24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 
3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AV1_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c index 89c5bd8a3..180115d9f 100644 --- a/third_party/aom/av1/encoder/hash.c +++ b/third_party/aom/av1/encoder/hash.c @@ -22,7 +22,7 @@ static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, } } -void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { p_crc_calculator->remainder = 0; } @@ -61,9 +61,65 @@ void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, crc_calculator_init_table(p_crc_calculator); } -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length) { +uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) { + CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator; crc_calculator_reset(p_crc_calculator); crc_calculator_process_data(p_crc_calculator, p, length); return crc_calculator_get_crc(p_crc_calculator); } + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. */ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? 
(crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. */ +uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h index a0fd54fb6..8b6227540 100644 --- a/third_party/aom/av1/encoder/hash.h +++ b/third_party/aom/av1/encoder/hash.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_HASH_H_ #define AV1_ENCODER_HASH_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #ifdef __cplusplus @@ -31,9 +32,16 @@ typedef struct _crc_calculator { // calling av1_get_crc_value(). 
void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, uint32_t truncPoly); +uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. */ + uint32_t table[8][256]; +} CRC32C; -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length); +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c index 2378597ad..5a8f8cbba 100644 --- a/third_party/aom/av1/encoder/hash_motion.c +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -1,7 +1,9 @@ #include + +#include "config/av1_rtcd.h" + #include "av1/encoder/hash.h" #include "av1/encoder/hash_motion.h" -#include "./av1_rtcd.h" static const int crc_bits = 16; static const int block_size_bits = 3; @@ -16,7 +18,7 @@ static void hash_table_clear_all(hash_table *p_hash_table) { int max_addr = 1 << (crc_bits + block_size_bits); for (int i = 0; i < max_addr; i++) { if (p_hash_table->p_lookup_table[i] != NULL) { - vector_destroy(p_hash_table->p_lookup_table[i]); + aom_vector_destroy(p_hash_table->p_lookup_table[i]); aom_free(p_hash_table->p_lookup_table[i]); p_hash_table->p_lookup_table[i] = NULL; } @@ -37,11 +39,30 @@ static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride, } } +static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src, + int stride, + uint16_t *p_pixels_in1D) { + uint16_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + static int is_block_2x2_row_same_value(uint8_t *p) { if (p[0] != p[1] || p[2] != p[3]) { return 0; } + return 1; +} +static int is_block16_2x2_row_same_value(uint16_t *p) { + if (p[0] != p[1] || p[2] != 
p[3]) { + return 0; + } return 1; } @@ -49,7 +70,13 @@ static int is_block_2x2_col_same_value(uint8_t *p) { if ((p[0] != p[2]) || (p[1] != p[3])) { return 0; } + return 1; +} +static int is_block16_2x2_col_same_value(uint16_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } return 1; } @@ -63,6 +90,7 @@ static int hash_block_size_to_index(int block_size) { case 16: return 2; case 32: return 3; case 64: return 4; + case 128: return 5; default: return -1; } } @@ -100,11 +128,13 @@ static void hash_table_add_to_table(hash_table *p_hash_table, if (p_hash_table->p_lookup_table[hash_value] == NULL) { p_hash_table->p_lookup_table[hash_value] = aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0])); - vector_setup(p_hash_table->p_lookup_table[hash_value], 10, - sizeof(curr_block_hash[0])); - vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash); + aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); } else { - vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); } } @@ -119,7 +149,7 @@ int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) { Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, uint32_t hash_value) { assert(av1_hash_table_count(p_hash_table, hash_value) > 0); - return vector_begin(p_hash_table->p_lookup_table[hash_value]); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); } int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, @@ -127,8 +157,9 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, if (p_hash_table->p_lookup_table[hash_value1] == NULL) { return 0; } - Iterator iterator = vector_begin(p_hash_table->p_lookup_table[hash_value1]); - Iterator last = 
vector_end(p_hash_table->p_lookup_table[hash_value1]); + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) { if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) { return 1; @@ -146,25 +177,45 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, const int y_end = picture->y_crop_height - height + 1; const int length = width * 2; - uint8_t p[4]; - - int pos = 0; - for (int y_pos = 0; y_pos < y_end; y_pos++) { - for (int x_pos = 0; x_pos < x_end; x_pos++) { - get_pixels_in_1D_char_array_by_block_2x2( - picture->y_buffer + y_pos * picture->y_stride + x_pos, - picture->y_stride, p); - pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); - pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); - - pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); - pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); - - pos++; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = av1_get_crc_value( + &crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = av1_get_crc_value( + &crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + 
get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; } - pos += width - 1; } } @@ -222,14 +273,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, } if (block_size >= 4) { - const int size_minus1 = block_size - 1; + const int size_minus_1 = block_size - 1; pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { dst_pic_block_same_info[2][pos] = (!dst_pic_block_same_info[0][pos] && !dst_pic_block_same_info[1][pos]) || - (((x_pos & size_minus1) == 0) && ((y_pos & size_minus1) == 0)); + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); pos++; } pos += block_size - 1; @@ -276,13 +327,25 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + x_start; - for (int i = 0; i < block_size; i++) { - for (int j = 1; j < block_size; j++) { - if (p[j] != p[0]) { - return 0; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; } - p += stride; } return 1; @@ -293,26 +356,38 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + 
x_start; - for (int i = 0; i < block_size; i++) { - for (int j = 1; j < block_size; j++) { - if (p[j * stride + i] != p[i]) { - return 0; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } } } } - return 1; } // global buffer for hash value calculation of a block // used only in av1_get_block_hash_value() -static uint32_t hash_value_buffer[2][2][1024]; // [first hash/second hash] - // [two buffers used ping-pong] - // [num of 2x2 blocks in 64x64] +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) +// [first hash/second hash] +// [two buffers used ping-pong] +// [num of 2x2 blocks in 128x128] +static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH]; void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, - uint32_t *hash_value1, uint32_t *hash_value2) { - uint8_t pixel_to_hash[4]; + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { uint32_t to_hash[4]; const int add_value = hash_block_size_to_index(block_size) << crc_bits; assert(add_value >= 0); @@ -320,16 +395,34 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, // 2x2 subblock hash values in current CU int sub_block_in_width = (block_size >> 1); - for (int y_pos = 0; y_pos < block_size; y_pos += 2) { - for (int x_pos = 0; x_pos < block_size; x_pos += 2) { - int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); - get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, - stride, pixel_to_hash); - - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, 
pixel_to_hash, sizeof(pixel_to_hash)); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + hash_value_buffer[0][0][pos] = av1_get_crc_value( + &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + hash_value_buffer[1][0][pos] = av1_get_crc_value( + &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + hash_value_buffer[0][0][pos] = av1_get_crc_value( + &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); + hash_value_buffer[1][0][pos] = av1_get_crc_value( + &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); + } } } @@ -349,6 +442,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(srcPos + src_sub_block_in_width + 1 < + AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); to_hash[0] = hash_value_buffer[0][src_idx][srcPos]; to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1]; to_hash[2] = @@ -378,3 +475,5 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, *hash_value1 = (hash_value_buffer[0][dst_idx][0] & 
crc_mask) + add_value; *hash_value2 = hash_value_buffer[1][dst_idx][0]; } + +#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h index 26e1ac46e..8deb92eb6 100644 --- a/third_party/aom/av1/encoder/hash_motion.h +++ b/third_party/aom/av1/encoder/hash_motion.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_HASH_MOTION_H_ #define AV1_ENCODER_HASH_MOTION_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_scale/yv12config.h" #include "third_party/vector/vector.h" @@ -29,7 +30,9 @@ typedef struct _block_hash { uint32_t hash_value2; } block_hash; -typedef struct _hash_table { Vector **p_lookup_table; } hash_table; +typedef struct _hash_table { + Vector **p_lookup_table; +} hash_table; void av1_hash_table_init(hash_table *p_hash_table); void av1_hash_table_destroy(hash_table *p_hash_table); @@ -63,7 +66,8 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, - uint32_t *hash_value1, uint32_t *hash_value2); + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index 6ddeb2b77..0922557d0 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -9,228 +9,73 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "av1/common/idct.h" #include "av1/encoder/hybrid_fwd_txfm.h" -#if CONFIG_CHROMA_2X2 -static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - tran_high_t a1 = src_diff[0]; - tran_high_t b1 = src_diff[1]; - tran_high_t c1 = src_diff[diff_stride]; - tran_high_t d1 = src_diff[1 + diff_stride]; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - a1 = a2 + b2; - b1 = a2 - b2; - c1 = c2 + d2; - d1 = c2 - d2; - - coeff[0] = (tran_low_t)(4 * a1); - coeff[1] = (tran_low_t)(4 * b1); - coeff[2] = (tran_low_t)(4 * c1); - coeff[3] = (tran_low_t)(4 * d1); - - (void)txfm_param; -} -#endif - -static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - if (txfm_param->lossless) { - assert(txfm_param->tx_type == DCT_DCT); - av1_fwht4x4(src_diff, coeff, diff_stride); - return; +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. 
*/ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; } - -#if CONFIG_LGT || CONFIG_DAALA_DCT4 - // only C version has LGTs - av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x16(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - 
av1_fht16x32(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - av1_fht32x16(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT || CONFIG_DAALA_DCT8 - av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_DAALA_DCT16 - av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x16(src_diff, coeff, diff_stride, txfm_param); -#endif // CONFIG_DAALA_DCT16 -} - -static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_MRC_TX - // MRC_DCT currently only has a C implementation - if (txfm_param->tx_type == MRC_DCT) { - av1_fht32x32_c(src_diff, coeff, diff_stride, txfm_param); - return; + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; } -#endif // CONFIG_MRC_TX - av1_fht32x32(src_diff, coeff, diff_stride, txfm_param); -} - -#if CONFIG_TX64X64 -static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 64, txfm_param->tx_type); - else -#endif - av1_fht64x64(src_diff, coeff, diff_stride, txfm_param); -} - 
-static void fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, 64, txfm_param->tx_type); - else -#endif - av1_fht32x64(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 32, txfm_param->tx_type); - else -#endif - av1_fht64x32(src_diff, coeff, diff_stride, txfm_param); -} -#endif // CONFIG_TX64X64 - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) -static void fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht16x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht4x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x16(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht32x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht32x8(src_diff, coeff, diff_stride, txfm_param); -#endif } -static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x32_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x32(src_diff, coeff, diff_stride, txfm_param); -#endif -} -#endif - -#if CONFIG_CHROMA_2X2 -static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - tran_high_t a1 = src_diff[0]; - tran_high_t 
b1 = src_diff[1]; - tran_high_t c1 = src_diff[diff_stride]; - tran_high_t d1 = src_diff[1 + diff_stride]; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - a1 = a2 + b2; - b1 = a2 - b2; - c1 = c2 + d2; - d1 = c2 - d2; - - coeff[0] = (tran_low_t)(4 * a1); - coeff[1] = (tran_low_t)(4 * b1); - coeff[2] = (tran_low_t)(4 * c1); - coeff[3] = (tran_low_t)(4 * d1); - - (void)txfm_param; +void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + av1_fwht4x4_c(input, output, stride); } -#endif static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { @@ -243,22 +88,6 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, return; } switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -267,11 +96,11 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } @@ -317,28 +146,40 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, txfm_param->bd); } +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + 
av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -347,11 +188,11 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, 
diff_stride, tx_type, bd); + break; } } @@ -361,22 +202,6 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -385,11 +210,11 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } @@ -399,22 +224,6 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -423,206 +232,72 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended 
av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } -#if CONFIG_TX64X64 static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. 
- av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 32, 64, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. 
- av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 32, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } + static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. 
- av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 64, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } -#endif // CONFIG_TX64X64 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { - const TX_SIZE tx_size = txfm_param->tx_size; -#if CONFIG_LGT_FROM_PRED - if (txfm_param->use_lgt) { - // if use_lgt is 1, it will override tx_type - assert(is_lgt_allowed(txfm_param->mode, tx_size)); - flgt2d_from_pred_c(src_diff, coeff, diff_stride, txfm_param); - return; - } -#endif // CONFIG_LGT_FROM_PRED - switch (tx_size) { -#if CONFIG_TX64X64 - case TX_64X64: - fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X64: - fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_64X32: - fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); - break; -#endif // CONFIG_TX64X64 - case TX_32X32: - fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X16: - fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break; - case TX_4X8: fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break; - case TX_8X4: fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break; - case TX_8X16: - fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X8: - fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X32: - fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X16: - fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); break; 
-#endif -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X4: - fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_8X32: - fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X8: - fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); - break; -#endif - default: assert(0); break; - } + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); } void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { -#if CONFIG_TX64X64 case TX_64X64: highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); break; @@ -632,7 +307,12 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, case TX_64X32: highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); break; -#endif // CONFIG_TX64X64 + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; case TX_32X32: highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); break; @@ -663,11 +343,18 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, case TX_4X4: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: - highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + 
case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); break; -#endif default: assert(0); break; } } diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h index b25ffb8d8..6155b255a 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -12,7 +12,7 @@ #ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_ #define AV1_ENCODER_HYBRID_FWD_TXFM_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #ifdef __cplusplus extern "C" { diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h index 3a433d9b5..9e526b88b 100644 --- a/third_party/aom/av1/encoder/k_means_template.h +++ b/third_party/aom/av1/encoder/k_means_template.h @@ -23,25 +23,23 @@ #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y) #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM) -static float RENAME(calc_dist)(const float *p1, const float *p2) { - float dist = 0; - int i; - for (i = 0; i < AV1_K_MEANS_DIM; ++i) { - const float diff = p1[i] - p2[i]; +static int RENAME(calc_dist)(const int *p1, const int *p2) { + int dist = 0; + for (int i = 0; i < AV1_K_MEANS_DIM; ++i) { + const int diff = p1[i] - p2[i]; dist += diff * diff; } return dist; } -void RENAME(av1_calc_indices)(const float *data, const float *centroids, +void RENAME(av1_calc_indices)(const int *data, const int *centroids, uint8_t *indices, int n, int k) { - int i, j; - for (i = 0; i < n; ++i) { - float min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); + for (int i = 0; i < n; ++i) { + int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); indices[i] = 0; - for (j = 1; j < k; ++j) { - const float this_dist = RENAME(calc_dist)( - data + i * AV1_K_MEANS_DIM, 
centroids + j * AV1_K_MEANS_DIM); + for (int j = 1; j < k; ++j) { + const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, + centroids + j * AV1_K_MEANS_DIM); if (this_dist < min_dist) { min_dist = this_dist; indices[i] = j; @@ -50,19 +48,16 @@ void RENAME(av1_calc_indices)(const float *data, const float *centroids, } } -static void RENAME(calc_centroids)(const float *data, float *centroids, +static void RENAME(calc_centroids)(const int *data, int *centroids, const uint8_t *indices, int n, int k) { - int i, j, index; - int count[PALETTE_MAX_SIZE]; + int i, j; + int count[PALETTE_MAX_SIZE] = { 0 }; unsigned int rand_state = (unsigned int)data[0]; - assert(n <= 32768); - - memset(count, 0, sizeof(count[0]) * k); memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM); for (i = 0; i < n; ++i) { - index = indices[i]; + const int index = indices[i]; assert(index < k); ++count[index]; for (j = 0; j < AV1_K_MEANS_DIM; ++j) { @@ -76,43 +71,35 @@ static void RENAME(calc_centroids)(const float *data, float *centroids, data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM, sizeof(centroids[0]) * AV1_K_MEANS_DIM); } else { - const float norm = 1.0f / count[i]; - for (j = 0; j < AV1_K_MEANS_DIM; ++j) - centroids[i * AV1_K_MEANS_DIM + j] *= norm; + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids[i * AV1_K_MEANS_DIM + j] = + DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]); + } } } - - // Round to nearest integers. 
- for (i = 0; i < k * AV1_K_MEANS_DIM; ++i) { - centroids[i] = roundf(centroids[i]); - } } -static float RENAME(calc_total_dist)(const float *data, const float *centroids, - const uint8_t *indices, int n, int k) { - float dist = 0; - int i; +static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids, + const uint8_t *indices, int n, int k) { + int64_t dist = 0; (void)k; - - for (i = 0; i < n; ++i) + for (int i = 0; i < n; ++i) { dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids + indices[i] * AV1_K_MEANS_DIM); - + } return dist; } -void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices, +void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices, int n, int k, int max_itr) { - int i; - float this_dist; - float pre_centroids[2 * PALETTE_MAX_SIZE]; + int pre_centroids[2 * PALETTE_MAX_SIZE]; uint8_t pre_indices[MAX_SB_SQUARE]; RENAME(av1_calc_indices)(data, centroids, indices, n, k); - this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k); + int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k); - for (i = 0; i < max_itr; ++i) { - const float pre_dist = this_dist; + for (int i = 0; i < max_itr; ++i) { + const int64_t pre_dist = this_dist; memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM); memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n); @@ -132,6 +119,5 @@ void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices, break; } } - #undef RENAME_ #undef RENAME diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c deleted file mode 100644 index 54ffc88fb..000000000 --- a/third_party/aom/av1/encoder/laplace_encoder.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include - -#include "aom_dsp/bitwriter.h" -#include "av1/common/odintrin.h" -#include "av1/common/pvq.h" -#include "pvq_encoder.h" - -static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt, - int count, int sum, int ctx) { - int shift; - int rest; - int fctx; - if (sum == 0) return; - shift = OD_MAXI(0, OD_ILOG(sum) - 3); - if (shift) { - rest = count & ((1 << shift) - 1); - count >>= shift; - sum >>= shift; - } - fctx = 7*ctx + sum - 1; - aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[fctx], sum + 1); - if (shift) aom_write_literal(w, rest, shift); -} - -void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, - const int *y, int n, int k, int level) { - int mid; - int i; - int count_right; - if (n <= 1 || k == 0) return; - if (k == 1 && n <= 16) { - int cdf_id; - int pos; - cdf_id = od_pvq_k1_ctx(n, level == 0); - for (pos = 0; !y[pos]; pos++); - OD_ASSERT(pos < n); - aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n); - } - else { - mid = n >> 1; - count_right = k; - for (i = 0; i < mid; i++) count_right -= abs(y[i]); - aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n)); - aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1); - aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right, - level + 1); - } -} - -/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't - * do anything special for the zero case. 
- * - * @param [in,out] enc range encoder - * @param [in] x variable to encode (has to be positive) - * @param [in] decay decay factor of the distribution in Q8 format, - * i.e. pdf ~= decay^x - */ -void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) { - int shift; - int xs; - int sym; - const uint16_t *cdf; - shift = 0; - /* We don't want a large decay value because that would require too many - symbols. */ - while (decay > 235) { - decay = (decay*decay + 128) >> 8; - shift++; - } - decay = OD_MINI(decay, 254); - decay = OD_MAXI(decay, 2); - xs = x >> shift; - cdf = EXP_CDF_TABLE[(decay + 1) >> 1]; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay)); - do { - sym = OD_MINI(xs, 15); - { - int i; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d %d\n", x, xs, shift, - sym, max)); - for (i = 0; i < 16; i++) { - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i])); - } - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n")); - } - aom_write_cdf(w, sym, cdf, 16); - xs -= 15; - } while (sym >= 15); - if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift); -} diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c index 591ca6152..1bf8ecbac 100644 --- a/third_party/aom/av1/encoder/lookahead.c +++ b/third_party/aom/av1/encoder/lookahead.c @@ -11,10 +11,9 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" #include "av1/common/common.h" - #include "av1/encoder/encoder.h" #include "av1/encoder/extend.h" #include "av1/encoder/lookahead.h" @@ -42,14 +41,9 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) { } } -struct lookahead_ctx *av1_lookahead_init(unsigned int width, - unsigned int height, - unsigned int subsampling_x, - unsigned int subsampling_y, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif - unsigned int depth) { +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, 
int use_highbitdepth, unsigned int depth) { struct lookahead_ctx *ctx = NULL; // Clamp the lookahead queue depth @@ -68,10 +62,7 @@ struct lookahead_ctx *av1_lookahead_init(unsigned int width, if (!ctx->buf) goto bail; for (i = 0; i < depth; i++) if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, - subsampling_y, -#if CONFIG_HIGHBITDEPTH - use_highbitdepth, -#endif + subsampling_y, use_highbitdepth, AOM_BORDER_IN_PIXELS, legacy_byte_alignment)) goto bail; } @@ -84,10 +75,7 @@ bail: #define USE_PARTIAL_COPY 0 int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, aom_enc_frame_flags_t flags) { struct lookahead_entry *buf; #if USE_PARTIAL_COPY @@ -160,10 +148,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG new_img; memset(&new_img, 0, sizeof(new_img)); if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, - subsampling_y, -#if CONFIG_HIGHBITDEPTH - use_highbitdepth, -#endif + subsampling_y, use_highbitdepth, AOM_BORDER_IN_PIXELS, 0)) return 1; aom_free_frame_buffer(&buf->img); diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h index 19f75d7e4..3897c2a6a 100644 --- a/third_party/aom/av1/encoder/lookahead.h +++ b/third_party/aom/av1/encoder/lookahead.h @@ -44,14 +44,9 @@ struct lookahead_ctx { * The lookahead stage is a queue of frame buffers on which some analysis * may be done when buffers are enqueued. 
*/ -struct lookahead_ctx *av1_lookahead_init(unsigned int width, - unsigned int height, - unsigned int subsampling_x, - unsigned int subsampling_y, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif - unsigned int depth); +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth); /**\brief Destroys the lookahead stage */ @@ -73,10 +68,7 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx); * \param[in] active_map Map that specifies which macroblock is active */ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, aom_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c index 7d2510af9..472173634 100644 --- a/third_party/aom/av1/encoder/mbgraph.c +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -11,8 +11,8 @@ #include -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -47,32 +47,28 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0, cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv); -// Try sub-pixel MC -// if (bestsme > error_thresh && bestsme < INT_MAX) -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level == 1) { + // Try sub-pixel MC + // if (bestsme > error_thresh && bestsme < INT_MAX) + if (cpi->common.cur_frame_force_integer_mv == 1) { x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; } else { -#else - { -#endif int distortion; unsigned int sse; - cpi->find_fractional_mv_step(x, ref_mv, 
cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, 0, - mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); + cpi->find_fractional_mv_step( + x, &cpi->common, mb_row, mb_col, ref_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0, + mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); } - if (has_second_ref(&xd->mi[0]->mbmi)) - xd->mi[0]->mbmi.mode = NEW_NEWMV; + if (has_second_ref(xd->mi[0])) + xd->mi[0]->mode = NEW_NEWMV; else - xd->mi[0]->mbmi.mode = NEWMV; + xd->mi[0]->mode = NEWMV; - xd->mi[0]->mbmi.mv[0] = x->best_mv; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + xd->mi[0]->mv[0] = x->best_mv; + xd->mi[0]->ref_frame[1] = NONE_FRAME; av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL, BLOCK_16X16); @@ -108,7 +104,7 @@ static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
if (ref_mv->row != 0 || ref_mv->col != 0) { - MV zero_ref_mv = { 0, 0 }; + MV zero_ref_mv = kZeroMv; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col); if (tmp_err < err) { @@ -144,14 +140,14 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) { // calculate SATD for each intra prediction mode; // we're intentionally not doing 4x4, we just want a rough estimate - for (mode = DC_PRED; mode <= TM_PRED; mode++) { + for (mode = DC_PRED; mode <= PAETH_PRED; mode++) { unsigned int err; - xd->mi[0]->mbmi.mode = mode; - av1_predict_intra_block(cm, xd, 16, 16, BLOCK_16X16, mode, - x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0, - 0); + xd->mi[0]->mode = mode; + av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0, + FILTER_INTRA_MODES, x->plane[0].src.buf, + x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, 0, 0, 0); err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); @@ -231,8 +227,8 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; - MV gld_top_mv = { 0, 0 }; - MODE_INFO mi_local; + MV gld_top_mv = kZeroMv; + MB_MODE_INFO mi_local; av1_zero(mi_local); // Set up limit values for motion vectors to prevent them extending outside @@ -244,9 +240,9 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi, xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; xd->mi[0] = &mi_local; - mi_local.mbmi.sb_type = BLOCK_16X16; - mi_local.mbmi.ref_frame[0] = LAST_FRAME; - mi_local.mbmi.ref_frame[1] = NONE_FRAME; + mi_local.sb_type = BLOCK_16X16; + mi_local.ref_frame[0] = LAST_FRAME; + mi_local.ref_frame[1] = NONE_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; diff --git a/third_party/aom/av1/encoder/mbgraph.h 
b/third_party/aom/av1/encoder/mbgraph.h index 758e2ad15..3e0a4fa9b 100644 --- a/third_party/aom/av1/encoder/mbgraph.h +++ b/third_party/aom/av1/encoder/mbgraph.h @@ -23,10 +23,12 @@ typedef struct { int_mv mv; PREDICTION_MODE mode; } m; - } ref[TOTAL_REFS_PER_FRAME]; + } ref[REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct AV1_COMP; diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index 6c8503da0..c4572a341 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -13,8 +13,8 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -22,9 +22,11 @@ #include "av1/common/common.h" #include "av1/common/mvref_common.h" +#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rdopt.h" @@ -54,10 +56,9 @@ void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; } -static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits, - int *col_min, int *col_max, - int *row_min, int *row_max, - const MV *ref_mv) { +static void set_subpel_mv_search_range(const MvLimits *mv_limits, int *col_min, + int *col_max, int *row_min, int *row_max, + const MV *ref_mv) { const int max_mv = MAX_FULL_PEL_VAL * 8; const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv); const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv); @@ -172,57 +173,64 @@ void av1_init3smotion_compensation(search_site_config *cfg, int stride) { static INLINE int sp(int x) { return x & 7; } static INLINE const uint8_t *pre(const 
uint8_t *buf, int stride, int r, int c) { - return &buf[(r >> 3) * stride + (c >> 3)]; + const int offset = (r >> 3) * stride + (c >> 3); + return buf + offset; } /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - if (second_pred == NULL) \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, &sse); \ - else if (mask) \ - thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, second_pred, mask, \ - mask_stride, invert_mask, &sse); \ - else \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, &sse, second_pred); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse); \ + } else if (mask) { \ + thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, second_pred, mask, \ + mask_stride, invert_mask, &sse); \ + } else { \ + if (xd->jcp_param.use_jnt_comp_avg) \ + thismse = vfp->jsvaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred, \ + &xd->jcp_param); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred); \ + } \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { 
\ + v = INT_MAX; \ } #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ - pre(y, y_stride, r, c), y_stride, sp(c), \ - sp(r), second_pred, mask, mask_stride, \ - invert_mask, w, h, &sse); \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error( \ + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \ + pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \ + mask_stride, invert_mask, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } #define FIRST_LEVEL_CHECKS \ @@ -294,33 +302,33 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -#define SETUP_SUBPEL_SEARCH \ - const uint8_t *const src_address = x->plane[0].src.buf; \ - const int src_stride = x->plane[0].src.stride; \ - const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ - unsigned int sse; \ - unsigned int whichdir; \ - int thismse; \ - MV *bestmv = &x->best_mv.as_mv; \ - const unsigned int halfiters = iters_per_step; \ - const unsigned int quarteriters = iters_per_step; \ - const unsigned int eighthiters = iters_per_step; \ - const int y_stride = xd->plane[0].pre[0].stride; \ - const int offset = bestmv->row * y_stride + bestmv->col; \ - const uint8_t *const 
y = xd->plane[0].pre[0].buf; \ - \ - int br = bestmv->row * 8; \ - int bc = bestmv->col * 8; \ - int hstep = 4; \ - int minc, maxc, minr, maxr; \ - int tr = br; \ - int tc = bc; \ - \ - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \ - ref_mv); \ - \ - bestmv->row *= 8; \ +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const src_address = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = INT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + MV *bestmv = &x->best_mv.as_mv; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + \ + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \ + ref_mv); \ + \ + bestmv->row *= 8; \ bestmv->col *= 8; static unsigned int setup_center_error( @@ -331,25 +339,34 @@ static unsigned int setup_center_error( int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); - if (mask) + if (mask) { aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); - else - aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h, + y + 
offset, y_stride, &xd->jcp_param); + else + aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); + } besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - if (mask) + if (mask) { aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); - else - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, + y_stride, &xd->jcp_param); + else + aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + } besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -357,22 +374,6 @@ static unsigned int setup_center_error( } *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); -#else - (void)xd; - if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - if (mask) - aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, - mask, mask_stride, invert_mask); - else - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); - besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); - } else { - besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); - } - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); -#endif // CONFIG_HIGHBITDEPTH return besterr; } @@ -401,11 +402,13 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { } int av1_find_best_sub_pixel_tree_pruned_evenmore( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int 
*distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -418,7 +421,10 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -468,13 +474,18 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore( } int av1_find_best_sub_pixel_tree_pruned_more( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -531,13 +542,18 @@ int av1_find_best_sub_pixel_tree_pruned_more( } int av1_find_best_sub_pixel_tree_pruned( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int 
iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -624,7 +640,8 @@ static const MV search_step_table[12] = { }; /* clang-format on */ -static int upsampled_pref_error(const MACROBLOCKD *xd, +static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, const int src_stride, const uint8_t *const y, int y_stride, @@ -633,73 +650,105 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, int mask_stride, int invert_mask, int w, int h, unsigned int *sse) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); if (second_pred != NULL) { - if (mask) + if (mask) { aom_highbd_comp_mask_upsampled_pred( - pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride, - mask, mask_stride, invert_mask, xd->bd); - else - aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, - subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_highbd_jnt_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param); + else + aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, + 
second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd); + } } else { - aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); } besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); - (void)xd; -#endif // CONFIG_HIGHBITDEPTH if (second_pred != NULL) { - if (mask) - aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, - mask_stride, invert_mask); - else - aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + if (mask) { + aom_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_jnt_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, &xd->jcp_param); + else + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, + second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); + } } else { - aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); } besterr = vfp->vf(pred, w, src, src_stride, sse); -#if CONFIG_HIGHBITDEPTH } -#endif return besterr; } static unsigned int upsampled_setup_center_error( - const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, - const uint8_t *const src, const int src_stride, const uint8_t *const y, - int y_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, - int 
*mvcost[2], unsigned int *sse1, int *distortion) { + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, + const int src_stride, const uint8_t *const y, int y_stride, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], + unsigned int *sse1, int *distortion) { unsigned int besterr = upsampled_pref_error( - xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred, mask, - mask_stride, invert_mask, w, h, sse1); + xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset, + y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } +// when use_accurate_subpel_search == 0 +static INLINE unsigned int estimate_upsampled_pref_error( + MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, + const int src_stride, const uint8_t *const pre, int y_stride, + int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) { + if (second_pred == NULL) { + return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse); + } else if (mask) { + return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + second_pred, mask, mask_stride, invert_mask, sse); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + return vfp->jsvaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, + src_stride, sse, second_pred, &xd->jcp_param); + else + return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse, second_pred); + } +} + int av1_find_best_sub_pixel_tree( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int 
mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { const uint8_t *const src_address = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; - const MACROBLOCKD *xd = &x->e_mbd; + MACROBLOCKD *xd = &x->e_mbd; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int thismse; @@ -720,8 +769,7 @@ int av1_find_best_sub_pixel_tree( int kr, kc; int minc, maxc, minr, maxr; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv); if (!allow_hp) if (round == 3) round = 2; @@ -729,12 +777,11 @@ int av1_find_best_sub_pixel_tree( bestmv->row *= 8; bestmv->col *= 8; - // use_upsampled_ref can be 0 or 1 - if (use_upsampled_ref) + if (use_accurate_subpel_search) besterr = upsampled_setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, mask, mask_stride, invert_mask, w, h, offset, - mvjcost, mvcost, sse1, distortion); + xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w, + h, offset, mvjcost, mvcost, sse1, distortion); else besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -751,23 +798,16 @@ int av1_find_best_sub_pixel_tree( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), second_pred, mask, 
- mask_stride, invert_mask, w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_pref_error( + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, + mask, mask_stride, invert_mask, w, h, &sse); } else { - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse); - else if (mask) - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, second_pred, mask, - mask_stride, invert_mask, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + thismse = estimate_upsampled_pref_error( + xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), + y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride, + invert_mask, &sse); } cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, @@ -793,24 +833,16 @@ int av1_find_best_sub_pixel_tree( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), second_pred, mask, - mask_stride, invert_mask, w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_pref_error( + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, + mask, mask_stride, invert_mask, w, h, &sse); } else { - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else if (mask) - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, second_pred, mask, - mask_stride, invert_mask, &sse); - else - thismse = 
vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + thismse = estimate_upsampled_pref_error( + xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), + y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride, + invert_mask, &sse); } cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, @@ -835,7 +867,7 @@ int av1_find_best_sub_pixel_tree( } if (iters_per_step > 1 && best_idx != -1) { - if (use_upsampled_ref) { + if (use_accurate_subpel_search) { SECOND_LEVEL_CHECKS_BEST(1); } else { SECOND_LEVEL_CHECKS_BEST(0); @@ -861,63 +893,51 @@ int av1_find_best_sub_pixel_tree( #undef PRE #undef CHECK_BETTER -#if CONFIG_WARPED_MOTION unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *this_mv) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; const uint8_t *const src = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; uint8_t *const dst = xd->plane[0].dst.buf; const int dst_stride = xd->plane[0].dst.stride; const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize]; - const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + const int_mv ref_mv = av1_get_ref_mv(x, 0); unsigned int mse; unsigned int sse; av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); - mse += - mv_err_cost(this_mv, &ref_mv, x->nmvjointcost, x->mvcost, x->errorperbit); + mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, + x->errorperbit); return mse; } // Refine MV in a small range -#if WARPED_MOTION_SORT_SAMPLES unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, - int *pts0, int *pts_inref0, int *pts_mv0, + int *pts0, int *pts_inref0, int total_samples) { -#else -unsigned int 
av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int *pts, int *pts_inref) { -#endif // WARPED_MOTION_SORT_SAMPLES const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } }; - const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + const int_mv ref_mv = av1_get_ref_mv(x, 0); int16_t br = mbmi->mv[0].as_mv.row; int16_t bc = mbmi->mv[0].as_mv.col; int16_t *tr = &mbmi->mv[0].as_mv.row; int16_t *tc = &mbmi->mv[0].as_mv.col; WarpedMotionParams best_wm_params = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES int best_num_proj_ref = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES unsigned int bestmse; int minc, maxc, minr, maxr; const int start = cm->allow_high_precision_mv ? 0 : 4; int ite; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - &ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + &ref_mv.as_mv); // Calculate the center position's error assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr); @@ -937,15 +957,13 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) { MV this_mv = { *tr, *tc }; -#if WARPED_MOTION_SORT_SAMPLES int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); if (total_samples > 1) mbmi->num_proj_ref[0] = - sortSamples(pts_mv0, &this_mv, pts, pts_inref, total_samples); -#endif // WARPED_MOTION_SORT_SAMPLES + selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, *tc, 
&mbmi->wm_params[0], mi_row, mi_col)) { @@ -955,9 +973,7 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (thismse < bestmse) { best_idx = idx; best_wm_params = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES best_num_proj_ref = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES bestmse = thismse; } } @@ -975,12 +991,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, *tr = br; *tc = bc; mbmi->wm_params[0] = best_wm_params; -#if WARPED_MOTION_SORT_SAMPLES mbmi->num_proj_ref[0] = best_num_proj_ref; -#endif // WARPED_MOTION_SORT_SAMPLES return bestmse; } -#endif // CONFIG_WARPED_MOTION static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -1386,11 +1399,19 @@ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, const MV mv = { best_mv->row * 8, best_mv->col * 8 }; unsigned int unused; - return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, - what->buf, what->stride, &unused, second_pred) + - (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) - : 0); + if (xd->jcp_param.use_jnt_comp_avg) + return vfp->jsvaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred, + &xd->jcp_param) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); + else + return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred) + + (use_mvcost ? 
mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); } int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv, @@ -1785,205 +1806,6 @@ int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg, return bestsad; } -static int vector_match(int16_t *ref, int16_t *src, int bwl) { - int best_sad = INT_MAX; - int this_sad; - int d; - int center, offset = 0; - int bw = 4 << bwl; // redundant variable, to be changed in the experiments. - for (d = 0; d <= bw; d += 16) { - this_sad = aom_vector_var(&ref[d], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - offset = d; - } - } - center = offset; - - for (d = -8; d <= 8; d += 16) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -4; d <= 4; d += 8) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -2; d <= 2; d += 4) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -1; d <= 1; d += 2) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - - return (center - (bw >> 1)); -} - -static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, -}; - -unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, 
int mi_row, - int mi_col) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; - DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]); - DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]); - DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]); - int idx; - const int src_stride = x->plane[0].src.stride; - const int ref_stride = xd->plane[0].pre[0].stride; - uint8_t const *ref_buf, *src_buf; - MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv; - unsigned int best_sad, tmp_sad, sad_arr[4]; - MV this_mv; - const YV12_BUFFER_CONFIG *scaled_ref_frame = - av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); - - if (scaled_ref_frame) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); - } - -#if CONFIG_HIGHBITDEPTH - { - unsigned int this_sad; - tmp_mv->row = 0; - tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; - } - return this_sad; - } -#endif - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int search_width = bw << 1; - const int search_height = bh << 1; - const int norm_factor = 3 + (bw >> 5); - - // Set up prediction 1-D reference set - ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); - for (idx = 0; idx < search_width; idx += 16) { - aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); - ref_buf += 16; - } - - ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; - for 
(idx = 0; idx < search_height; ++idx) { - vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; - ref_buf += ref_stride; - } - - // Set up src 1-D reference set - for (idx = 0; idx < bw; idx += 16) { - src_buf = x->plane[0].src.buf + idx; - aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); - } - - src_buf = x->plane[0].src.buf; - for (idx = 0; idx < bh; ++idx) { - src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; - src_buf += src_stride; - } - - // Find the best match per 1-D search - tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]); - tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]); - - this_mv = *tmp_mv; - src_buf = x->plane[0].src.buf; - ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; - best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); - - { - const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, - }; - - cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr); - } - - for (idx = 0; idx < 4; ++idx) { - if (sad_arr[idx] < best_sad) { - best_sad = sad_arr[idx]; - tmp_mv->row = search_pos[idx].row + this_mv.row; - tmp_mv->col = search_pos[idx].col + this_mv.col; - } - } - - if (sad_arr[0] < sad_arr[3]) - this_mv.row -= 1; - else - this_mv.row += 1; - - if (sad_arr[1] < sad_arr[2]) - this_mv.col -= 1; - else - this_mv.col += 1; - - ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; - - tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); - if (best_sad > tmp_sad) { - *tmp_mv = this_mv; - best_sad = tmp_sad; - } - - tmp_mv->row *= 8; - tmp_mv->col *= 8; - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; - } - - return best_sad; -} - /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond 
refining search */ @@ -2110,197 +1932,6 @@ static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x, return bestsme; } -int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max 
= AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned 
int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const aom_variance_fn_ptr_t *fn_ptr, @@ -2394,16 +2025,23 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - if (mask) + if (mask) { best_sad = fn_ptr->msdf(what->buf, 
what->stride, get_buf_from_mv(in_what, best_mv), in_what->stride, second_pred, mask, mask_stride, invert_mask) + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); - else - best_sad = - fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), - in_what->stride, second_pred) + - mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + best_sad = fn_ptr->jsdaf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred, &xd->jcp_param) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + else + best_sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + } for (i = 0; i < search_range; ++i) { int best_site = -1; @@ -2414,14 +2052,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad; - if (mask) + if (mask) { sad = fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, second_pred, mask, mask_stride, invert_mask); - else - sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, - second_pred); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + sad = fn_ptr->jsdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred, &xd->jcp_param); + else + sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred); + } if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2454,45 +2098,10 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; } -#if CONFIG_HASH_ME -#define MAX_HASH_MV_TABLE_SIZE 5 -static void add_to_sort_table(block_hash 
block_hashes[MAX_HASH_MV_TABLE_SIZE], - int costs[MAX_HASH_MV_TABLE_SIZE], int *existing, - int max_size, block_hash curr_block, - int curr_cost) { - if (*existing < max_size) { - block_hashes[*existing] = curr_block; - costs[*existing] = curr_cost; - (*existing)++; - } else { - int max_cost = 0; - int max_cost_idx = 0; - for (int i = 0; i < max_size; i++) { - if (costs[i] > max_cost) { - max_cost = costs[i]; - max_cost_idx = i; - } - } - - if (curr_cost < max_cost) { - block_hashes[max_cost_idx] = curr_block; - costs[max_cost_idx] = curr_cost; - } - } -} -#endif - -#if CONFIG_HASH_ME int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra) { -#else -int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, int var_max, - int rd) { -#endif const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = sf->mv.search_method; const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; @@ -2539,7 +2148,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (is_exhaustive_allowed(cpi, x)) { int exhuastive_thr = sf->exhaustive_searches_thresh; exhuastive_thr >>= - 10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); // Threshold variance for an exhaustive full search. 
if (var > exhuastive_thr) { @@ -2556,44 +2165,37 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } break; - - break; default: assert(0 && "Invalid search method."); } if (method != NSTEP && rd && var < var_max) var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1); -#if CONFIG_HASH_ME do { - if (!cpi->common.allow_screen_content_tools) { - break; - } + if (!av1_use_hash_me(&cpi->common)) break; + // already single ME // get block size and original buffer of current block const int block_height = block_size_high[bsize]; const int block_width = block_size_wide[bsize]; if (block_height == block_width && x_pos >= 0 && y_pos >= 0) { if (block_width == 4 || block_width == 8 || block_width == 16 || - block_width == 32 || block_width == 64) { + block_width == 32 || block_width == 64 || block_width == 128) { uint8_t *what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; - block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE]; - int costs[MAX_HASH_MV_TABLE_SIZE]; - int existing = 0; - int i; uint32_t hash_value1, hash_value2; MV best_hash_mv; int best_hash_cost = INT_MAX; // for the hashMap hash_table *ref_frame_hash = - intra ? &cpi->common.cur_frame->hash_table - : get_ref_frame_hash_map(cpi, - x->e_mbd.mi[0]->mbmi.ref_frame[0]); + intra + ? &cpi->common.cur_frame->hash_table + : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]); - av1_get_block_hash_value(what, what_stride, block_width, &hash_value1, - &hash_value2); + av1_get_block_hash_value( + what, what_stride, block_width, &hash_value1, &hash_value2, + x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); // for intra, at lest one matching can be found, itself. 
@@ -2603,44 +2205,31 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); - for (i = 0; i < count; i++, iterator_increment(&iterator)) { + for (int i = 0; i < count; i++, iterator_increment(&iterator)) { block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator)); if (hash_value2 == ref_block_hash.hash_value2) { - // for intra, make sure the prediction is from valid area - // not predict from current block. - // TODO(roger): check if the constrain is necessary - if (intra && - ref_block_hash.y + block_height > - ((y_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2) && - ref_block_hash.x + block_width > - ((x_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2)) { - continue; + // For intra, make sure the prediction is from valid area. + if (intra) { + const int mi_col = x_pos / MI_SIZE; + const int mi_row = y_pos / MI_SIZE; + const MV dv = { 8 * (ref_block_hash.y - y_pos), + 8 * (ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col, + bsize, cpi->common.seq_params.mib_size_log2)) + continue; + } + MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!is_mv_in(&x->mv_limits, &hash_mv)) continue; + const int refCost = + av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + best_hash_mv = hash_mv; } - int refCost = - abs(ref_block_hash.x - x_pos) + abs(ref_block_hash.y - y_pos); - add_to_sort_table(block_hashes, costs, &existing, - MAX_HASH_MV_TABLE_SIZE, ref_block_hash, refCost); - } - } - - if (existing == 0) { - break; - } - - for (i = 0; i < existing; i++) { - MV hash_mv; - hash_mv.col = block_hashes[i].x - x_pos; - hash_mv.row = block_hashes[i].y - y_pos; - if (!is_mv_in(&x->mv_limits, &hash_mv)) { - continue; - } - int currHashCost = av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1); - if (currHashCost < 
best_hash_cost) { - best_hash_cost = currHashCost; - best_hash_mv = hash_mv; } } - if (best_hash_cost < var) { x->second_best_mv = x->best_mv; x->best_mv.as_mv = best_hash_mv; @@ -2649,12 +2238,10 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } } while (0); -#endif return var; } -#if CONFIG_MOTION_VAR /* returns subpixel variance error function */ #define DIST(r, c) \ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse) @@ -2687,20 +2274,21 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) #undef CHECK_BETTER1 -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = \ - upsampled_obmc_pref_error(xd, mask, vfp, z, pre(y, y_stride, r, c), \ - y_stride, sp(c), sp(r), w, h, &sse); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \ + mask, vfp, z, pre(y, y_stride, r, c), \ + y_stride, sp(c), sp(r), w, h, &sse); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } static unsigned int setup_obmc_center_error( @@ -2715,60 +2303,55 @@ static unsigned int setup_obmc_center_error( return besterr; } -static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask, - const aom_variance_fn_ptr_t *vfp, - const int32_t *const wsrc, - const uint8_t *const y, int y_stride, - int subpel_x_q3, int subpel_y_q3, int w, - int h, unsigned int *sse) { +static int upsampled_obmc_pref_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, 
+ const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp, + const int32_t *const wsrc, const uint8_t *const y, int y_stride, + int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); - (void)xd; -#endif // CONFIG_HIGHBITDEPTH - aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); besterr = vfp->ovf(pred, w, wsrc, mask, sse); -#if CONFIG_HIGHBITDEPTH } -#endif return besterr; } static unsigned int upsampled_setup_obmc_center_error( - const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv, - const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, - const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w, - int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, - int *distortion) { - unsigned int besterr = upsampled_obmc_pref_error( - xd, mask, vfp, wsrc, y + offset, y_stride, 0, 0, w, h, sse1); + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, + const uint8_t *const y, int y_stride, int w, int h, int offset, + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { + unsigned int besterr = + upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, 
vfp, wsrc, + y + offset, y_stride, 0, 0, w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } int av1_find_best_obmc_sub_pixel_tree_up( - MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, int is_second, int use_upsampled_ref) { + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_accurate_subpel_search) { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; const int *const z = wsrc; const int *const src_address = z; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int thismse; @@ -2794,8 +2377,7 @@ int av1_find_best_obmc_sub_pixel_tree_up( int minc, maxc, minr, maxr; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv); y = pd->pre[is_second].buf; y_stride = pd->pre[is_second].stride; @@ -2806,11 +2388,11 @@ int av1_find_best_obmc_sub_pixel_tree_up( bestmv->row *= 8; bestmv->col *= 8; - // use_upsampled_ref can be 0 or 1 - if (use_upsampled_ref) + // use_accurate_subpel_search can be 0 or 1 + if (use_accurate_subpel_search) besterr = upsampled_setup_obmc_center_error( - xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h, - offset, mvjcost, mvcost, sse1, distortion); + xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, + y_stride, w, h, offset, mvjcost, 
mvcost, sse1, distortion); else besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, offset, mvjcost, mvcost, @@ -2823,15 +2405,13 @@ int av1_find_best_obmc_sub_pixel_tree_up( tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - - if (use_upsampled_ref) { - thismse = - upsampled_obmc_pref_error(xd, mask, vfp, src_address, pre_address, - y_stride, sp(tc), sp(tr), w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); } else { - thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, mask, &sse); + thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), + sp(tr), src_address, mask, &sse); } cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, @@ -2856,10 +2436,10 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2887,7 +2467,7 @@ int av1_find_best_obmc_sub_pixel_tree_up( } if (iters_per_step > 1 && best_idx != -1) { - if (use_upsampled_ref) { + if (use_accurate_subpel_search) { SECOND_LEVEL_CHECKS_BEST(1); } else { SECOND_LEVEL_CHECKS_BEST(0); @@ -3123,89 +2703,98 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, } return bestsme; } -#endif 
// CONFIG_MOTION_VAR // Note(yunqingwang): The following 2 functions are only used in the motion // vector unit test, which return extreme motion vectors allowed by the MV // limits. -#define COMMON_MV_TEST \ - SETUP_SUBPEL_SEARCH; \ - \ - (void)error_per_bit; \ - (void)vfp; \ - (void)src_address; \ - (void)src_stride; \ - (void)y; \ - (void)y_stride; \ - (void)second_pred; \ - (void)w; \ - (void)h; \ - (void)use_upsampled_ref; \ - (void)offset; \ - (void)mvjcost; \ - (void)mvcost; \ - (void)sse1; \ - (void)distortion; \ - \ - (void)halfiters; \ - (void)quarteriters; \ - (void)eighthiters; \ - (void)whichdir; \ - (void)forced_stop; \ - (void)hstep; \ - \ - (void)tr; \ - (void)tc; \ - (void)sse; \ - (void)thismse; \ +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)src_address; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)use_accurate_subpel_search; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)forced_stop; \ + (void)hstep; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ (void)cost_list; // Return the maximum MV. 
-int av1_return_max_sub_pixel_mv( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { +int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *ref_mv, + int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { COMMON_MV_TEST; (void)mask; (void)mask_stride; (void)invert_mask; (void)minr; (void)minc; + + (void)cm; + (void)mi_row; + (void)mi_col; + bestmv->row = maxr; bestmv->col = maxc; besterr = 0; -// In the sub-pel motion search, if hp is not used, then the last bit of mv -// has to be 0. -#if CONFIG_AMVR + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. lower_mv_precision(bestmv, allow_hp, 0); -#else - lower_mv_precision(bestmv, allow_hp); -#endif return besterr; } // Return the minimum MV. 
-int av1_return_min_sub_pixel_mv( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { +int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *ref_mv, + int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; (void)maxc; (void)mask; (void)mask_stride; (void)invert_mask; + + (void)cm; + (void)mi_row; + (void)mi_col; + bestmv->row = minr; bestmv->col = minc; besterr = 0; -// In the sub-pel motion search, if hp is not used, then the last bit of mv -// has to be 0. -#if CONFIG_AMVR + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. 
lower_mv_precision(bestmv, allow_hp, 0); -#else - lower_mv_precision(bestmv, allow_hp); -#endif return besterr; } diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 2c53075cc..539e8f4e4 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -69,10 +69,9 @@ struct SPEED_FEATURES; int av1_init_search_range(int size); -int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const struct mv *center_mv); +int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit, + int distance, const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv); // Runs sequence of diamond searches in smaller steps for RD. int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, @@ -81,24 +80,20 @@ int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, const aom_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv); -// Perform integral projection based motion estimation. 
-unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi, - MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); - int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit, int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv); typedef int(fractional_mv_step_fp)( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, - int use_upsampled_ref); + int use_accurate_subpel_search); extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; @@ -123,52 +118,33 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, int invert_mask, const MV *center_mv, const uint8_t *second_pred); -struct AV1_COMP; - -#if CONFIG_HASH_ME int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra); -#else -int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, MV *mvp_full, int step_param, - int error_per_bit, int *cost_list, const MV *ref_mv, - int var_max, int rd); -#endif -#if CONFIG_MOTION_VAR int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, const aom_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv, int is_second); int av1_find_best_obmc_sub_pixel_tree_up( - MACROBLOCK *x, MV 
*bestmv, const MV *ref_mv, int allow_hp, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, int is_second, int use_upsampled_ref); -#endif // CONFIG_MOTION_VAR -#ifdef __cplusplus -} // extern "C" -#endif + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_accurate_subpel_search); -#if CONFIG_WARPED_MOTION unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *this_mv); -#if WARPED_MOTION_SORT_SAMPLES unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *pts0, - int *pts_inref0, int *pts_mv0, - int total_samples); -#else -unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, - MACROBLOCK *const x, BLOCK_SIZE bsize, - int mi_row, int mi_col, int *pts, - int *pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION + int *pts_inref0, int total_samples); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c index 8d13af7ad..2e86dee43 100644 --- a/third_party/aom/av1/encoder/mips/msa/error_msa.c +++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c deleted file mode 100644 index 4b0364d6c..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" -#include "aom_dsp/mips/fwd_txfm_msa.h" - -static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, - const int32_t *const0, int16_t *int_buf) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; - v4i32 k0, k1, k2, k3; - - /* load input data */ - r0 = LD_SH(input); - r15 = LD_SH(input + 15 * stride); - r7 = LD_SH(input + 7 * stride); - r8 = LD_SH(input + 8 * stride); - SLLI_4V(r0, r15, r7, r8, 2); - - /* stage 1 */ - LD_SW2(const0, 4, k0, k1); - LD_SW2(const0 + 8, 4, k2, k3); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - - r3 = LD_SH(input + 3 * stride); - r4 = LD_SH(input + 4 * stride); - r11 = LD_SH(input + 11 * stride); - r12 = LD_SH(input + 12 * stride); - SLLI_4V(r3, r4, r11, r12, 2); - - LD_SW2(const0 + 4 * 4, 4, k0, k1); - LD_SW2(const0 + 4 * 6, 4, k2, k3); - MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, 
g10, g11); - - /* stage 2 */ - BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); - ST_SH2(tp0, tp2, int_buf, 8); - ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); - - LD_SW2(const0 + 4 * 8, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 10); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - - ST_SH2(h0, h1, int_buf + 8 * 8, 8); - ST_SH2(h3, h2, int_buf + 12 * 8, 8); - - r9 = LD_SH(input + 9 * stride); - r6 = LD_SH(input + 6 * stride); - r1 = LD_SH(input + stride); - r14 = LD_SH(input + 14 * stride); - SLLI_4V(r9, r6, r1, r14, 2); - - LD_SW2(const0 + 4 * 11, 4, k0, k1); - LD_SW2(const0 + 4 * 13, 4, k2, k3); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); - - ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); - - r13 = LD_SH(input + 13 * stride); - r2 = LD_SH(input + 2 * stride); - r5 = LD_SH(input + 5 * stride); - r10 = LD_SH(input + 10 * stride); - SLLI_4V(r13, r2, r5, r10, 2); - - LD_SW2(const0 + 4 * 15, 4, k0, k1); - LD_SW2(const0 + 4 * 17, 4, k2, k3); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); - - ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); - - BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); - ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); -} - -static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0, - int16_t *out, int16_t *out_ptr) { - v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; - v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 out8, out9, out10, out11, out12, out13, out14, out15; - v4i32 k0, k1, k2, k3; - - LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); - LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); - LD_SW2(const0 + 4 * 19, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 21); - MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); - - tp0 = LD_SH(int_buf + 4 * 8); - tp1 = LD_SH(int_buf + 5 * 8); - tp3 = LD_SH(int_buf + 10 * 8); - tp2 = LD_SH(int_buf + 14 * 8); - LD_SW2(const0 + 4 * 22, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 24); - MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, 
out4, out6, out5, out7); - out4 = -out4; - ST_SH(out4, (out + 3 * 16)); - ST_SH(out5, (out_ptr + 4 * 16)); - - h1 = LD_SH(int_buf + 9 * 8); - h3 = LD_SH(int_buf + 12 * 8); - MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); - out13 = -out13; - ST_SH(out12, (out + 2 * 16)); - ST_SH(out13, (out_ptr + 5 * 16)); - - tp0 = LD_SH(int_buf); - tp1 = LD_SH(int_buf + 8); - tp2 = LD_SH(int_buf + 2 * 8); - tp3 = LD_SH(int_buf + 6 * 8); - - BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); - out1 = -out1; - ST_SH(out0, (out)); - ST_SH(out1, (out_ptr + 7 * 16)); - - h0 = LD_SH(int_buf + 8 * 8); - h2 = LD_SH(int_buf + 13 * 8); - - BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); - out8 = -out8; - ST_SH(out8, (out + 16)); - ST_SH(out9, (out_ptr + 6 * 16)); - - /* stage 4 */ - LD_SW2(const0 + 4 * 25, 4, k0, k1); - LD_SW2(const0 + 4 * 27, 4, k2, k3); - MADD_SHORT(h10, h11, k1, k2, out2, out3); - ST_SH(out2, (out + 7 * 16)); - ST_SH(out3, (out_ptr)); - - MADD_SHORT(out6, out7, k0, k3, out6, out7); - ST_SH(out6, (out + 4 * 16)); - ST_SH(out7, (out_ptr + 3 * 16)); - - MADD_SHORT(out10, out11, k0, k3, out10, out11); - ST_SH(out10, (out + 6 * 16)); - ST_SH(out11, (out_ptr + 16)); - - MADD_SHORT(out14, out15, k1, k2, out14, out15); - ST_SH(out14, (out + 5 * 16)); - ST_SH(out15, (out_ptr + 2 * 16)); -} - -static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, - int16_t *out) { - fadst16_step2_msa_helper(int_buf, const0, out, out + 128); -} - -static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - FDCT_POSTPROC_2V_NEG_H(r0, r1); - FDCT_POSTPROC_2V_NEG_H(r2, r3); - FDCT_POSTPROC_2V_NEG_H(r4, r5); - 
FDCT_POSTPROC_2V_NEG_H(r6, r7); - ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); - out += 64; - - LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - FDCT_POSTPROC_2V_NEG_H(r8, r9); - FDCT_POSTPROC_2V_NEG_H(r10, r11); - FDCT_POSTPROC_2V_NEG_H(r12, r13); - FDCT_POSTPROC_2V_NEG_H(r14, r15); - ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); - out += 64; - - /* load input data */ - input += 128; - LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - FDCT_POSTPROC_2V_NEG_H(r0, r1); - FDCT_POSTPROC_2V_NEG_H(r2, r3); - FDCT_POSTPROC_2V_NEG_H(r4, r5); - FDCT_POSTPROC_2V_NEG_H(r6, r7); - ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); - out += 64; - - LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - FDCT_POSTPROC_2V_NEG_H(r8, r9); - FDCT_POSTPROC_2V_NEG_H(r10, r11); - FDCT_POSTPROC_2V_NEG_H(r12, r13); - FDCT_POSTPROC_2V_NEG_H(r14, r15); - ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); -} - -static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, - int16_t *int_buf) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; - v4i32 k0, k1, k2, k3; - - /* load input data */ - r0 = LD_SH(input); - r7 = LD_SH(input + 7 * 8); - r8 = LD_SH(input + 8 * 8); - r15 = LD_SH(input + 15 * 8); - - /* stage 1 */ - LD_SW2(const0, 4, k0, k1); - LD_SW2(const0 + 4 * 2, 4, k2, k3); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - - r3 = LD_SH(input + 3 * 8); - r4 = LD_SH(input + 4 * 8); - r11 = LD_SH(input + 11 * 8); - r12 = LD_SH(input + 12 * 8); - - LD_SW2(const0 + 4 * 4, 4, k0, k1); - LD_SW2(const0 + 4 * 6, 4, k2, k3); - MADD_BF(r11, r4, r3, 
r12, k0, k1, k2, k3, g8, g9, g10, g11); - - /* stage 2 */ - BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); - ST_SH2(tp0, tp1, int_buf, 4 * 8); - ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); - - LD_SW2(const0 + 4 * 8, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 10); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); - ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); - - r1 = LD_SH(input + 8); - r6 = LD_SH(input + 6 * 8); - r9 = LD_SH(input + 9 * 8); - r14 = LD_SH(input + 14 * 8); - - LD_SW2(const0 + 4 * 11, 4, k0, k1); - LD_SW2(const0 + 4 * 13, 4, k2, k3); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); - ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); - - r2 = LD_SH(input + 2 * 8); - r5 = LD_SH(input + 5 * 8); - r10 = LD_SH(input + 10 * 8); - r13 = LD_SH(input + 13 * 8); - - LD_SW2(const0 + 4 * 15, 4, k0, k1); - LD_SW2(const0 + 4 * 17, 4, k2, k3); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); - ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); - BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); - ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); -} - -static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, - int16_t *out) { - fadst16_step2_msa_helper(int_buf, const0, out, out + 8); -} - -static void fadst16_transpose_msa(int16_t *input, int16_t *out) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); - ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); - out += 16 * 8; - - /* load input data */ - input += 128; - LD_SH16(input, 8, l0, l8, 
l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); - ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); -} - -static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { - int16_t *temp = intermediate; - int16_t *out = output; - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; - v8i16 in12, in13, in14, in15; - - LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); - temp = intermediate + 8; - LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - FDCT_POSTPROC_2V_NEG_H(in0, in1); - FDCT_POSTPROC_2V_NEG_H(in2, in3); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - FDCT_POSTPROC_2V_NEG_H(in6, in7); - FDCT_POSTPROC_2V_NEG_H(in8, in9); - FDCT_POSTPROC_2V_NEG_H(in10, in11); - FDCT_POSTPROC_2V_NEG_H(in12, in13); - FDCT_POSTPROC_2V_NEG_H(in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, - tmp7, in8, in9, in10, in11, in12, in13, in14, in15); - temp = intermediate; - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - temp = intermediate; - LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, 
tmp0, in0, - tmp1, in1, tmp2, in2, tmp3, in3); - ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, - tmp5, in5, tmp6, in6, tmp7, in7); - out = output + 8; - ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); -} - -void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - DECLARE_ALIGNED(32, int16_t, tmp[256]); - DECLARE_ALIGNED(32, int16_t, trans_buf[256]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); - int32_t i; - int16_t *ptmpbuf = &tmp_buf[0]; - int16_t *trans = &trans_buf[0]; - const int32_t const_arr[29 * 4] = { - 52707308, 52707308, 52707308, 52707308, -1072430300, - -1072430300, -1072430300, -1072430300, 795618043, 795618043, - 795618043, 795618043, -721080468, -721080468, -721080468, - -721080468, 459094491, 459094491, 459094491, 459094491, - -970646691, -970646691, -970646691, -970646691, 1010963856, - 1010963856, 1010963856, 1010963856, -361743294, -361743294, - -361743294, -361743294, 209469125, 209469125, 209469125, - 209469125, -1053094788, -1053094788, -1053094788, -1053094788, - 1053160324, 1053160324, 1053160324, 1053160324, 639644520, - 639644520, 639644520, 639644520, -862444000, -862444000, - -862444000, -862444000, 1062144356, 1062144356, 1062144356, - 1062144356, -157532337, -157532337, -157532337, -157532337, - 260914709, 260914709, 260914709, 260914709, -1041559667, - -1041559667, -1041559667, -1041559667, 920985831, 920985831, - 920985831, 920985831, -551995675, -551995675, -551995675, - -551995675, 596522295, 596522295, 596522295, 596522295, - 892853362, 892853362, 892853362, 892853362, -892787826, - -892787826, -892787826, -892787826, 410925857, 410925857, - 410925857, 410925857, -992012162, -992012162, -992012162, - -992012162, 992077698, 992077698, 992077698, 992077698, - 759246145, 759246145, 759246145, 759246145, -759180609, - -759180609, -759180609, -759180609, -759222975, -759222975, - 
-759222975, -759222975, 759288511, 759288511, 759288511, - 759288511 - }; - - switch (tx_type) { - case DCT_DCT: - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); - } - break; - case ADST_DCT: - /* column transform */ - for (i = 0; i < 2; ++i) { - fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); - fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); - } - break; - case DCT_ADST: - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); - } - - fadst16_transpose_postproc_msa(tmp, trans); - - /* row transform */ - for (i = 0; i < 2; ++i) { - fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); - fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); - } - - fadst16_transpose_msa(tmp, output); - break; - case ADST_ADST: - /* column transform */ - for (i = 0; i < 2; ++i) { - fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); - fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); - } - - fadst16_transpose_postproc_msa(tmp, trans); - - /* row transform */ - for (i = 0; i < 2; ++i) { - fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); - fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); - } - - fadst16_transpose_msa(tmp, output); - break; - default: assert(0); break; - } -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c index da1ac74f0..085c08bfb 100644 --- a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c +++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c @@ -12,7 +12,6 @@ #include #include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" void 
av1_fwht4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { @@ -45,54 +44,3 @@ void av1_fwht4x4_msa(const int16_t *input, int16_t *output, ST4x2_UB(in1, output + 8, 4); ST4x2_UB(in2, output + 12, 4); } - -void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - v8i16 in0, in1, in2, in3; - - LD_SH4(input, stride, in0, in1, in2, in3); - - /* fdct4 pre-process */ - { - v8i16 temp, mask; - v16i8 zero = { 0 }; - v16i8 one = __msa_ldi_b(1); - - mask = (v8i16)__msa_sldi_b(zero, one, 15); - SLLI_4V(in0, in1, in2, in3, 4); - temp = __msa_ceqi_h(in0, 0); - temp = (v8i16)__msa_xori_b((v16u8)temp, 255); - temp = mask & temp; - in0 += temp; - } - - switch (tx_type) { - case DCT_DCT: - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_DCT: - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case DCT_ADST: - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_ADST: - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - default: assert(0); break; - } - - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - SRA_4V(in0, in1, in2, in3, 2); - PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); - ST_SH2(in0, in2, output, 8); -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c deleted file mode 100644 index 4cbf60a11..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c +++ 
/dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" - -void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - - switch (tx_type) { - case DCT_DCT: - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case ADST_DCT: - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case DCT_ADST: - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case ADST_ADST: - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, 
in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - default: assert(0); break; - } - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h deleted file mode 100644 index 52bcf790c..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ -#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ - -#include "aom_dsp/mips/fwd_txfm_msa.h" -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_ports/mem.h" - -#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ - cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in7, in0, in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in5, in2, in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); 
\ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ - cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ - } - -#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ - v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ - \ - UNPCK_R_SH_SW(in0, in0_r_m); \ - UNPCK_R_SH_SW(in1, in1_r_m); \ - UNPCK_R_SH_SW(in2, in2_r_m); \ - UNPCK_R_SH_SW(in3, in3_r_m); \ - \ - constant_m = __msa_fill_w(sinpi_4_9); \ - MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ - \ - constant_m = __msa_fill_w(sinpi_1_9); \ - s0_m += in0_r_m * constant_m; \ - s1_m -= in1_r_m * constant_m; \ - \ - constant_m = __msa_fill_w(sinpi_2_9); \ - s0_m += in1_r_m * constant_m; \ - s1_m += in3_r_m * constant_m; \ - \ - s2_m = in0_r_m + in1_r_m - in3_r_m; \ - \ - constant_m = __msa_fill_w(sinpi_3_9); \ - MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ - \ - in0_r_m = s0_m + s3_m; \ - s2_m = s1_m - s3_m; \ - s3_m = s1_m - s0_m + s3_m; \ - \ - SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ - out0, out1, out2, out3); \ - } -#endif // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c index 4ec679642..531ae090a 
100644 --- a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c +++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c new file mode 100644 index 000000000..3a27e5845 --- /dev/null +++ b/third_party/aom/av1/encoder/ml.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/encoder/ml.h" + +void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; + + // Propagate hidden layers. 
+ const int num_layers = nn_config->num_hidden_layers; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (int layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (int node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (int i = 0; i < num_input_nodes; ++i) + val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + const float *weights = nn_config->weights[num_layers]; + for (int node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (int i = 0; i < num_input_nodes; ++i) + val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } +} diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h new file mode 100644 index 000000000..614cb60bb --- /dev/null +++ b/third_party/aom/av1/encoder/ml.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AV1_ENCODER_ML_H_ +#define AV1_ENCODER_ML_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. + const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c index f34b82544..e61cd02ce 100644 --- a/third_party/aom/av1/encoder/palette.c +++ b/third_party/aom/av1/encoder/palette.c @@ -23,16 +23,14 @@ #include "av1/encoder/k_means_template.h" #undef AV1_K_MEANS_DIM -static int float_comparer(const void *a, const void *b) { - const float fa = *(const float *)a; - const float fb = *(const float *)b; - return (fa > fb) - (fa < fb); +static int int_comparer(const void *a, const void *b) { + return (*(int *)a - *(int *)b); } -int av1_remove_duplicates(float *centroids, int num_centroids) { +int av1_remove_duplicates(int *centroids, int num_centroids) { int num_unique; // number of unique centroids int i; - qsort(centroids, num_centroids, sizeof(*centroids), float_comparer); + qsort(centroids, num_centroids, sizeof(*centroids), int_comparer); // Remove duplicates. 
num_unique = 1; for (i = 1; i < num_centroids; ++i) { @@ -43,7 +41,6 @@ int av1_remove_duplicates(float *centroids, int num_centroids) { return num_unique; } -#if CONFIG_PALETTE_DELTA_ENCODING static int delta_encode_cost(const int *colors, int num, int bit_depth, int min_val) { if (num <= 0) return 0; @@ -116,15 +113,11 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, } return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); } -#endif // CONFIG_PALETTE_DELTA_ENCODING int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, -#if CONFIG_PALETTE_DELTA_ENCODING uint16_t *color_cache, int n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth) { const int n = pmi->palette_size[0]; -#if CONFIG_PALETTE_DELTA_ENCODING int out_cache_colors[PALETTE_MAX_SIZE]; uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; const int n_out_cache = @@ -132,19 +125,13 @@ int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, cache_color_found, out_cache_colors); const int total_bits = n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); - return total_bits * av1_cost_bit(128, 0); -#else - return bit_depth * n * av1_cost_bit(128, 0); -#endif // CONFIG_PALETTE_DELTA_ENCODING + return av1_cost_literal(total_bits); } int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, -#if CONFIG_PALETTE_DELTA_ENCODING uint16_t *color_cache, int n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth) { const int n = pmi->palette_size[1]; -#if CONFIG_PALETTE_DELTA_ENCODING int total_bits = 0; // U channel palette color cost. 
int out_cache_colors[PALETTE_MAX_SIZE]; @@ -163,8 +150,5 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; const int bits_using_raw = bit_depth * n; total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); - return total_bits * av1_cost_bit(128, 0); -#else - return 2 * bit_depth * n * av1_cost_bit(128, 0); -#endif // CONFIG_PALETTE_DELTA_ENCODING + return av1_cost_literal(total_bits); } diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h index efd89f66f..bbdd50784 100644 --- a/third_party/aom/av1/encoder/palette.h +++ b/third_party/aom/av1/encoder/palette.h @@ -20,22 +20,22 @@ extern "C" { #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim -void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const float *data, - const float *centroids, +void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data, + const int *centroids, uint8_t *indices, int n, int k); -void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const float *data, - const float *centroids, +void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data, + const int *centroids, uint8_t *indices, int n, int k); -void AV1_K_MEANS_RENAME(av1_k_means, 1)(const float *data, float *centroids, +void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids, uint8_t *indices, int n, int k, int max_itr); -void AV1_K_MEANS_RENAME(av1_k_means, 2)(const float *data, float *centroids, +void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids, uint8_t *indices, int n, int k, int max_itr); // Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim', // calculate the centroid 'indices' for the data points. 
-static INLINE void av1_calc_indices(const float *data, const float *centroids, +static INLINE void av1_calc_indices(const int *data, const int *centroids, uint8_t *indices, int n, int k, int dim) { if (dim == 1) { AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k); @@ -50,7 +50,7 @@ static INLINE void av1_calc_indices(const float *data, const float *centroids, // dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get // updated 'centroids' and the centroid 'indices' for elements in 'data'. // Note: the output centroids are rounded off to nearest integers. -static INLINE void av1_k_means(const float *data, float *centroids, +static INLINE void av1_k_means(const int *data, int *centroids, uint8_t *indices, int n, int k, int dim, int max_itr) { if (dim == 1) { @@ -66,9 +66,8 @@ static INLINE void av1_k_means(const float *data, float *centroids, // puts these unique centroids in first 'k' indices of 'centroids' array. // Ideally, the centroids should be rounded to integers before calling this // method. -int av1_remove_duplicates(float *centroids, int num_centroids); +int av1_remove_duplicates(int *centroids, int num_centroids); -#if CONFIG_PALETTE_DELTA_ENCODING // Given a color cache and a set of base colors, find if each cache color is // present in the base colors, record the binary results in "cache_color_found". // Record the colors that are not in the color cache in "out_cache_colors". @@ -80,20 +79,14 @@ int av1_index_color_cache(const uint16_t *color_cache, int n_cache, // assign zero_count with the number of deltas being 0. int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, int bit_depth, int *zero_count, int *min_bits); -#endif // CONFIG_PALETTE_DELTA_ENCODING // Return the rate cost for transmitting luma palette color values. 
int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, -#if CONFIG_PALETTE_DELTA_ENCODING - uint16_t *color_cache, int n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING - int bit_depth); + uint16_t *color_cache, int n_cache, int bit_depth); // Return the rate cost for transmitting chroma palette color values. int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, -#if CONFIG_PALETTE_DELTA_ENCODING uint16_t *color_cache, int n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth); #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c index accc97e57..4f6265617 100644 --- a/third_party/aom/av1/encoder/pickcdef.c +++ b/third_party/aom/av1/encoder/pickcdef.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" + #include "aom/aom_integer.h" #include "av1/common/cdef.h" #include "av1/common/onyxc_int.h" @@ -23,7 +24,7 @@ #define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS) #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS) -static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 }; +static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 }; /* Search for the best strength to add as an option, knowing we already selected nb_strengths options. */ @@ -68,16 +69,11 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, int fast) { uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; -#if !CONFIG_CDEF_SINGLEPASS - const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; -#endif int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id0 = 0; int best_id1 = 0; -#if CONFIG_CDEF_SINGLEPASS const int total_strengths = fast ? 
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; -#endif memset(tot_mse, 0, sizeof(tot_mse)); for (i = 0; i < sb_count; i++) { int gi; @@ -204,10 +200,9 @@ static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); return (uint64_t)floor( - .5 + - (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * - (svar + dvar + (400 << 2 * coeff_shift)) / - (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar))); + .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * + (svar + dvar + (400 << 2 * coeff_shift)) / + (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar))); } static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, @@ -290,7 +285,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int fbr, fbc; uint16_t *src[3]; uint16_t *ref_coeff[3]; - cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; + static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int stride[3]; @@ -310,32 +305,27 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); uint64_t(*mse[2])[TOTAL_STRENGTHS]; -#if CONFIG_CDEF_SINGLEPASS int pri_damping = 3 + (cm->base_qindex >> 6); -#else - int pri_damping = 6; -#endif int sec_damping = 3 + (cm->base_qindex >> 6); int i; int nb_strengths; int nb_strength_bits; int quantizer; double lambda; - int nplanes = 3; + const int num_planes = av1_num_planes(cm); const int total_strengths = fast ? 
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); uint16_t *in; - DECLARE_ALIGNED(32, uint16_t, tmp_dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]); - int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && - xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); quantizer = - av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); + av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); lambda = .12 * quantizer * quantizer / 256.; - av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0); + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + num_planes); mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb); mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb); - for (pli = 0; pli < nplanes; pli++) { + for (pli = 0; pli < num_planes; pli++) { uint8_t *ref_buffer; int ref_stride; switch (pli) { @@ -371,20 +361,16 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, for (r = 0; r < frame_height; ++r) { for (c = 0; c < frame_width; ++c) { -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR( xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c]; ref_coeff[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c]; } else { -#endif src[pli][r * stride[pli] + c] = xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c]; ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c]; -#if CONFIG_HIGHBITDEPTH } -#endif } } } @@ -397,13 +383,33 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int dirinit = 0; nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc); nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr); - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] - ->mbmi.cdef_strength = -1; 
+ int hb_step = 1; + int vb_step = 1; + BLOCK_SIZE bs = BLOCK_64X64; + MB_MODE_INFO *const mbmi = + cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + + MI_SIZE_64X64 * fbc]; + if (((fbc & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128))) + continue; + if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 || + mbmi->sb_type == BLOCK_64X128) + bs = mbmi->sb_type; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + // No filtering if the entire filter block is skipped if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue; cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64, - fbc * MI_SIZE_64X64, dlist, 1); - for (pli = 0; pli < nplanes; pli++) { + fbc * MI_SIZE_64X64, dlist, bs); + for (pli = 0; pli < num_planes; pli++) { for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE; for (gi = 0; gi < total_strengths; gi++) { int threshold; @@ -411,7 +417,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int sec_strength; threshold = gi / CDEF_SEC_STRENGTHS; if (fast) threshold = priconv[threshold]; - if (pli > 0 && !chroma_cdef) threshold = 0; /* We avoid filtering the pixels for which some of the pixels to average are outside the frame. 
We could change the filter instead, but it @@ -419,11 +424,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int yoff = CDEF_VBORDER * (fbr != 0); int xoff = CDEF_HBORDER * (fbc != 0); int ysize = (nvb << mi_high_l2[pli]) + - CDEF_VBORDER * (fbr != nvfb - 1) + yoff; + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff; int xsize = (nhb << mi_wide_l2[pli]) + - CDEF_HBORDER * (fbc != nhfb - 1) + xoff; + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff; sec_strength = gi % CDEF_SEC_STRENGTHS; -#if CONFIG_CDEF_SINGLEPASS copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, src[pli], (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff, @@ -433,19 +437,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, dir, &dirinit, var, pli, dlist, cdef_count, threshold, sec_strength + (sec_strength == 3), pri_damping, sec_damping, coeff_shift); -#else - if (sec_strength == 0) - copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, - src[pli], - (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff, - (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff, - stride[pli], ysize, xsize); - cdef_filter_fb(sec_strength ? 
NULL : (uint8_t *)in, CDEF_BSTRIDE, - tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, - pli, dlist, cdef_count, threshold, - sec_strength + (sec_strength == 3), sec_damping, - pri_damping, coeff_shift, sec_strength != 0, 1); -#endif curr_mse = compute_cdef_dist( ref_coeff[pli] + (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] + @@ -470,7 +461,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int best_lev0[CDEF_MAX_STRENGTHS]; int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; nb_strengths = 1 << i; - if (nplanes >= 3) + if (num_planes >= 3) tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, fast); else @@ -500,14 +491,14 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, best_gi = 0; for (gi = 0; gi < cm->nb_cdef_strengths; gi++) { uint64_t curr = mse[0][i][cm->cdef_strengths[gi]]; - if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]]; + if (num_planes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]]; if (curr < best_mse) { best_gi = gi; best_mse = curr; } } selected_strength[i] = best_gi; - cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi; + cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi; } if (fast) { @@ -526,7 +517,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, cm->cdef_sec_damping = sec_damping; aom_free(mse[0]); aom_free(mse[1]); - for (pli = 0; pli < nplanes; pli++) { + for (pli = 0; pli < num_planes; pli++) { aom_free(src[pli]); aom_free(ref_coeff[pli]); } diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index d8b6f9074..5f802a707 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -12,7 +12,7 @@ #include #include -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/psnr.h" @@ -27,74 +27,6 @@ #include "av1/encoder/encoder.h" 
#include "av1/encoder/picklpf.h" -#if CONFIG_LPF_SB -#if CONFIG_HIGHBITDEPTH -static int compute_sb_y_sse_highbd(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *frame, - AV1_COMMON *const cm, int mi_row, - int mi_col) { - int sse = 0; - const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET); - const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows); - const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols); - - const int row = mi_row_start * MI_SIZE; - const int col = mi_col_start * MI_SIZE; - const uint16_t *src_y = - CONVERT_TO_SHORTPTR(src->y_buffer) + row * src->y_stride + col; - const uint16_t *frame_y = - CONVERT_TO_SHORTPTR(frame->y_buffer) + row * frame->y_stride + col; - const int row_end = (mi_row_end - mi_row_start) * MI_SIZE; - const int col_end = (mi_col_end - mi_col_start) * MI_SIZE; - - int x, y; - for (y = 0; y < row_end; ++y) { - for (x = 0; x < col_end; ++x) { - const int diff = src_y[x] - frame_y[x]; - sse += diff * diff; - } - src_y += src->y_stride; - frame_y += frame->y_stride; - } - return sse; -} -#endif - -static int compute_sb_y_sse(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *frame, - AV1_COMMON *const cm, int mi_row, int mi_col) { - int sse = 0; - const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET); - const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows); - const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols); - - const int row = mi_row_start * MI_SIZE; - const int col = mi_col_start * MI_SIZE; - const uint8_t *src_y = src->y_buffer + 
row * src->y_stride + col; - const uint8_t *frame_y = frame->y_buffer + row * frame->y_stride + col; - const int row_end = (mi_row_end - mi_row_start) * MI_SIZE; - const int col_end = (mi_col_end - mi_col_start) * MI_SIZE; - - int x, y; - for (y = 0; y < row_end; ++y) { - for (x = 0; x < col_end; ++x) { - const int diff = src_y[x] - frame_y[x]; - sse += diff * diff; - } - src_y += src->y_stride; - frame_y += frame->y_stride; - } - return sse; -} -#endif // CONFIG_LPF_SB - -#if !CONFIG_LPF_SB static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int plane) { switch (plane) { @@ -104,7 +36,6 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, default: assert(plane >= 0 && plane <= 2); break; } } -#endif // CONFIG_LPF_SB int av1_get_max_filter_level(const AV1_COMP *cpi) { if (cpi->oxcf.pass == 2) { @@ -115,195 +46,58 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) { } } -#if CONFIG_LPF_SB -// TODO(chengchen): reduce memory usage by copy superblock instead of frame -static int try_filter_superblock(const YV12_BUFFER_CONFIG *sd, - AV1_COMP *const cpi, int filt_level, - int partial_frame, int mi_row, int mi_col) { - AV1_COMMON *const cm = &cpi->common; - int filt_err; - -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame, mi_row, mi_col); -#else - if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, - filt_level, 1, partial_frame, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); - else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, - 1, partial_frame); -#endif - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - filt_err = - compute_sb_y_sse_highbd(sd, cm->frame_to_show, cm, mi_row, mi_col); - } else { - filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col); - } -#else - filt_err = compute_sb_y_sse(sd, 
cm->frame_to_show, cm, mi_row, mi_col); -#endif // CONFIG_HIGHBITDEPTH - - // TODO(chengchen): Copy the superblock only - // Re-instate the unfiltered frame - aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - - return filt_err; -} - -static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - int partial_frame, double *best_cost_ret, - int mi_row, int mi_col, int last_lvl) { - assert(partial_frame == 1); - assert(last_lvl >= 0); - - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *x = &cpi->td.mb; - - int min_filter_level = AOMMAX(0, last_lvl - MAX_LPF_OFFSET); - int max_filter_level = - AOMMIN(av1_get_max_filter_level(cpi), last_lvl + MAX_LPF_OFFSET); - - // search a larger range for the start superblock - if (mi_row == 0 && mi_col == 0) { - min_filter_level = 0; - max_filter_level = av1_get_max_filter_level(cpi); - } - - // TODO(chengchen): Copy for superblock only - // Make a copy of the unfiltered / processed recon buffer - aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - - int estimate_err = - try_filter_superblock(sd, cpi, last_lvl, partial_frame, mi_row, mi_col); - - int best_err = estimate_err; - int filt_best = last_lvl; - - int i; - for (i = min_filter_level; i <= max_filter_level; i += LPF_STEP) { - if (i == last_lvl) continue; - - int filt_err = - try_filter_superblock(sd, cpi, i, partial_frame, mi_row, mi_col); - - if (filt_err < best_err) { - best_err = filt_err; - filt_best = i; - } - } - - // If previous sb filter level has similar filtering performance as current - // best filter level, use previous level such that we can only send one bit - // to indicate current filter level is the same as the previous. 
- int threshold = 400; - - // ratio = the filtering area / a superblock size - int ratio = 1; - if (mi_row + MAX_MIB_SIZE > cm->mi_rows) { - ratio *= (cm->mi_rows - mi_row); - } else { - if (mi_row == 0) { - ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET); - } else { - ratio *= MAX_MIB_SIZE; - } - } - if (mi_col + MAX_MIB_SIZE > cm->mi_cols) { - ratio *= (cm->mi_cols - mi_col); - } else { - if (mi_col == 0) { - ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET); - } else { - ratio *= MAX_MIB_SIZE; - } - } - threshold = threshold * ratio / (MAX_MIB_SIZE * MAX_MIB_SIZE); - - const int diff = abs(estimate_err - best_err); - - const int percent_thresh = (int)((double)estimate_err * 0.01); - threshold = AOMMAX(threshold, percent_thresh); - if (diff < threshold) { - best_err = estimate_err; - filt_best = last_lvl; - } - - // Compute rdcost to determine whether to reuse previous filter lvl - if (filt_best != last_lvl) { - } - - if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); - return filt_best; -} - -#else // CONFIG_LPF_SB static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, AV1_COMP *const cpi, int filt_level, - int partial_frame -#if CONFIG_LOOPFILTER_LEVEL - , - int plane, int dir -#endif - ) { + int partial_frame, int plane, int dir) { AV1_COMMON *const cm = &cpi->common; int64_t filt_err; -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 -#if CONFIG_LOOPFILTER_LEVEL assert(plane >= 0 && plane <= 2); int filter_level[2] = { filt_level, filt_level }; if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, - filter_level[0], filter_level[1], plane, partial_frame); -#else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL + // set base filters for use of get_filter_level when in DELTA_Q_LF mode + 
switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // TODO(any): please enable multi-thread and remove the flag when loop + // filter mask is compatible with multi-thread. +#if LOOP_FILTER_BITMASK + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame); #else if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, - filt_level, 1, partial_frame, cpi->workers, + av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, - 1, partial_frame); + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame); #endif int highbd = 0; -#if CONFIG_HIGHBITDEPTH highbd = cm->use_highbitdepth; -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_LOOPFILTER_LEVEL filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane); -#else - filt_err = aom_get_sse_plane(sd, cm->frame_to_show, 0, highbd); - - // Re-instate the unfiltered frame - yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, 0); -#endif // CONFIG_LOOPFILTER_LEVEL return filt_err; } static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - int partial_frame, double *best_cost_ret -#if CONFIG_LOOPFILTER_LEVEL - , - int plane, int dir -#endif - ) { + int partial_frame, + const int *last_frame_filter_level, + double *best_cost_ret, int plane, int dir) { const AV1_COMMON *const cm = &cpi->common; - const struct loopfilter *const lf = &cm->lf; const int min_filter_level = 0; const int max_filter_level = 
av1_get_max_filter_level(cpi); int filt_direction = 0; @@ -311,39 +105,24 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int filt_best; MACROBLOCK *x = &cpi->td.mb; -// Start the search at the previous frame filter level unless it is now out of -// range. -#if CONFIG_LOOPFILTER_LEVEL + // Start the search at the previous frame filter level unless it is now out of + // range. int lvl; switch (plane) { - case 0: lvl = (dir == 1) ? lf->filter_level[1] : lf->filter_level[0]; break; - case 1: lvl = lf->filter_level_u; break; - case 2: lvl = lf->filter_level_v; break; + case 0: lvl = last_frame_filter_level[dir]; break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; default: assert(plane >= 0 && plane <= 2); return 0; } int filt_mid = clamp(lvl, min_filter_level, max_filter_level); -#else - int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); -#endif // CONFIG_LOOPFILTER_LEVEL int filter_step = filt_mid < 16 ? 
4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); - -#if CONFIG_LOOPFILTER_LEVEL yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane); -#else - // Make a copy of the unfiltered / processed recon buffer - aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); -#endif // CONFIG_LOOPFILTER_LEVEL - -#if CONFIG_LOOPFILTER_LEVEL best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); -#else - best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL filt_best = filt_mid; ss_err[filt_mid] = best_err; @@ -363,12 +142,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score if (ss_err[filt_low] < 0) { -#if CONFIG_LOOPFILTER_LEVEL ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); -#else - ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL } // If value is close to the best so far then bias towards a lower loop // filter value. 
@@ -384,12 +159,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Now look at filt_high if (filt_direction >= 0 && filt_high != filt_mid) { if (ss_err[filt_high] < 0) { -#if CONFIG_LOOPFILTER_LEVEL ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); -#else - ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL } // If value is significantly better than previous best, bias added against // raising filter value @@ -415,33 +186,36 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); return filt_best; } -#endif // CONFIG_LPF_SB void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, LPF_PICK_METHOD method) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); struct loopfilter *const lf = &cm->lf; + (void)sd; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 
0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF) { -#if CONFIG_LOOPFILTER_LEVEL lf->filter_level[0] = 0; lf->filter_level[1] = 0; -#else - lf->filter_level = 0; -#endif } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); - const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth); -// These values were determined by linear fitting the result of the -// searched level, filt_guess = q * 0.316206 + 3.87252 -#if CONFIG_HIGHBITDEPTH + const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth); + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * 0.02295 + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 int filt_guess; switch (cm->bit_depth) { case AOM_BITS_8: - filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + filt_guess = (cm->frame_type == KEY_FRAME) + ? 
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18); break; case AOM_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); @@ -455,58 +229,36 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, "or AOM_BITS_12"); return; } -#else - int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); -#endif // CONFIG_HIGHBITDEPTH - if (cm->frame_type == KEY_FRAME) filt_guess -= 4; -#if CONFIG_LOOPFILTER_LEVEL + if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); -#else - lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); -#endif + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); } else { -#if CONFIG_LPF_SB - int mi_row, mi_col; - // TODO(chengchen): init last_lvl using previous frame's info? - int last_lvl = 0; - // TODO(chengchen): if the frame size makes the last superblock very small, - // consider merge it to the previous superblock to save bits. - // Example, if frame size 1080x720, then in the last row of superblock, - // there're (FILT_BOUNDAR_OFFSET + 16) pixels. 
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { - int lvl = - search_filter_level(sd, cpi, 1, NULL, mi_row, mi_col, last_lvl); - - av1_loop_filter_sb_level_init(cm, mi_row, mi_col, lvl); - - // For the superblock at row start, its previous filter level should be - // the one above it, not the one at the end of last row - if (mi_col + MAX_MIB_SIZE >= cm->mi_cols) { - last_lvl = cm->mi_grid_visible[mi_row * cm->mi_stride]->mbmi.filt_lvl; - } else { - last_lvl = lvl; - } - } + const int last_frame_filter_level[4] = { lf->filter_level[0], + lf->filter_level[1], + lf->filter_level_u, + lf->filter_level_v }; + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 2); + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 1); + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 2, 0); } -#else // CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL - lf->filter_level[0] = lf->filter_level[1] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 2); - lf->filter_level[0] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 0); - lf->filter_level[1] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 1); - - lf->filter_level_u = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1, 0); - lf->filter_level_v = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2, 0); -#else - 
lf->filter_level = - search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB } } diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index a2262b6fc..93ea09690 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -14,7 +14,7 @@ #include #include -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -40,150 +40,156 @@ static const RestorationType force_restore_type = RESTORE_TYPES; // Number of Wiener iterations #define NUM_WIENER_ITERS 5 -typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src, - AV1_COMP *cpi, int partial_frame, - int plane, RestorationInfo *info, - RestorationType *rest_level, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame); +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 }; -static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *dst, - const AV1_COMMON *cm, int h_start, - int width, int v_start, int height, - int components_pattern) { - int64_t filt_err = 0; - (void)cm; - // Y and UV components cannot be mixed - assert(components_pattern == 1 || components_pattern == 2 || - components_pattern == 4 || components_pattern == 6); -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += - aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += - aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += - aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height); - } - return filt_err; - } -#endif // CONFIG_HIGHBITDEPTH - if 
((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += aom_get_y_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += aom_get_u_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_get_v_sse_part(src, dst, h_start, width, v_start, height); - } - return filt_err; +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); + +#define NUM_EXTRACTORS (3 * (1 + 1)) + +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, aom_highbd_get_y_sse_part, + aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part, +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); } -static int64_t sse_restoration_frame(AV1_COMMON *const cm, - const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *dst, - int components_pattern) { - int64_t filt_err = 0; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += aom_highbd_get_y_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += aom_highbd_get_u_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_highbd_get_v_sse(src, dst); - } - return filt_err; - } -#else - (void)cm; -#endif // CONFIG_HIGHBITDEPTH - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err = aom_get_y_sse(src, dst); - } - if ((components_pattern >> 
AOM_PLANE_U) & 1) { - filt_err += aom_get_u_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_get_v_sse(src, dst); - } - return filt_err; +typedef struct { + // The best coefficients for Wiener or Sgrproj restoration + WienerInfo wiener; + SgrprojInfo sgrproj; + + // The sum of squared errors for this rtype. + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // The rtype to use for this unit given a frame rtype as + // index. Indices: WIENER, SGRPROJ, SWITCHABLE. + RestorationType best_rtype[RESTORE_TYPES - 1]; +} RestUnitSearchInfo; + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_width; + int plane_height; + RestUnitSearchInfo *rusi; + + // Speed features + const SPEED_FEATURES *sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // sse and bits are initialised by reset_rsc in search_rest_type + int64_t sse; + int64_t bits; + int tile_y0, tile_stripe0; + + // sgrproj and wiener are initialised by rsc_on_tile when starting the first + // tile in the frame. + SgrprojInfo sgrproj; + WienerInfo wiener; + AV1PixelRect tile_rect; +} RestSearchCtxt; + +static void rsc_on_tile(int tile_row, int tile_col, void *priv) { + (void)tile_col; + + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_sgrproj(&rsc->sgrproj); + set_default_wiener(&rsc->wiener); + + rsc->tile_stripe0 = + (tile_row == 0) ? 
0 : rsc->cm->rst_end_stripe[tile_row - 1]; } -static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src, - AV1_COMP *const cpi, RestorationInfo *rsi, - int components_pattern, int partial_frame, - int tile_idx, - YV12_BUFFER_CONFIG *dst_frame) { - AV1_COMMON *const cm = &cpi->common; - int64_t filt_err; - int tile_width, tile_height, nhtiles, nvtiles; - int ntiles, width, height; - - // Y and UV components cannot be mixed - assert(components_pattern == 1 || components_pattern == 2 || - components_pattern == 4 || components_pattern == 6); - - if (components_pattern == 1) { // Y only - width = src->y_crop_width; - height = src->y_crop_height; - } else { // Color - width = src->uv_crop_width; - height = src->uv_crop_height; - } - ntiles = av1_get_rest_ntiles( - width, height, cm->rst_info[components_pattern > 1].restoration_tilesize, - &tile_width, &tile_height, &nhtiles, &nvtiles); - (void)ntiles; - - av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, - partial_frame, dst_frame); - RestorationTileLimits limits = av1_get_rest_tile_limits( - tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, -#if CONFIG_STRIPED_LOOP_RESTORATION - height, components_pattern > 1 ? 
cm->subsampling_y : 0); -#else - height); -#endif - filt_err = sse_restoration_tile( - src, dst_frame, cm, limits.h_start, limits.h_end - limits.h_start, - limits.v_start, limits.v_end - limits.v_start, components_pattern); - - return filt_err; +static void reset_rsc(RestSearchCtxt *rsc) { + rsc->sse = 0; + rsc->bits = 0; } -static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src, - AV1_COMP *const cpi, RestorationInfo *rsi, - int components_pattern, int partial_frame, - YV12_BUFFER_CONFIG *dst_frame) { - AV1_COMMON *const cm = &cpi->common; - int64_t filt_err; - av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, - partial_frame, dst_frame); - filt_err = sse_restoration_frame(cm, src, dst_frame, components_pattern); - return filt_err; +static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm, + const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane, + RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst, + RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->sf = sf; + + const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + const int is_uv = plane != AOM_PLANE_Y; + rsc->plane_width = src->crop_widths[is_uv]; + rsc->plane_height = src->crop_heights[is_uv]; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; + rsc->tile_rect = av1_whole_frame_rect(cm, is_uv); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + 
RestorationLineBuffers rlbs; + const int bit_depth = cm->bit_depth; + const int highbd = cm->use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = cm->frame_to_show; + // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. + const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, + is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); } static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, - int32_t *flt1, int flt1_stride, - int32_t *flt2, int flt2_stride, int *xqd) { + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { int i, j; int64_t err = 0; int xq[2]; - decode_xq(xqd, xq); + decode_xq(xqd, xq, params); if (!use_highbitdepth) { const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -191,9 +197,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, for (j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; - const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[i * src_stride + j]; @@ -203,17 +209,67 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, } else { 
const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int32_t u = - (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; - const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); - const int32_t e = - ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - - src[i * src_stride + j]; - err += e * e; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += e * e; + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += e * e; + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += e * 
e; + } + dat += dat_stride; + src += src_stride; } } } @@ -223,11 +279,12 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, #define USE_SGRPROJ_REFINEMENT_SEARCH 1 static int64_t finer_search_pixel_proj_error( const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt1, - int flt1_stride, int32_t *flt2, int flt2_stride, int start_step, int *xqd) { - int64_t err = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); (void)start_step; #if USE_SGRPROJ_REFINEMENT_SEARCH int64_t err2; @@ -235,13 +292,17 @@ static int64_t finer_search_pixel_proj_error( int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; for (int s = start_step; s >= 1; s >>= 1) { for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } int skip = 0; do { if (xqd[p] - s >= tap_min[p]) { xqd[p] -= s; - err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] += s; } else { @@ -257,9 +318,10 @@ static int64_t finer_search_pixel_proj_error( do { if (xqd[p] + s <= tap_max[p]) { xqd[p] += s; - err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + err2 = + 
get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] -= s; } else { @@ -277,10 +339,11 @@ static int64_t finer_search_pixel_proj_error( } static void get_proj_subspace(const uint8_t *src8, int width, int height, - int src_stride, uint8_t *dat8, int dat_stride, - int use_highbitdepth, int32_t *flt1, - int flt1_stride, int32_t *flt2, int flt2_stride, - int *xq) { + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { int i, j; double H[2][2] = { { 0, 0 }, { 0, 0 } }; double C[2] = { 0, 0 }; @@ -301,8 +364,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const double s = (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; - const double f1 = (double)flt1[i * flt1_stride + j] - u; - const double f2 = (double)flt2[i * flt2_stride + j] - u; + const double f1 = + (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0; + const double f2 = + (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0; H[0][0] += f1 * f1; H[1][1] += f2 * f2; H[0][1] += f1 * f2; @@ -318,8 +383,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const double s = (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; - const double f1 = (double)flt1[i * flt1_stride + j] - u; - const double f2 = (double)flt2[i * flt2_stride + j] - u; + const double f1 = + (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0; + const double f2 = + (params->r[1] > 0) ? 
(double)flt1[i * flt1_stride + j] - u : 0; H[0][0] += f1 * f1; H[1][1] += f2 * f2; H[0][1] += f1 * f2; @@ -334,99 +401,103 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; - Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]); - if (Det < 1e-8) return; // ill-posed, return default values - x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; - x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det; - xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); - xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + Det = H[1][1]; + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = 0; + x[1] = C[1] / Det; + + xq[0] = 0; + xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + Det = H[0][0]; + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = C[0] / Det; + x[1] = 0; + + xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); + xq[1] = 0; + } else { + Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]); + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; + x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det; + + xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); + xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + } } -void encode_xq(int *xq, int *xqd) { - xqd[0] = xq[0]; - xqd[0] = clamp(xqd[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); - xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1]; - xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); +void encode_xq(int *xq, int *xqd, const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], 
SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } } -static void search_selfguided_restoration(uint8_t *dat8, int width, int height, - int dat_stride, const uint8_t *src8, - int src_stride, int use_highbitdepth, - int bit_depth, int pu_width, - int pu_height, int *eps, int *xqd, - int32_t *rstbuf) { - int32_t *flt1 = rstbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; +// Apply the self-guided filter across an entire restoration unit. +static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, + int height, int dat_stride, int use_highbd, int bit_depth, + int pu_width, int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j, + flt1_row + j, flt_stride, sgr_params_idx, + bit_depth, use_highbd); + } + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, + int pu_width, int pu_height, int32_t *rstbuf) { + int32_t *flt0 = rstbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; int ep, bestep = 0; - int64_t err, besterr = -1; + int64_t besterr = -1; int exqd[2], bestxqd[2] = { 0, 0 }; - int flt1_stride = ((width + 7) & ~7) + 8; - int flt2_stride = ((width + 7) & ~7) + 8; + int flt_stride = ((width + 
7) & ~7) + 8; assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_width == RESTORATION_PROC_UNIT_SIZE); assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_height == RESTORATION_PROC_UNIT_SIZE); -#if !CONFIG_HIGHBITDEPTH - (void)bit_depth; -#endif for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { int exq[2]; -#if CONFIG_HIGHBITDEPTH - if (use_highbitdepth) { - uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - for (int i = 0; i < height; i += pu_height) - for (int j = 0; j < width; j += pu_width) { - const int w = AOMMIN(pu_width, width - j); - const int h = AOMMIN(pu_height, height - i); - uint16_t *dat_p = dat + i * dat_stride + j; - int32_t *flt1_p = flt1 + i * flt1_stride + j; - int32_t *flt2_p = flt2 + i * flt2_stride + j; -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_highbd(dat_p, w, h, dat_stride, flt1_p, - flt1_stride, sgr_params[ep].corner, - sgr_params[ep].edge); -#else - av1_selfguided_restoration_highbd( - dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth, - sgr_params[ep].r1, sgr_params[ep].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_highbd( - dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth, - sgr_params[ep].r2, sgr_params[ep].e2); - } - } else { -#endif - for (int i = 0; i < height; i += pu_height) - for (int j = 0; j < width; j += pu_width) { - const int w = AOMMIN(pu_width, width - j); - const int h = AOMMIN(pu_height, height - i); - uint8_t *dat_p = dat8 + i * dat_stride + j; - int32_t *flt1_p = flt1 + i * flt1_stride + j; - int32_t *flt2_p = flt2 + i * flt2_stride + j; -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride, - sgr_params[ep].corner, sgr_params[ep].edge); -#else - av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride, - sgr_params[ep].r1, sgr_params[ep].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p, - flt2_stride, sgr_params[ep].r2, - sgr_params[ep].e2); - } -#if 
CONFIG_HIGHBITDEPTH - } -#endif + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride); aom_clear_system_state(); + const sgr_params_type *const params = &sgr_params[ep]; get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, - use_highbitdepth, flt1, flt1_stride, flt2, flt2_stride, - exq); + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); aom_clear_system_state(); - encode_xq(exq, exqd); - err = finer_search_pixel_proj_error( + encode_xq(exq, exqd, params); + int64_t err = finer_search_pixel_proj_error( src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, - flt1, flt1_stride, flt2, flt2_stride, 2, exqd); + flt0, flt_stride, flt1, flt_stride, 2, exqd, params); if (besterr == -1 || err < besterr) { bestep = ep; besterr = err; @@ -434,273 +505,86 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height, bestxqd[1] = exqd[1]; } } - *eps = bestep; - xqd[0] = bestxqd[0]; - xqd[1] = bestxqd[1]; + + SgrprojInfo ret; + ret.ep = bestep; + ret.xqd[0] = bestxqd[0]; + ret.xqd[1] = bestxqd[1]; + return ret; } static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info) { int bits = SGRPROJ_PARAMS_BITS; - bits += aom_count_primitive_refsubexpfin( - SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, - sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); - bits += aom_count_primitive_refsubexpfin( - SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, - sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + const sgr_params_type *params = &sgr_params[sgrproj_info->ep]; + if (params->r[0] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + if (params->r[1] > 0) + 
bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); return bits; } -struct rest_search_ctxt { - const YV12_BUFFER_CONFIG *src; - AV1_COMP *cpi; - uint8_t *dgd_buffer; - const uint8_t *src_buffer; - int dgd_stride; - int src_stride; - int partial_frame; - RestorationInfo *info; - RestorationType *type; - int64_t *best_tile_cost; - int plane; - int plane_width; - int plane_height; - int nrtiles_x; - int nrtiles_y; - YV12_BUFFER_CONFIG *dst_frame; -}; - -// Fill in ctxt. Returns the number of restoration tiles for this plane -static INLINE int init_rest_search_ctxt( - const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame, struct rest_search_ctxt *ctxt) { - AV1_COMMON *const cm = &cpi->common; - ctxt->src = src; - ctxt->cpi = cpi; - ctxt->partial_frame = partial_frame; - ctxt->info = info; - ctxt->type = type; - ctxt->best_tile_cost = best_tile_cost; - ctxt->plane = plane; - ctxt->dst_frame = dst_frame; - - const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; - if (plane == AOM_PLANE_Y) { - ctxt->plane_width = src->y_crop_width; - ctxt->plane_height = src->y_crop_height; - ctxt->src_buffer = src->y_buffer; - ctxt->src_stride = src->y_stride; - ctxt->dgd_buffer = dgd->y_buffer; - ctxt->dgd_stride = dgd->y_stride; - assert(ctxt->plane_width == dgd->y_crop_width); - assert(ctxt->plane_height == dgd->y_crop_height); - assert(ctxt->plane_width == src->y_crop_width); - assert(ctxt->plane_height == src->y_crop_height); - } else { - ctxt->plane_width = src->uv_crop_width; - ctxt->plane_height = src->uv_crop_height; - ctxt->src_stride = src->uv_stride; - ctxt->dgd_stride = dgd->uv_stride; - ctxt->src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer; - ctxt->dgd_buffer = plane == AOM_PLANE_U ? 
dgd->u_buffer : dgd->v_buffer; - assert(ctxt->plane_width == dgd->uv_crop_width); - assert(ctxt->plane_height == dgd->uv_crop_height); - } - - return av1_get_rest_ntiles(ctxt->plane_width, ctxt->plane_height, - cm->rst_info[plane].restoration_tilesize, NULL, - NULL, &ctxt->nrtiles_x, &ctxt->nrtiles_y); -} - -typedef void (*rtile_visitor_t)(const struct rest_search_ctxt *search_ctxt, - int rtile_idx, - const RestorationTileLimits *limits, void *arg); - -static void foreach_rtile_in_tile(const struct rest_search_ctxt *ctxt, - int tile_row, int tile_col, - rtile_visitor_t fun, void *arg) { - const AV1_COMMON *const cm = &ctxt->cpi->common; - const RestorationInfo *rsi = ctxt->cpi->rst_search; - TileInfo tile_info; - - av1_tile_set_row(&tile_info, cm, tile_row); - av1_tile_set_col(&tile_info, cm, tile_col); - - int tile_col_start = tile_info.mi_col_start * MI_SIZE; - int tile_col_end = tile_info.mi_col_end * MI_SIZE; - int tile_row_start = tile_info.mi_row_start * MI_SIZE; - int tile_row_end = tile_info.mi_row_end * MI_SIZE; - if (ctxt->plane > 0) { - tile_col_start = ROUND_POWER_OF_TWO(tile_col_start, cm->subsampling_x); - tile_col_end = ROUND_POWER_OF_TWO(tile_col_end, cm->subsampling_x); - tile_row_start = ROUND_POWER_OF_TWO(tile_row_start, cm->subsampling_y); - tile_row_end = ROUND_POWER_OF_TWO(tile_row_end, cm->subsampling_y); - } +static void search_sgrproj(const RestorationTileLimits *limits, + const AV1PixelRect *tile, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; -#if CONFIG_FRAME_SUPERRES - // If upscaling is enabled, the tile limits need scaling to match the - // upscaled frame where the restoration tiles live. To do this, scale up the - // top-left and bottom-right of the tile. 
- if (!av1_superres_unscaled(cm)) { - av1_calculate_unscaled_superres_size(&tile_col_start, &tile_row_start, - cm->superres_scale_denominator); - av1_calculate_unscaled_superres_size(&tile_col_end, &tile_row_end, - cm->superres_scale_denominator); - // Make sure we don't fall off the bottom-right of the frame. - tile_col_end = AOMMIN(tile_col_end, ctxt->plane_width); - tile_row_end = AOMMIN(tile_row_end, ctxt->plane_height); - } -#endif // CONFIG_FRAME_SUPERRES - - const int rtile_size = rsi->restoration_tilesize; - const int rtile_col0 = (tile_col_start + rtile_size - 1) / rtile_size; - const int rtile_col1 = - AOMMIN((tile_col_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_x); - const int rtile_row0 = (tile_row_start + rtile_size - 1) / rtile_size; - const int rtile_row1 = - AOMMIN((tile_row_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_y); - - const int rtile_width = AOMMIN(tile_col_end - tile_col_start, rtile_size); - const int rtile_height = AOMMIN(tile_row_end - tile_row_start, rtile_size); - - for (int rtile_row = rtile_row0; rtile_row < rtile_row1; ++rtile_row) { - for (int rtile_col = rtile_col0; rtile_col < rtile_col1; ++rtile_col) { - const int rtile_idx = rtile_row * ctxt->nrtiles_x + rtile_col; - RestorationTileLimits limits = av1_get_rest_tile_limits( - rtile_idx, ctxt->nrtiles_x, ctxt->nrtiles_y, rtile_width, - rtile_height, ctxt->plane_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - ctxt->plane_height, ctxt->plane > 0 ? 
cm->subsampling_y : 0); -#else - ctxt->plane_height); -#endif - fun(ctxt, rtile_idx, &limits, arg); - } - } -} + const MACROBLOCK *const x = rsc->x; + const AV1_COMMON *const cm = rsc->cm; + const int highbd = cm->use_highbitdepth; + const int bit_depth = cm->bit_depth; -static void search_sgrproj_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *const x = &ctxt->cpi->td.mb; - const AV1_COMMON *const cm = &ctxt->cpi->common; - RestorationInfo *rsi = ctxt->cpi->rst_search; - SgrprojInfo *sgrproj_info = ctxt->info->sgrproj_info; - - SgrprojInfo *ref_sgrproj_info = (SgrprojInfo *)arg; - - int64_t err = - sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start, - limits->h_end - limits->h_start, limits->v_start, - limits->v_end - limits->v_start, (1 << ctxt->plane)); - // #bits when a tile is not restored - int bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0); - double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - ctxt->best_tile_cost[rtile_idx] = INT64_MAX; - - RestorationInfo *plane_rsi = &rsi[ctxt->plane]; - SgrprojInfo *rtile_sgrproj_info = &plane_rsi->sgrproj_info[rtile_idx]; uint8_t *dgd_start = - ctxt->dgd_buffer + limits->v_start * ctxt->dgd_stride + limits->h_start; + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; const uint8_t *src_start = - ctxt->src_buffer + limits->v_start * ctxt->src_stride + limits->h_start; + rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; - search_selfguided_restoration( - dgd_start, limits->h_end - limits->h_start, - limits->v_end - limits->v_start, ctxt->dgd_stride, src_start, - ctxt->src_stride, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, cm->bit_depth, -#else - 0, 8, -#endif // CONFIG_HIGHBITDEPTH - rsi[ctxt->plane].procunit_width, rsi[ctxt->plane].procunit_height, - &rtile_sgrproj_info->ep, rtile_sgrproj_info->xqd, - cm->rst_internal.tmpbuf); - 
plane_rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ; - err = try_restoration_tile(ctxt->src, ctxt->cpi, rsi, (1 << ctxt->plane), - ctxt->partial_frame, rtile_idx, ctxt->dst_frame); - bits = - count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; - bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1); - double cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err); - if (cost_sgrproj >= cost_norestore) { - ctxt->type[rtile_idx] = RESTORE_NONE; - } else { - ctxt->type[rtile_idx] = RESTORE_SGRPROJ; - *ref_sgrproj_info = sgrproj_info[rtile_idx] = - plane_rsi->sgrproj_info[rtile_idx]; - ctxt->best_tile_cost[rtile_idx] = err; - } - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; -} + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->subsampling_x; + const int ss_y = is_uv && cm->subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; -static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - struct rest_search_ctxt ctxt; - const int nrtiles = - init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type, - best_tile_cost, dst_frame, &ctxt); - - RestorationInfo *plane_rsi = &cpi->rst_search[plane]; - plane_rsi->frame_restoration_type = RESTORE_SGRPROJ; - for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) { - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; - } - - // Compute best Sgrproj filters for each rtile, one (encoder/decoder) - // tile at a time. 
- const AV1_COMMON *const cm = &cpi->common; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, - SGRPROJ_BORDER_VERT); - else -#endif - extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT); - - for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - SgrprojInfo ref_sgrproj_info; - set_default_sgrproj(&ref_sgrproj_info); - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_sgrproj_for_rtile, - &ref_sgrproj_info); - } - } - - // Cost for Sgrproj filtering - SgrprojInfo ref_sgrproj_info; - set_default_sgrproj(&ref_sgrproj_info); - SgrprojInfo *sgrproj_info = info->sgrproj_info; - - int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) { - bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, - type[rtile_idx] != RESTORE_NONE); - plane_rsi->sgrproj_info[rtile_idx] = sgrproj_info[rtile_idx]; - if (type[rtile_idx] == RESTORE_SGRPROJ) { - bits += count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], - &ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; - ref_sgrproj_info = plane_rsi->sgrproj_info[rtile_idx]; - } - plane_rsi->restoration_type[rtile_idx] = type[rtile_idx]; - } - int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, (1 << plane), - partial_frame, dst_frame); - double cost_sgrproj = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err); - return cost_sgrproj; + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf); + + RestorationUnitInfo rui; + rui.restoration_type = RESTORE_SGRPROJ; + 
rui.sgrproj_info = rusi->sgrproj; + + rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui); + + const int64_t bits_none = x->sgrproj_restore_cost[0]; + const int64_t bits_sgr = x->sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_sgr = + RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]); + if (rusi->sgrproj.ep < 10) + cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; } static double find_average(const uint8_t *src, int h_start, int h_end, @@ -758,7 +642,6 @@ static void compute_stats(int wiener_win, const uint8_t *dgd, } } -#if CONFIG_HIGHBITDEPTH static double find_average_highbd(const uint16_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; @@ -771,10 +654,10 @@ static double find_average_highbd(const uint16_t *src, int h_start, int h_end, return avg; } -static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, double *M, double *H) { +static AOM_FORCE_INLINE void compute_stats_highbd( + int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, int dgd_stride, int src_stride, + double *M, double *H) { int i, j, k, l; double Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -798,13 +681,15 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, } assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { - M[k] += Y[k] * 
X; - H[k * wiener_win2 + k] += Y[k] * Y[k]; + double Yk = Y[k]; + M[k] += Yk * X; + double *H2 = &H[k * wiener_win2]; + H2[k] += Yk * Yk; for (l = k + 1; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. - H[k * wiener_win2 + l] += Y[k] * Y[l]; + H2[l] += Yk * Y[l]; } } } @@ -815,7 +700,6 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, } } } -#endif // CONFIG_HIGHBITDEPTH static INLINE int wrap_index(int i, int wiener_win) { const int wiener_halfwin1 = (wiener_win >> 1) + 1; @@ -1059,37 +943,37 @@ static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, } #define USE_WIENER_REFINEMENT_SEARCH 1 -static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, - AV1_COMP *cpi, RestorationInfo *rsi, - int start_step, int plane, - int wiener_win, int tile_idx, - int partial_frame, - YV12_BUFFER_CONFIG *dst_frame) { +static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile, + RestorationUnitInfo *rui, + int wiener_win) { const int plane_off = (WIENER_WIN - wiener_win) >> 1; - int64_t err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); - (void)start_step; + int64_t err = try_restoration_unit(rsc, limits, tile, rui); #if USE_WIENER_REFINEMENT_SEARCH int64_t err2; int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP2_MINV }; int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; for (int s = start_step; s >= 1; s >>= 1) { for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { - if (rsi[plane].wiener_info[tile_idx].hfilter[p] - s >= tap_min[p]) { - rsi[plane].wiener_info[tile_idx].hfilter[p] -= 
s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].hfilter[p] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; @@ -1101,16 +985,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, } while (1); if (skip) break; do { - if (rsi[plane].wiener_info[tile_idx].hfilter[p] + s <= tap_max[p]) { - rsi[plane].wiener_info[tile_idx].hfilter[p] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].hfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same 
direction @@ -1123,16 +1006,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { - if (rsi[plane].wiener_info[tile_idx].vfilter[p] - s >= tap_min[p]) { - rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].vfilter[p] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; @@ -1144,16 +1026,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, } while (1); if (skip) break; do { - if (rsi[plane].wiener_info[tile_idx].vfilter[p] + s <= tap_max[p]) { - rsi[plane].wiener_info[tile_idx].vfilter[p] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p 
- 1] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same direction @@ -1169,372 +1050,264 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, return err; } -static void search_wiener_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *const x = &ctxt->cpi->td.mb; - const AV1_COMMON *const cm = &ctxt->cpi->common; - RestorationInfo *rsi = ctxt->cpi->rst_search; +static void search_wiener(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; const int wiener_win = - (ctxt->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + (rsc->plane == AOM_PLANE_Y) ? 
WIENER_WIN : WIENER_WIN_CHROMA; double M[WIENER_WIN2]; double H[WIENER_WIN2 * WIENER_WIN2]; double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; - WienerInfo *ref_wiener_info = (WienerInfo *)arg; - - int64_t err = - sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start, - limits->h_end - limits->h_start, limits->v_start, - limits->v_end - limits->v_start, (1 << ctxt->plane)); - // #bits when a tile is not restored - int bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); - double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - ctxt->best_tile_cost[rtile_idx] = INT64_MAX; - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - compute_stats_highbd(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer, + const AV1_COMMON *const cm = rsc->cm; + if (cm->use_highbitdepth) { + compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, - limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, - H); - else -#endif // CONFIG_HIGHBITDEPTH - compute_stats(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer, - limits->h_start, limits->h_end, limits->v_start, - limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, H); + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); + } else { + compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, + limits->h_end, limits->v_start, limits->v_end, + rsc->dgd_stride, rsc->src_stride, M, H); + } - ctxt->type[rtile_idx] = RESTORE_WIENER; + const MACROBLOCK *const x = rsc->x; + const int64_t bits_none = x->wiener_restore_cost[0]; if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) { - ctxt->type[rtile_idx] = RESTORE_NONE; + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; return; } - RestorationInfo *plane_rsi = &rsi[ctxt->plane]; - WienerInfo *rtile_wiener_info = &plane_rsi->wiener_info[rtile_idx]; - 
quantize_sym_filter(wiener_win, vfilterd, rtile_wiener_info->vfilter); - quantize_sym_filter(wiener_win, hfilterd, rtile_wiener_info->hfilter); + RestorationUnitInfo rui; + memset(&rui, 0, sizeof(rui)); + rui.restoration_type = RESTORE_WIENER; + quantize_sym_filter(wiener_win, vfilterd, rui.wiener_info.vfilter); + quantize_sym_filter(wiener_win, hfilterd, rui.wiener_info.hfilter); // Filter score computes the value of the function x'*A*x - x'*b for the // learned filter and compares it against identity filer. If there is no // reduction in the function, the filter is reverted back to identity - double score = compute_score(wiener_win, M, H, rtile_wiener_info->vfilter, - rtile_wiener_info->hfilter); - if (score > 0.0) { - ctxt->type[rtile_idx] = RESTORE_NONE; + if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter, + rui.wiener_info.hfilter) > 0) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; return; } + aom_clear_system_state(); - plane_rsi->restoration_type[rtile_idx] = RESTORE_WIENER; - err = finer_tile_search_wiener(ctxt->src, ctxt->cpi, rsi, 4, ctxt->plane, - wiener_win, rtile_idx, ctxt->partial_frame, - ctxt->dst_frame); + rusi->sse[RESTORE_WIENER] = + finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win); + rusi->wiener = rui.wiener_info; + if (wiener_win != WIENER_WIN) { - assert(rtile_wiener_info->vfilter[0] == 0 && - rtile_wiener_info->vfilter[WIENER_WIN - 1] == 0); - assert(rtile_wiener_info->hfilter[0] == 0 && - rtile_wiener_info->hfilter[WIENER_WIN - 1] == 0); - } - bits = count_wiener_bits(wiener_win, rtile_wiener_info, ref_wiener_info) - << AV1_PROB_COST_SHIFT; - bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); - double cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err); - if (cost_wiener >= cost_norestore) { - ctxt->type[rtile_idx] = RESTORE_NONE; - } else { - ctxt->type[rtile_idx] = RESTORE_WIENER; - 
*ref_wiener_info = ctxt->info->wiener_info[rtile_idx] = *rtile_wiener_info; - ctxt->best_tile_cost[rtile_idx] = err; + assert(rui.wiener_info.vfilter[0] == 0 && + rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); + assert(rui.wiener_info.hfilter[0] == 0 && + rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); } - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; -} -static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, RestorationInfo *info, - RestorationType *type, int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - struct rest_search_ctxt ctxt; - const int nrtiles = - init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type, - best_tile_cost, dst_frame, &ctxt); - - RestorationInfo *plane_rsi = &cpi->rst_search[plane]; - plane_rsi->frame_restoration_type = RESTORE_WIENER; - for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) { - plane_rsi->restoration_type[tile_idx] = RESTORE_NONE; - } + const int64_t bits_wiener = + x->wiener_restore_cost[1] + + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener) + << AV1_PROB_COST_SHIFT); - AV1_COMMON *const cm = &cpi->common; -// Construct a (WIENER_HALFWIN)-pixel border around the frame -// Note use this border to gather stats even though the actual filter -// may use less border on the top/bottom of a processing unit. -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN, - WIENER_HALFWIN); - else -#endif - extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN); - - // Compute best Wiener filters for each rtile, one (encoder/decoder) - // tile at a time. 
- for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - WienerInfo ref_wiener_info; - set_default_wiener(&ref_wiener_info); - - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_wiener_for_rtile, - &ref_wiener_info); - } - } + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_wiener = + RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]); - // cost for Wiener filtering - WienerInfo ref_wiener_info; - set_default_wiener(&ref_wiener_info); - int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - WienerInfo *wiener_info = info->wiener_info; - const int wiener_win = - (plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; - - for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) { - bits += - av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); - plane_rsi->wiener_info[tile_idx] = wiener_info[tile_idx]; - - if (type[tile_idx] == RESTORE_WIENER) { - bits += count_wiener_bits(wiener_win, &plane_rsi->wiener_info[tile_idx], - &ref_wiener_info) - << AV1_PROB_COST_SHIFT; - ref_wiener_info = plane_rsi->wiener_info[tile_idx]; - } - plane_rsi->restoration_type[tile_idx] = type[tile_idx]; - } - int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, 1 << plane, - partial_frame, dst_frame); - double cost_wiener = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err); + RestorationType rtype = + (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; + rusi->best_rtype[RESTORE_WIENER - 1] = rtype; - return cost_wiener; + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_wiener < cost_none) ? 
bits_wiener : bits_none; + if (cost_wiener < cost_none) rsc->wiener = rusi->wiener; } -static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - int64_t err; - double cost_norestore; - int bits; - MACROBLOCK *x = &cpi->td.mb; - AV1_COMMON *const cm = &cpi->common; - int tile_idx, tile_width, tile_height, nhtiles, nvtiles; - int width, height; - if (plane == AOM_PLANE_Y) { - width = src->y_crop_width; - height = src->y_crop_height; - } else { - width = src->uv_crop_width; - height = src->uv_crop_height; - } - const int ntiles = av1_get_rest_ntiles( - width, height, cm->rst_info[plane].restoration_tilesize, &tile_width, - &tile_height, &nhtiles, &nvtiles); - (void)info; - (void)dst_frame; - (void)partial_frame; - - info->frame_restoration_type = RESTORE_NONE; - for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - RestorationTileLimits limits = av1_get_rest_tile_limits( - tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, -#if CONFIG_STRIPED_LOOP_RESTORATION - height, plane != AOM_PLANE_Y ? 
cm->subsampling_y : 0); -#else - height); -#endif - err = sse_restoration_tile(src, cm->frame_to_show, cm, limits.h_start, - limits.h_end - limits.h_start, limits.v_start, - limits.v_end - limits.v_start, 1 << plane); - type[tile_idx] = RESTORE_NONE; - best_tile_cost[tile_idx] = err; - } - // RD cost associated with no restoration - err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); - bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT; - cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - return cost_norestore; -} +static void search_norestore(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; -struct switchable_rest_search_ctxt { - SgrprojInfo sgrproj_info; - WienerInfo wiener_info; - RestorationType *const *restore_types; - int64_t *const *tile_cost; - double cost_switchable; -}; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; -static void search_switchable_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *x = &ctxt->cpi->td.mb; - RestorationInfo *rsi = &ctxt->cpi->common.rst_info[ctxt->plane]; - struct switchable_rest_search_ctxt *swctxt = - (struct switchable_rest_search_ctxt *)arg; + const int highbd = rsc->cm->use_highbitdepth; + rusi->sse[RESTORE_NONE] = sse_restoration_unit( + limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd); + + rsc->sse += rusi->sse[RESTORE_NONE]; +} +static void search_switchable(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { (void)limits; + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = 
&rsc->rusi[rest_unit_idx]; - double best_cost = - RDCOST_DBL(x->rdmult, (x->switchable_restore_cost[RESTORE_NONE] >> 4), - swctxt->tile_cost[RESTORE_NONE][rtile_idx]); - rsi->restoration_type[rtile_idx] = RESTORE_NONE; - for (RestorationType r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) { - if (force_restore_type != RESTORE_TYPES) - if (r != force_restore_type) continue; - int tilebits = 0; - if (swctxt->restore_types[r][rtile_idx] != r) continue; - if (r == RESTORE_WIENER) - tilebits += count_wiener_bits( - (ctxt->plane == AOM_PLANE_Y ? WIENER_WIN : WIENER_WIN - 2), - &rsi->wiener_info[rtile_idx], &swctxt->wiener_info); - else if (r == RESTORE_SGRPROJ) - tilebits += count_sgrproj_bits(&rsi->sgrproj_info[rtile_idx], - &swctxt->sgrproj_info); - tilebits <<= AV1_PROB_COST_SHIFT; - tilebits += x->switchable_restore_cost[r]; - double cost = - RDCOST_DBL(x->rdmult, tilebits >> 4, swctxt->tile_cost[r][rtile_idx]); - - if (cost < best_cost) { - rsi->restoration_type[rtile_idx] = r; - best_cost = cost; + const MACROBLOCK *const x = rsc->x; + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + double best_cost = 0; + int64_t best_bits = 0; + RestorationType best_rtype = RESTORE_NONE; + + for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + // Check for the condition that wiener or sgrproj search could not + // find a solution or the solution was worse than RESTORE_NONE. + // In either case the best_rtype will be set as RESTORE_NONE. These + // should be skipped from the test below. 
+ if (r > RESTORE_NONE) { + if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue; } - } - if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) - swctxt->wiener_info = rsi->wiener_info[rtile_idx]; - else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) - swctxt->sgrproj_info = rsi->sgrproj_info[rtile_idx]; - if (force_restore_type != RESTORE_TYPES) - assert(rsi->restoration_type[rtile_idx] == force_restore_type || - rsi->restoration_type[rtile_idx] == RESTORE_NONE); - swctxt->cost_switchable += best_cost; -} -static double search_switchable_restoration( - const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane, - RestorationType *const restore_types[RESTORE_SWITCHABLE_TYPES], - int64_t *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) { - const AV1_COMMON *const cm = &cpi->common; - struct rest_search_ctxt ctxt; - init_rest_search_ctxt(src, cpi, partial_frame, plane, NULL, NULL, NULL, NULL, - &ctxt); - struct switchable_rest_search_ctxt swctxt; - swctxt.restore_types = restore_types; - swctxt.tile_cost = tile_cost; - - rsi->frame_restoration_type = RESTORE_SWITCHABLE; - int bits = frame_level_restore_bits[rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - swctxt.cost_switchable = RDCOST_DBL(cpi->td.mb.rdmult, bits >> 4, 0); - - for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - set_default_sgrproj(&swctxt.sgrproj_info); - set_default_wiener(&swctxt.wiener_info); - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, - search_switchable_for_rtile, &swctxt); + const int64_t sse = rusi->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = + count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj); + break; + default: assert(0); break; + } + const int64_t 
coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; } } - return swctxt.cost_switchable; + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + + rsc->sse += rusi->sse[best_rtype]; + rsc->bits += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj; } -void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - LPF_PICK_METHOD method) { - static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = { - search_norestore, search_wiener, search_sgrproj, +static void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable }; - AV1_COMMON *const cm = &cpi->common; - double cost_restore[RESTORE_TYPES]; - int64_t *tile_cost[RESTORE_SWITCHABLE_TYPES]; - RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES]; - double best_cost_restore; - RestorationType r, best_restore; - const int ywidth = src->y_crop_width; - const int yheight = src->y_crop_height; - const int uvwidth = src->uv_crop_width; - const int uvheight = src->uv_crop_height; - - const int ntiles_y = - av1_get_rest_ntiles(ywidth, yheight, cm->rst_info[0].restoration_tilesize, - NULL, 
NULL, NULL, NULL); - const int ntiles_uv = av1_get_rest_ntiles( - uvwidth, uvheight, cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, - NULL); - - // Assume ntiles_uv is never larger that ntiles_y and so the same arrays work. - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { - tile_cost[r] = (int64_t *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y); - restore_types[r] = - (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles_y); - } - for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) { - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { - cost_restore[r] = DBL_MAX; - if (force_restore_type != RESTORE_TYPES) - if (r != RESTORE_NONE && r != force_restore_type) continue; - cost_restore[r] = - search_restore_fun[r](src, cpi, method == LPF_PICK_FROM_SUBIMAGE, - plane, &cm->rst_info[plane], restore_types[r], - tile_cost[r], &cpi->trial_frame_rst); - } - if (plane == AOM_PLANE_Y) - cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration( - src, cpi, method == LPF_PICK_FROM_SUBIMAGE, plane, restore_types, - tile_cost, &cm->rst_info[plane]); - else - cost_restore[RESTORE_SWITCHABLE] = DBL_MAX; - best_cost_restore = DBL_MAX; - best_restore = 0; - for (r = 0; r < RESTORE_TYPES; ++r) { - if (force_restore_type != RESTORE_TYPES) - if (r != RESTORE_NONE && r != force_restore_type) continue; - if (cost_restore[r] < best_cost_restore) { - best_restore = r; - best_cost_restore = cost_restore[r]; + reset_rsc(rsc); + rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc); + av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, + &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); + return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse); +} + +static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + return rsi->units_per_tile; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = 
av1_num_planes(cm); + assert(!cm->all_lossless); + + int ntiles[2]; + for (int is_uv = 0; is_uv < 2; ++is_uv) + ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv); + + assert(ntiles[1] <= ntiles[0]); + RestUnitSearchInfo *rusi = + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + + RestSearchCtxt rsc; + const int plane_start = AOM_PLANE_Y; + const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y; + for (int plane = plane_start; plane <= plane_end; ++plane) { + init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi, + &cpi->trial_frame_rst, &rsc); + + const int plane_ntiles = ntiles[plane > 0]; + const RestorationType num_rtypes = + (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + double best_cost = 0; + RestorationType best_rtype = RESTORE_NONE; + + const int highbd = rsc.cm->use_highbitdepth; + extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, + rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, + highbd); + + for (RestorationType r = 0; r < num_rtypes; ++r) { + if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && + (r != force_restore_type)) + continue; + + double cost = search_rest_type(&rsc, r); + + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_rtype = r; } } - cm->rst_info[plane].frame_restoration_type = best_restore; + + cm->rst_info[plane].frame_restoration_type = best_rtype; if (force_restore_type != RESTORE_TYPES) - assert(best_restore == force_restore_type || - best_restore == RESTORE_NONE); - if (best_restore != RESTORE_SWITCHABLE) { - const int nt = (plane == AOM_PLANE_Y ? 
ntiles_y : ntiles_uv); - memcpy(cm->rst_info[plane].restoration_type, restore_types[best_restore], - nt * sizeof(restore_types[best_restore][0])); + assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE); + + if (best_rtype != RESTORE_NONE) { + for (int u = 0; u < plane_ntiles; ++u) { + copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]); + } } } - /* - printf("Frame %d/%d restore types: %d %d %d\n", cm->current_video_frame, - cm->show_frame, cm->rst_info[0].frame_restoration_type, - cm->rst_info[1].frame_restoration_type, - cm->rst_info[2].frame_restoration_type); - printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n", - cm->current_video_frame, cm->show_frame, - cm->rst_info[0].frame_restoration_type, cost_restore[0], - cost_restore[1], cost_restore[2], cost_restore[3]); - */ - - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { - aom_free(tile_cost[r]); - aom_free(restore_types[r]); - } + + aom_free(rusi); } diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h index f6096ed1d..179b89ff9 100644 --- a/third_party/aom/av1/encoder/pickrst.h +++ b/third_party/aom/av1/encoder/pickrst.h @@ -20,8 +20,7 @@ extern "C" { struct yv12_buffer_config; struct AV1_COMP; -void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - LPF_PICK_METHOD method); +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h new file mode 100644 index 000000000..ef333b6d8 --- /dev/null +++ b/third_party/aom/av1/encoder/pustats.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_PUSTATS_H_ +#define AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES 20 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 10 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES * + HIDDEN_LAYERS_0_NODES] = { + 13.8498f, 19.6630f, 13.3036f, 5.2448f, -18.0270f, 21.6671f, + -0.2135f, -0.0060f, 0.1211f, -0.3549f, -0.3550f, 0.0190f, + 0.0167f, -0.1192f, 0.2003f, 8.6663f, 32.0264f, 9.9558f, + 9.0935f, -110.4994f, 51.8056f, 64.8041f, 58.5392f, 53.0189f, + -61.6300f, 4.7540f, -0.0140f, 0.0185f, -15.8050f, 0.0790f, + 0.0707f, 0.0784f, 0.0766f, -0.3030f, 0.0392f, 49.3312f, + 63.3326f, 61.4025f, 54.2723f, -62.2769f, -147.1736f, -84.9432f, + -82.5422f, -70.4857f, 46.7622f, -1.0285f, -0.4809f, 0.0068f, + 1.0888f, -0.0515f, -0.0384f, -0.0232f, -0.0396f, 0.2429f, + 0.2040f, -144.4016f, -88.0868f, -80.3134f, -70.6685f, 66.8528f, + -53.8097f, -45.4011f, -52.8680f, -58.7226f, 99.7830f, 2.3728f, + 0.0229f, 0.0002f, -0.3288f, -0.0563f, -0.0550f, -0.0552f, + -0.0563f, 0.2214f, 0.0139f, -60.8965f, -45.5251f, -50.4188f, + -51.5623f, 85.7369f, 77.3415f, 47.4930f, 53.8120f, 58.2311f, + -45.9650f, -2.4938f, 0.1639f, -0.5270f, -75.4622f, -0.0026f, + 0.0031f, 0.0047f, 0.0015f, 0.0092f, 0.0654f, 75.6402f, + 54.7447f, 54.8156f, 52.6834f, -9.1246f, -34.0108f, -35.6423f, + -34.2911f, -38.5444f, 72.1123f, 10.9750f, -0.1595f, 0.1983f, + 22.5724f, -0.0556f, -0.0618f, -0.0571f, -0.0608f, 0.2439f, + -0.0805f, -32.5107f, -28.9688f, -33.7284f, -48.1365f, 61.5297f, + 39.2492f, -35.1928f, 
-11.5000f, 7.7038f, -94.2469f, 13.5586f, + 0.7541f, 0.0105f, 4.4041f, 0.1799f, 0.1339f, 0.1567f, + -0.6668f, -0.7384f, 0.2185f, 17.1700f, -26.4601f, -1.8970f, + 38.9635f, -30.1916f, 31.8139f, 14.6157f, 10.0565f, 3.3340f, + -40.6985f, -2.1186f, 0.0116f, 0.0962f, 0.7115f, -1.4071f, + -1.3701f, -1.4728f, -1.3404f, -1.7286f, 5.5632f, 28.4998f, + 5.4087f, 16.2668f, 11.8693f, -39.4153f, 106.3281f, 38.3075f, + 39.4933f, 47.3805f, -15.0514f, -21.2421f, -0.2358f, -0.0024f, + 0.3505f, -0.0429f, -0.0377f, -0.0322f, -0.0344f, 0.2020f, + 0.1417f, 99.6711f, 35.3896f, 43.1117f, 59.8879f, -17.8250f, + -16.6976f, 18.5100f, 6.3383f, 25.3020f, -55.8824f, 25.1027f, + -0.9926f, -0.0738f, -1.4892f, 0.0269f, -0.0051f, -5.8168f, + -0.0579f, -0.1500f, 0.7224f, 8.3066f, -3.8805f, -12.1482f, + 14.3492f, -20.8118f, + }; + +static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = + { + 17.6566f, 62.2217f, -107.2644f, -56.2255f, 68.2252f, + -37.5662f, 9.587f, 18.5206f, 69.6873f, 4.3903f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.0494f, 0.3505f, -0.0461f, -1.3451f, 0.0198f, -0.0746f, -0.2217f, + -0.9525f, 0.0633f, -0.0737f, -0.3568f, 1.8569f, -0.0189f, -1.8269f, + 0.6281f, -1.3266f, -0.9202f, 2.8978f, -0.6437f, -0.8709f, -1.5066f, + -1.0582f, -1.9509f, -0.0417f, -0.1315f, -0.3368f, 0.0014f, -0.5734f, + -1.4640f, -1.6042f, 3.3911f, -1.6815f, -1.9026f, -4.8702f, -0.1012f, + -1.4517f, -3.2156f, 0.8448f, 0.2331f, -0.1593f, 2.6627f, -0.8451f, + -1.7382f, 0.9303f, 2.3003f, -0.0659f, 0.5772f, 0.4253f, 0.2083f, + 0.3649f, -0.9198f, -0.2183f, -0.5381f, -1.0831f, 2.0359f, 0.0040f, + -0.0871f, -0.1715f, 2.2453f, 0.5099f, -0.5900f, -0.6313f, -1.3028f, + -1.7257f, 1.4130f, -0.7189f, -0.4336f, 1.9266f, 1.7495f, -0.3321f, + 0.2827f, 0.4015f, -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f, + -3.1263f, 0.7485f, -0.3161f, -0.2224f, 2.5533f, -0.2121f, -1.3389f, + 0.5556f, -0.9407f, -0.7456f, 1.4137f, 
-0.0353f, -0.0521f, 2.4382f, + 0.1493f, -11.5631f, -1.6178f, 3.5538f, -3.6538f, -0.5972f, -3.0038f, + -2.1640f, 0.5754f, + }; + +static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = + { + 69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f, + 58.4472f, 36.2223f, 66.327f, 50.8867f, 2.8306f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 1.811f, 0.9009f, 0.0694f, -0.9985f, -0.039f, + 0.2076f, 0.5643f, 0.5408f, 0.6071f, 0.277f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 39.5529f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES * + HIDDEN_LAYERS_0_NODES] = { + -39.0787f, -212.9998f, -174.2088f, -264.1454f, 292.7151f, -60.8750f, + -5.9915f, 0.0712f, -60.2312f, -0.2020f, -0.2135f, -0.1663f, + -0.0711f, 0.2267f, 0.9152f, -36.1294f, -159.9320f, -222.9809f, + -270.2556f, 300.7162f, 159.9224f, -172.5735f, -7.6852f, 54.3985f, + 110.6721f, 19.2907f, -15.1039f, -0.0457f, 0.3289f, 0.4529f, + -8.2222f, 1.3213f, -0.8378f, -0.2605f, 3.9600f, 17.3407f, + 113.1116f, 34.6326f, 11.6688f, 109.3541f, 240.8123f, 45.0615f, + 80.7443f, 39.2500f, -21.0931f, -27.1989f, -0.4264f, -0.1345f, + 1.6269f, -0.0716f, 0.0989f, -0.1382f, 0.0248f, 0.0913f, + 4.3903f, 244.1014f, 32.2567f, 58.6171f, 62.2273f, -2.8647f, + -227.5659f, 16.0031f, -70.5256f, 23.8071f, 290.7356f, 13.6094f, + -2.1842f, 0.0104f, -2.8760f, 0.3708f, 0.8501f, -3.2964f, + -0.2088f, -0.4474f, 1.2248f, 40.5180f, -130.7891f, 
-188.1583f, + -174.0906f, 205.9622f, 0.3425f, 0.2531f, 0.2822f, 0.0488f, + 0.1416f, -0.0433f, -0.1195f, -0.0413f, -0.0708f, -0.0787f, + -0.0889f, -0.4022f, -0.5055f, -0.4715f, 0.2315f, 0.1021f, + -0.3676f, -0.3499f, -0.0715f, 0.1913f, 205.7521f, 125.2265f, + 92.0640f, 77.5566f, -164.4280f, -19.3715f, -0.1346f, -0.4060f, + 0.5042f, -0.2395f, -0.1329f, -0.1397f, 0.2175f, 0.2895f, + 5.5019f, 198.9799f, 114.0018f, 94.9015f, 86.8434f, -183.4237f, + 121.5626f, 94.8945f, 65.0803f, 93.6487f, -346.5279f, -47.6168f, + 0.0633f, 0.0135f, -0.0692f, -0.1015f, -0.1146f, -0.1341f, + -0.1175f, 0.4186f, 0.1505f, 130.7402f, 107.8443f, 62.8497f, + 65.3501f, -312.7407f, 282.8321f, 98.1531f, 75.6648f, 25.8733f, + -176.9298f, -37.2695f, -0.3760f, 0.0017f, 0.1030f, -0.1483f, + 0.0787f, -0.0962f, 0.4109f, -0.2292f, 9.1681f, 274.3607f, + 60.9538f, 75.9405f, 68.3776f, -167.3098f, -335.1045f, -69.2583f, + -76.3441f, -16.5793f, 218.5244f, 28.2405f, 0.9169f, -0.0026f, + -0.8077f, -1.5756f, -0.0804f, 0.1404f, 1.2656f, 0.0272f, + -0.2529f, -340.8659f, -112.7778f, -58.3890f, -4.1224f, 108.1709f, + -180.7382f, -93.7114f, -77.8686f, -131.8134f, 353.3893f, 4.8233f, + 0.0205f, 0.0000f, -1.1654f, -0.0161f, -0.0255f, -0.0358f, + -0.0412f, 0.1103f, 0.1041f, -188.9934f, -110.1792f, -88.6301f, + -93.7226f, 336.9746f, + }; + +static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = + { -175.6918f, 43.4519f, 154.196f, -81.1015f, -0.0758f, + 136.5695f, 110.8713f, 142.029f, -153.0901f, -145.2688f }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.1727f, -0.2859f, -0.3757f, -0.4260f, -0.5441f, -0.0666f, -0.3792f, + -0.1335f, -0.1521f, -0.0821f, -3.1590f, 0.2711f, 0.5889f, 0.0878f, + 0.4693f, 0.7773f, -9.2989f, 0.0414f, 0.4485f, 22.8958f, -3.7024f, + -2.4672f, -43.2908f, 0.0956f, 0.4431f, 2.3429f, 1.7183f, 0.3985f, + -0.2275f, -3.1583f, -0.3485f, 0.3280f, 0.3763f, 0.2069f, 0.4231f, + 0.7366f, -6.9527f, 0.0713f, 
0.1359f, 16.6500f, -1.7655f, -0.1651f, + 0.1280f, -0.2678f, -0.2120f, 1.6243f, 1.8773f, -0.7543f, -0.3292f, + -0.7627f, -0.2001f, -0.1125f, -0.8100f, -0.1866f, 0.0567f, -0.4002f, + 3.2429f, 0.6427f, -0.3759f, -11.6518f, -2.2893f, 0.7708f, -1.8637f, + 1.7148f, 0.3124f, -0.7129f, -0.4927f, 0.1964f, -0.2570f, -25.0783f, + 2.5061f, 0.1457f, -1.1239f, 0.0570f, -0.2526f, -0.0669f, 0.6791f, + 1.1531f, -0.7246f, -0.3180f, -0.0015f, -0.0061f, -0.1626f, -0.0181f, + 0.1271f, -0.0140f, -0.6027f, 0.0736f, -0.0157f, 1.2420f, -6.4055f, + 0.2128f, -0.0386f, 0.3446f, 0.1840f, -0.7208f, -1.6979f, -0.0442f, + 0.3230f, -1.9745f, + }; + +static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = + { 0.f, 70.3414f, 9.6036f, -118.1096f, 49.2507f, + 95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f }; + +static const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.3627f, 1.2272f, 0.2201f, -1.7406f, -0.6885f, + 0.8487f, -0.2761f, 0.7731f, -5.2096f, -0.7351f, + }; + +static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { + 48.2331f, +}; + +static const NN_CONFIG av1_pustats_dist_nnconfig = { + NUM_FEATURES, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_dist_hiddenlayer_0_kernel, + av1_pustats_dist_hiddenlayer_1_kernel, + av1_pustats_dist_logits_kernel, + }, + { + av1_pustats_dist_hiddenlayer_0_bias, + av1_pustats_dist_hiddenlayer_1_bias, + av1_pustats_dist_logits_bias, + }, +}; + +#undef NUM_FEATURES +#undef NUM_HIDDEN_LAYERS +#undef HIDDEN_LAYERS_0_NODES +#undef HIDDEN_LAYERS_1_NODES +#undef LOGITS_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_PUSTATS_H_ diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c deleted file mode 100644 index 9d5133012..000000000 --- a/third_party/aom/av1/encoder/pvq_encoder.c +++ 
/dev/null @@ -1,988 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include "aom_dsp/entcode.h" -#include "aom_dsp/entenc.h" -#include "av1/common/blockd.h" -#include "av1/common/odintrin.h" -#include "av1/common/partition.h" -#include "av1/common/pvq_state.h" -#include "av1/encoder/encodemb.h" -#include "av1/encoder/pvq_encoder.h" -#include "aom_ports/system_state.h" - -/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the - dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/ -#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0) - -void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs) { - if (cdf[0] == 0) - aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs)); - aom_write_symbol(w, symb, cdf, nsymbs); -} - -static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt, - const od_coeff *in, int n, int k) { - int i; - aom_encode_band_pvq_splits(w, adapt, in, n, k, 0); - for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0); -} - -/* Computes 1/sqrt(i) using a table for small values. 
*/ -static double od_rsqrt_table(int i) { - static double table[16] = { - 1.000000, 0.707107, 0.577350, 0.500000, - 0.447214, 0.408248, 0.377964, 0.353553, - 0.333333, 0.316228, 0.301511, 0.288675, - 0.277350, 0.267261, 0.258199, 0.250000}; - if (i <= 16) return table[i-1]; - else return 1./sqrt(i); -} - -/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results - where 0 <= i < table_size.*/ -static double od_custom_rsqrt_dynamic_table(const double* table, - const int table_size, const double start, const int i) { - if (i < table_size) return table[i]; - else return od_rsqrt_table((int)(start + 2*i + 1)); -} - -/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/ -static void od_fill_dynamic_rsqrt_table(double *table, const int table_size, - const double start) { - int i; - for (i = 0; i < table_size; i++) - table[i] = od_rsqrt_table((int)(start + 2*i + 1)); -} - -/** Find the codepoint on the given PSphere closest to the desired - * vector. Double-precision PVQ search just to make sure our tests - * aren't limited by numerical accuracy. - * - * @param [in] xcoeff input vector to quantize (x in the math doc) - * @param [in] n number of dimensions - * @param [in] k number of pulses - * @param [out] ypulse optimal codevector found (y in the math doc) - * @param [out] g2 multiplier for the distortion (typically squared - * gain units) - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @param [in] prev_k number of pulses already in ypulse that we should - * reuse for the search (or 0 for a new search) - * @return cosine distance between x and y (between 0 and 1) - */ -double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k, - od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) { - int i, j; - double xy; - double yy; - /* TODO - This blows our 8kB stack space budget and should be fixed when - converting PVQ to fixed point. 
*/ - double x[MAXN]; - double xx; - double lambda; - double norm_1; - int rdo_pulses; - double delta_rate; - xx = xy = yy = 0; - for (j = 0; j < n; j++) { - x[j] = fabs((float)xcoeff[j]); - xx += x[j]*x[j]; - } - norm_1 = 1./sqrt(1e-30 + xx); - lambda = pvq_norm_lambda/(1e-30 + g2); - i = 0; - if (prev_k > 0 && prev_k <= k) { - /* We reuse pulses from a previous search so we don't have to search them - again. */ - for (j = 0; j < n; j++) { - ypulse[j] = abs(ypulse[j]); - xy += x[j]*ypulse[j]; - yy += ypulse[j]*ypulse[j]; - i += ypulse[j]; - } - } - else if (k > 2) { - double l1_norm; - double l1_inv; - l1_norm = 0; - for (j = 0; j < n; j++) l1_norm += x[j]; - l1_inv = 1./OD_MAXF(l1_norm, 1e-100); - for (j = 0; j < n; j++) { - double tmp; - tmp = k*x[j]*l1_inv; - ypulse[j] = OD_MAXI(0, (int)floor(tmp)); - xy += x[j]*ypulse[j]; - yy += ypulse[j]*ypulse[j]; - i += ypulse[j]; - } - } - else OD_CLEAR(ypulse, n); - - /* Only use RDO on the last few pulses. This not only saves CPU, but using - RDO on all pulses actually makes the results worse for reasons I don't - fully understand. */ - rdo_pulses = 1 + k/4; - /* Rough assumption for now, the last position costs about 3 bits more than - the first. */ - delta_rate = 3./n; - /* Search one pulse at a time */ - for (; i < k - rdo_pulses; i++) { - int pos; - double best_xy; - double best_yy; - pos = 0; - best_xy = -10; - best_yy = 1; - for (j = 0; j < n; j++) { - double tmp_xy; - double tmp_yy; - tmp_xy = xy + x[j]; - tmp_yy = yy + 2*ypulse[j] + 1; - tmp_xy *= tmp_xy; - if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { - best_xy = tmp_xy; - best_yy = tmp_yy; - pos = j; - } - } - xy = xy + x[pos]; - yy = yy + 2*ypulse[pos] + 1; - ypulse[pos]++; - } - /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2 - and since x^2 and y^2 are constant, we just maximize x*y, plus a - lambda*rate term. Note that since x and y aren't normalized here, - we need to divide by sqrt(x^2)*sqrt(y^2). 
*/ - for (; i < k; i++) { - double rsqrt_table[4]; - int rsqrt_table_size = 4; - int pos; - double best_cost; - pos = 0; - best_cost = -1e5; - /*Fill the small rsqrt lookup table with inputs relative to yy. - Specifically, the table of n values is filled with - rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/ - od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy); - for (j = 0; j < n; j++) { - double tmp_xy; - double tmp_yy; - tmp_xy = xy + x[j]; - /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/ - tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size, - yy, ypulse[j]); - tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate; - if (j == 0 || tmp_xy > best_cost) { - best_cost = tmp_xy; - pos = j; - } - } - xy = xy + x[pos]; - yy = yy + 2*ypulse[pos] + 1; - ypulse[pos]++; - } - for (i = 0; i < n; i++) { - if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; - } - return xy/(1e-100 + sqrt(xx*yy)); -} - -/** Encodes the gain so that the return value increases with the - * distance |x-ref|, so that we can encode a zero when x=ref. The - * value x=0 is not covered because it is only allowed in the noref - * case. - * - * @param [in] x quantized gain to encode - * @param [in] ref quantized gain of the reference - * @return interleave-encoded quantized gain value - */ -static int neg_interleave(int x, int ref) { - if (x < ref) return -2*(x - ref) - 1; - else if (x < 2*ref) return 2*(x - ref); - else return x-1; -} - -int od_vector_is_null(const od_coeff *x, int len) { - int i; - for (i = 0; i < len; i++) if (x[i]) return 0; - return 1; -} - -static double od_pvq_rate(int qg, int icgr, int theta, int ts, - const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) { - double rate; - if (k == 0) rate = 0; - else if (speed > 0) { - int i; - int sum; - double f; - /* Compute "center of mass" of the pulse vector. 
*/ - sum = 0; - for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]); - f = sum/(double)(k*n); - /* Estimates the number of bits it will cost to encode K pulses in - N dimensions based on hand-tuned fit for bitrate vs K, N and - "center of mass". */ - rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3; - } - else { - aom_writer w; - od_pvq_codeword_ctx cd; - int tell; -#if !CONFIG_ANS - od_ec_enc_init(&w.ec, 1000); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1); -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&w.ec); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k); -#if !CONFIG_ANS - rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.; - od_ec_enc_clear(&w.ec); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - } - if (qg > 0 && theta >= 0) { - /* Approximate cost of entropy-coding theta */ - rate += .9*OD_LOG2(ts); - if (qg == icgr) rate -= .5; - } - return rate; -} - -#define MAX_PVQ_ITEMS (20) -/* This stores the information about a PVQ search candidate, so we can sort - based on K. */ -typedef struct { - int gain; - int k; - od_val32 qtheta; - int theta; - int ts; - od_val32 qcg; -} pvq_search_item; - -int items_compare(pvq_search_item *a, pvq_search_item *b) { - /* Break ties in K with gain to ensure a stable sort. - Otherwise, the order depends on qsort implementation. */ - return a->k == b->k ? a->gain - b->gain : a->k - b->k; -} - -/** Perform PVQ quantization with prediction, trying several - * possible gains and angles. See draft-valin-videocodec-pvq and - * http://jmvalin.ca/slides/pvq.pdf for more details. 
- * - * @param [out] out coefficients after quantization - * @param [in] x0 coefficients before quantization - * @param [in] r0 reference, aka predicted coefficients - * @param [in] n number of dimensions - * @param [in] q0 quantization step size - * @param [out] y pulse vector (i.e. selected PVQ codevector) - * @param [out] itheta angle between input and reference (-1 if noref) - * @param [out] vk total number of pulses - * @param [in] beta per-band activity masking beta param - * @param [out] skip_diff distortion cost of skipping this block - * (accumulated) - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] pli plane index - * @param [in] adapt probability adaptation context - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @param [in] speed Make search faster by making approximations - * @return gain index of the quatized gain -*/ -static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0, - int n, int q0, od_coeff *y, int *itheta, int *vk, - od_val16 beta, double *skip_diff, int is_keyframe, int pli, - const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv, - double pvq_norm_lambda, int speed) { - od_val32 g; - od_val32 gr; - od_coeff y_tmp[MAXN + 3]; - int i; - /* Number of pulses. */ - int k; - /* Companded gain of x and reference, normalized to q. */ - od_val32 cg; - od_val32 cgr; - int icgr; - int qg; - /* Best RDO cost (D + lamdba*R) so far. */ - double best_cost; - double dist0; - /* Distortion (D) that corresponds to the best RDO cost. */ - double best_dist; - double dist; - /* Sign of Householder reflection. */ - int s; - /* Dimension on which Householder reflects. 
*/ - int m; - od_val32 theta; - double corr; - int best_k; - od_val32 best_qtheta; - od_val32 gain_offset; - int noref; - double skip_dist; - int cfl_enabled; - int skip; - double gain_weight; - od_val16 x16[MAXN]; - od_val16 r16[MAXN]; - int xshift; - int rshift; - /* Give more weight to gain error when calculating the total distortion. */ - gain_weight = 1.0; - OD_ASSERT(n > 1); - corr = 0; -#if !defined(OD_FLOAT_PVQ) - /* Shift needed to make x fit in 16 bits even after rotation. - This shift value is not normative (it can be changed without breaking - the bitstream) */ - xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15); - /* Shift needed to make the reference fit in 15 bits, so that the Householder - vector can fit in 16 bits. - This shift value *is* normative, and has to match the decoder. */ - rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14); -#else - xshift = 0; - rshift = 0; -#endif - for (i = 0; i < n; i++) { -#if defined(OD_FLOAT_PVQ) - /*This is slightly different from the original float PVQ code, - where the qm was applied in the accumulation in od_pvq_compute_gain and - the vectors were od_coeffs, not od_val16 (i.e. double).*/ - x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1; - r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1; -#else - x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift); - r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift); -#endif - corr += OD_MULT16_16(x16[i], r16[i]); - } - cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL; - cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift); - cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift); - if (cfl_enabled) cgr = OD_CGAIN_SCALE; - /* gain_offset is meant to make sure one of the quantized gains has - exactly the same gain as the reference. */ -#if defined(OD_FLOAT_PVQ) - icgr = (int)floor(.5 + cgr); -#else - icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT); -#endif - gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT); - /* Start search with null case: gain=0, no pulse. 
*/ - qg = 0; - dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; - best_dist = dist; - best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0, - n, speed); - noref = 1; - best_k = 0; - *itheta = -1; - OD_CLEAR(y, n); - best_qtheta = 0; - m = 0; - s = 1; - corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift)); - corr = OD_MAXF(OD_MINF(corr, 1.), -1.); - if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; - else { - skip_dist = gain_weight*(cg - cgr)*(cg - cgr) - + cgr*(double)cg*(2 - 2*corr); - skip_dist *= OD_CGAIN_SCALE_2; - } - if (!is_keyframe) { - /* noref, gain=0 isn't allowed, but skip is allowed. */ - od_val32 scgr; - scgr = OD_MAXF(0,gain_offset); - if (icgr == 0) { - best_dist = gain_weight*(cg - scgr)*(cg - scgr) - + scgr*(double)cg*(2 - 2*corr); - best_dist *= OD_CGAIN_SCALE_2; - } - best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt, - NULL, 0, n, speed); - best_qtheta = 0; - *itheta = 0; - noref = 0; - } - dist0 = best_dist; - if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) { - od_val16 xr[MAXN]; - int gain_bound; - int prev_k; - pvq_search_item items[MAX_PVQ_ITEMS]; - int idx; - int nitems; - double cos_dist; - idx = 0; - gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT); - /* Perform theta search only if prediction is useful. */ - theta = OD_ROUND32(OD_THETA_SCALE*acos(corr)); - m = od_compute_householder(r16, n, gr, &s, rshift); - od_apply_householder(xr, x16, r16, n); - prev_k = 0; - for (i = m; i < n - 1; i++) xr[i] = xr[i + 1]; - /* Compute all candidate PVQ searches within a reasonable range of gain - and theta. 
*/ - for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) { - int j; - od_val32 qcg; - int ts; - int theta_lower; - int theta_upper; - /* Quantized companded gain */ - qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset; - /* Set angular resolution (in ra) to match the encoded gain */ - ts = od_pvq_compute_max_theta(qcg, beta); - theta_lower = OD_MAXI(0, (int)floor(.5 + - theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2); - theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts)); - /* Include the angles within a reasonable range. */ - for (j = theta_lower; j <= theta_upper; j++) { - od_val32 qtheta; - qtheta = od_pvq_compute_theta(j, ts); - k = od_pvq_compute_k(qcg, j, 0, n, beta); - items[idx].gain = i; - items[idx].theta = j; - items[idx].k = k; - items[idx].qcg = qcg; - items[idx].qtheta = qtheta; - items[idx].ts = ts; - idx++; - OD_ASSERT(idx < MAX_PVQ_ITEMS); - } - } - nitems = idx; - cos_dist = 0; - /* Sort PVQ search candidates in ascending order of pulses K so that - we can reuse all the previously searched pulses across searches. */ - qsort(items, nitems, sizeof(items[0]), - (int (*)(const void *, const void *))items_compare); - /* Search for the best gain/theta in order. */ - for (idx = 0; idx < nitems; idx++) { - int j; - od_val32 qcg; - int ts; - double cost; - double dist_theta; - double sin_prod; - od_val32 qtheta; - /* Quantized companded gain */ - qcg = items[idx].qcg; - i = items[idx].gain; - j = items[idx].theta; - /* Set angular resolution (in ra) to match the encoded gain */ - ts = items[idx].ts; - /* Search for the best angle within a reasonable range. */ - qtheta = items[idx].qtheta; - k = items[idx].k; - /* Compute the minimal possible distortion by not taking the PVQ - cos_dist into account. 
*/ - dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1; - dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; - dist *= OD_CGAIN_SCALE_2; - /* If we have no hope of beating skip (including a 1-bit worst-case - penalty), stop now. */ - if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue; - sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)* - OD_TRIG_SCALE_1; - /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since - that's the factor by which cos_dist is multiplied to get the - distortion metric. */ - if (k == 0) { - cos_dist = 0; - OD_CLEAR(y_tmp, n-1); - } - else if (k != prev_k) { - cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp, - qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); - } - prev_k = k; - /* See Jmspeex' Journal of Dubious Theoretical Results. */ - dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1 - + sin_prod*(2 - 2*cos_dist); - dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; - dist *= OD_CGAIN_SCALE_2; - /* Do approximate RDO. */ - cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp, - k, n, speed); - if (cost < best_cost) { - best_cost = cost; - best_dist = dist; - qg = i; - best_k = k; - best_qtheta = qtheta; - *itheta = j; - noref = 0; - OD_COPY(y, y_tmp, n - 1); - } - } - } - /* Don't bother with no-reference version if there's a reasonable - correlation. */ - if (n <= OD_MAX_PVQ_SIZE && (corr < .5 - || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) { - int gain_bound; - int prev_k; - gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT); - prev_k = 0; - /* Search for the best gain (haven't determined reasonable range yet). */ - for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) { - double cos_dist; - double cost; - od_val32 qcg; - qcg = OD_SHL(i, OD_CGAIN_SHIFT); - k = od_pvq_compute_k(qcg, -1, 1, n, beta); - /* Compute the minimal possible distortion by not taking the PVQ - cos_dist into account. 
*/ - dist = gain_weight*(qcg - cg)*(qcg - cg); - dist *= OD_CGAIN_SCALE_2; - if (dist > dist0 && k != 0) continue; - cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp, - qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); - prev_k = k; - /* See Jmspeex' Journal of Dubious Theoretical Results. */ - dist = gain_weight*(qcg - cg)*(qcg - cg) - + qcg*(double)cg*(2 - 2*cos_dist); - dist *= OD_CGAIN_SCALE_2; - /* Do approximate RDO. */ - cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k, - n, speed); - if (cost <= best_cost) { - best_cost = cost; - best_dist = dist; - qg = i; - noref = 1; - best_k = k; - *itheta = -1; - OD_COPY(y, y_tmp, n); - } - } - } - k = best_k; - theta = best_qtheta; - skip = 0; - if (noref) { - if (qg == 0) skip = OD_PVQ_SKIP_ZERO; - } - else { - if (!is_keyframe && qg == 0) { - skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY); - } - if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY; - } - /* Synthesize like the decoder would. */ - if (skip) { - if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n); - else OD_CLEAR(out, n); - } - else { - if (noref) gain_offset = 0; - g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta); - od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s, - qm_inv); - } - *vk = k; - *skip_diff += skip_dist - best_dist; - /* Encode gain differently depending on whether we use prediction or not. - Special encoding on inter frames where qg=0 is allowed for noref=0 - but not noref=1.*/ - if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr); - else return noref ? 
qg - 1 : neg_interleave(qg + 1, icgr + 1); -} - -/** Encodes a single vector of integers (eg, a partition within a - * coefficient block) using PVQ - * - * @param [in,out] w multi-symbol entropy encoder - * @param [in] qg quantized gain - * @param [in] theta quantized post-prediction theta - * @param [in] in coefficient vector to code - * @param [in] n number of coefficients in partition - * @param [in] k number of pulses in partition - * @param [in,out] model entropy encoder state - * @param [in,out] adapt adaptation context - * @param [in,out] exg ExQ16 expectation of gain value - * @param [in,out] ext ExQ16 expectation of theta value - * @param [in] cdf_ctx selects which cdf context to use - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] code_skip whether the "skip rest" flag is allowed - * @param [in] skip_rest when set, we skip all higher bands - * @param [in] encode_flip whether we need to encode the CfL flip flag now - * @param [in] flip value of the CfL flip flag - */ -void pvq_encode_partition(aom_writer *w, - int qg, - int theta, - const od_coeff *in, - int n, - int k, - generic_encoder model[3], - od_adapt_ctx *adapt, - int *exg, - int *ext, - int cdf_ctx, - int is_keyframe, - int code_skip, - int skip_rest, - int encode_flip, - int flip) { - int noref; - int id; - noref = (theta == -1); - id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest; - if (is_keyframe) { - OD_ASSERT(id != 8); - if (id >= 8) id--; - } - else { - OD_ASSERT(id != 10); - if (id >= 10) id--; - } - /* Jointly code gain, theta and noref for small values. Then we handle - larger gain and theta values. For noref, theta = -1. */ - aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0], - 8 + 7*code_skip); - if (encode_flip) { - /* We could eventually do some smarter entropy coding here, but it would - have to be good enough to overcome the overhead of the entropy coder. 
- An early attempt using a "toogle" flag with simple adaptation wasn't - worth the trouble. */ - aom_write_bit(w, flip); - } - if (qg > 0) { - int tmp; - tmp = *exg; - generic_encode(w, &model[!noref], qg - 1, &tmp, 2); - OD_IIR_DIADIC(*exg, qg << 16, 2); - } - if (theta > 1) { - int tmp; - tmp = *ext; - generic_encode(w, &model[2], theta - 2, &tmp, 2); - OD_IIR_DIADIC(*ext, theta << 16, 2); - } - aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in, - n - (theta != -1), k); -} - -/** Quantizes a scalar with rate-distortion optimization (RDO) - * @param [in] x unquantized value - * @param [in] q quantization step size - * @param [in] delta0 rate increase for encoding a 1 instead of a 0 - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @retval quantized value - */ -int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) { - int n; - /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See - Jmspeex' Journal of Dubious Theoretical Results for details. 
*/ - n = OD_DIV_R0(abs(x), q); - if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) { - return 0; - } - else { - return OD_DIV_R0(x, q); - } -} - -/** Encode a coefficient block (excepting DC) using PVQ - * - * @param [in,out] enc daala encoder context - * @param [in] ref 'reference' (prediction) vector - * @param [in] in coefficient block to quantize and encode - * @param [out] out quantized coefficient block - * @param [in] q0 scale/quantizer - * @param [in] pli plane index - * @param [in] bs log of the block size minus two - * @param [in] beta per-band activity masking beta param - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - * @param [in] speed Make search faster by making approximations - * @param [in] pvq_info If null, conisdered as RDO search mode - * @return Returns block skip info indicating whether DC/AC are coded. - * bit0: DC is coded, bit1: AC is coded (1 means coded) - * - */ -PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, - od_coeff *ref, - const od_coeff *in, - od_coeff *out, - int q_dc, - int q_ac, - int pli, - int bs, - const od_val16 *beta, - int is_keyframe, - const int16_t *qm, - const int16_t *qm_inv, - int speed, - PVQ_INFO *pvq_info){ - int theta[PVQ_MAX_PARTITIONS]; - int qg[PVQ_MAX_PARTITIONS]; - int k[PVQ_MAX_PARTITIONS]; - od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX]; - int *exg; - int *ext; - int nb_bands; - int i; - const int *off; - int size[PVQ_MAX_PARTITIONS]; - generic_encoder *model; - double skip_diff; - int tell; - uint16_t *skip_cdf; - od_rollback_buffer buf; - int dc_quant; - int flip; - int cfl_encoded; - int skip_rest; - int skip_dir; - int skip_theta_value; - const unsigned char *pvq_qm; - double dc_rate; - int use_masking; - PVQ_SKIP_TYPE ac_dc_coded; - - aom_clear_system_state(); - - use_masking = enc->use_activity_masking; - - if (use_masking) - pvq_qm = 
&enc->state.pvq_qm_q4[pli][0]; - else - pvq_qm = 0; - - exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0]; - ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS; - skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)]; - model = enc->state.adapt->pvq.pvq_param_model; - nb_bands = OD_BAND_OFFSETS[bs][0]; - off = &OD_BAND_OFFSETS[bs][1]; - - if (use_masking) - dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4); - else - dc_quant = OD_MAXI(1, q_dc); - - tell = 0; - for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i]; - skip_diff = 0; - flip = 0; - /*If we are coding a chroma block of a keyframe, we are doing CfL.*/ - if (pli != 0 && is_keyframe) { - od_val32 xy; - xy = 0; - /*Compute the dot-product of the first band of chroma with the luma ref.*/ - for (i = off[0]; i < off[1]; i++) { -#if defined(OD_FLOAT_PVQ) - xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1* - (double)in[i]*(double)qm[i]*OD_QM_SCALE_1; -#else - od_val32 rq; - od_val32 inq; - rq = ref[i]*qm[i]; - inq = in[i]*qm[i]; - xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT, - 1)); -#endif - } - /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/ - if (xy < 0) { - flip = 1; - for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i]; - } - } - for (i = 0; i < nb_bands; i++) { - int q; - - if (use_masking) - q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4); - else - q = OD_MAXI(1, q_ac); - - qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i], - q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe, - pli, enc->state.adapt, qm + off[i], qm_inv + off[i], - enc->pvq_norm_lambda, speed); - } - od_encode_checkpoint(enc, &buf); - if (is_keyframe) out[0] = 0; - else { - int n; - n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); - if (n == 0) { - out[0] = 0; - } else { - int tell2; - od_rollback_buffer dc_buf; - - dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/ - 
(double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1]))); - dc_rate += 1; - -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - od_encode_checkpoint(enc, &dc_buf); - generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], - n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - dc_rate += tell2/8.0; - od_encode_rollback(enc, &dc_buf); - - out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, - enc->pvq_norm_lambda); - } - } -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - /* Code as if we're not skipping. */ - aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4); - ac_dc_coded = AC_CODED + (out[0] != 0); - cfl_encoded = 0; - skip_rest = 1; - skip_theta_value = is_keyframe ? -1 : 0; - for (i = 1; i < nb_bands; i++) { - if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0; - } - skip_dir = 0; - if (nb_bands > 1) { - for (i = 0; i < 3; i++) { - int j; - int tmp; - tmp = 1; - // ToDo(yaowu): figure out better stop condition without gcc warning. - for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) { - if (theta[j] != skip_theta_value || qg[j]) tmp = 0; - } - skip_dir |= tmp << i; - } - } - if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0; - - /* NOTE: There was no other better place to put this function. */ - if (pvq_info) - av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size, - skip_rest, skip_dir, bs); - - for (i = 0; i < nb_bands; i++) { - int encode_flip; - /* Encode CFL flip bit just after the first time it's used. 
*/ - encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded; - if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) { - pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i], - size[i], k[i], model, enc->state.adapt, exg + i, ext + i, - (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i, - is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip); - } - if (i == 0 && !skip_rest && bs > 0) { - aom_write_symbol(&enc->w, skip_dir, - &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7); - } - if (encode_flip) cfl_encoded = 1; - } -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&enc->w.ec) - tell; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - /* Account for the rate of skipping the AC, based on the same DC decision - we made when trying to not skip AC. */ - { - double skip_rate; - if (out[0] != 0) { - skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ - (double)OD_ICDF(skip_cdf[3])); - } - else { - skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/ - (double)OD_ICDF(skip_cdf[3])); - } - tell -= (int)floor(.5+8*skip_rate); - } - if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) { - if (is_keyframe) out[0] = 0; - else { - int n; - n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); - if (n == 0) { - out[0] = 0; - } else { - int tell2; - od_rollback_buffer dc_buf; - - dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ - (double)OD_ICDF(skip_cdf[0])); - dc_rate += 1; - -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - od_encode_checkpoint(enc, &dc_buf); - generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], - n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
-#endif - dc_rate += tell2/8.0; - od_encode_rollback(enc, &dc_buf); - - out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, - enc->pvq_norm_lambda); - } - } - /* We decide to skip, roll back everything as it was before. */ - od_encode_rollback(enc, &buf); - aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4); - ac_dc_coded = (out[0] != 0); - if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0; - else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i]; - } - if (pvq_info) - pvq_info->ac_dc_coded = ac_dc_coded; - return ac_dc_coded; -} diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h deleted file mode 100644 index b84c8961b..000000000 --- a/third_party/aom/av1/encoder/pvq_encoder.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_pvq_encoder_H) -# define _pvq_encoder_H (1) -# include "aom_dsp/bitwriter.h" -# include "aom_dsp/entenc.h" -# include "av1/common/blockd.h" -# include "av1/common/pvq.h" -# include "av1/encoder/encint.h" - -void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs); - -void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, - const int *y, int n, int k, int level); - -void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay); - -void pvq_encode_partition(aom_writer *w, - int qg, - int theta, - const od_coeff *in, - int n, - int k, - generic_encoder model[3], - od_adapt_ctx *adapt, - int *exg, - int *ext, - int cdf_ctx, - int is_keyframe, - int code_skip, - int skip_rest, - int encode_flip, - int flip); - -PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref, - const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs, - const od_val16 *beta, int is_keyframe, - const int16_t *qm, const int16_t *qm_inv, int speed, - PVQ_INFO *pvq_info); - -#endif diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c index 6d2eb4183..781f528eb 100644 --- a/third_party/aom/av1/encoder/ransac.c +++ b/third_party/aom/av1/encoder/ransac.c @@ -80,60 +80,6 @@ static void project_points_double_affine(double *mat, double *points, } } -static void project_points_double_hortrapezoid(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[7] * y + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. 
/ Z_inv; - *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; - *(proj++) = (mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -static void project_points_double_vertrapezoid(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[6] * x + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. / Z_inv; - *(proj++) = (mat[2] * x + mat[0]) * Z; - *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -static void project_points_double_homography(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[6] * x + mat[7] * y + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. 
/ Z_inv; - *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; - *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - static void normalize_homography(double *pts, int n, double *T) { double *p = pts; double mean[2] = { 0, 0 }; @@ -193,22 +139,6 @@ static void denormalize_homography(double *params, double *T1, double *T2) { multiply_mat(iT2, params2, params, 3, 3, 3); } -static void denormalize_homography_reorder(double *params, double *T1, - double *T2) { - double params_denorm[MAX_PARAMDIM]; - memcpy(params_denorm, params, sizeof(*params) * 8); - params_denorm[8] = 1.0; - denormalize_homography(params_denorm, T1, T2); - params[0] = params_denorm[2]; - params[1] = params_denorm[5]; - params[2] = params_denorm[0]; - params[3] = params_denorm[1]; - params[4] = params_denorm[3]; - params[5] = params_denorm[4]; - params[6] = params_denorm[6]; - params[7] = params_denorm[7]; -} - static void denormalize_affine_reorder(double *params, double *T1, double *T2) { double params_denorm[MAX_PARAMDIM]; params_denorm[0] = params[0]; @@ -377,217 +307,6 @@ static int find_affine(int np, double *pts1, double *pts2, double *mat) { return 0; } -static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) { - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); - double *U = a + np3 * 7; - double S[7], V[7 * 7], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = 0; - a[i * 3 * 7 + 2] = -sx; - a[i * 3 * 7 + 3] = -sy; - a[i * 3 * 7 + 4] = -1; - a[i * 3 * 7 + 5] = dy * sx; - a[i * 3 * 7 + 6] = dy; - - a[(i * 3 + 1) * 7 + 0] = sx; - a[(i * 3 + 1) * 7 + 1] = 1; - a[(i * 3 + 1) * 7 + 2] = a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = - 0; 
- a[(i * 3 + 1) * 7 + 5] = -dx * sx; - a[(i * 3 + 1) * 7 + 6] = -dx; - - a[(i * 3 + 2) * 7 + 0] = -dy * sx; - a[(i * 3 + 2) * 7 + 1] = -dy; - a[(i * 3 + 2) * 7 + 2] = dx * sx; - a[(i * 3 + 2) * 7 + 3] = dx * sy; - a[(i * 3 + 2) * 7 + 4] = dx; - a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; - } - if (SVD(U, S, V, a, np3, 7)) { - aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 7; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - H[1] = H[7] = 0; - for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini]; - for (; i < 6; i++) H[i + 1] = V[i * 7 + mini]; - for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; - - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - -static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) { - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); - double *U = a + np3 * 7; - double S[7], V[7 * 7], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = a[i * 3 * 7 + 2] = 0; - a[i * 3 * 7 + 3] = -sy; - a[i * 3 * 7 + 4] = -1; - a[i * 3 * 7 + 5] = dy * sy; - a[i * 3 * 7 + 6] = dy; - - a[(i * 3 + 1) * 7 + 0] = sx; - a[(i * 3 + 1) * 7 + 1] = sy; - a[(i * 3 + 1) * 7 + 2] = 1; - a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = 0; - a[(i * 3 + 1) * 7 + 5] = -dx * sy; - a[(i * 3 + 1) * 7 + 6] = -dx; - - a[(i * 3 + 2) * 7 + 0] = -dy * sx; - a[(i * 3 + 2) * 7 + 1] = -dy * sy; - a[(i * 3 + 2) * 7 + 2] = -dy; - a[(i * 3 + 2) * 7 + 3] = dx * sy; - a[(i * 3 + 2) * 7 + 4] = dx; - a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; - } - - if (SVD(U, S, V, a, np3, 7)) { 
- aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 7; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - H[3] = H[6] = 0; - for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini]; - for (; i < 5; i++) H[i + 1] = V[i * 7 + mini]; - for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; - - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - -static int find_homography(int np, double *pts1, double *pts2, double *mat) { - // Implemented from Peter Kovesi's normalized implementation - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18); - double *U = a + np3 * 9; - double S[9], V[9 * 9], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0; - a[i * 3 * 9 + 3] = -sx; - a[i * 3 * 9 + 4] = -sy; - a[i * 3 * 9 + 5] = -1; - a[i * 3 * 9 + 6] = dy * sx; - a[i * 3 * 9 + 7] = dy * sy; - a[i * 3 * 9 + 8] = dy; - - a[(i * 3 + 1) * 9 + 0] = sx; - a[(i * 3 + 1) * 9 + 1] = sy; - a[(i * 3 + 1) * 9 + 2] = 1; - a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] = - 0; - a[(i * 3 + 1) * 9 + 6] = -dx * sx; - a[(i * 3 + 1) * 9 + 7] = -dx * sy; - a[(i * 3 + 1) * 9 + 8] = -dx; - - a[(i * 3 + 2) * 9 + 0] = -dy * sx; - a[(i * 3 + 2) * 9 + 1] = -dy * sy; - a[(i * 3 + 2) * 9 + 2] = -dy; - a[(i * 3 + 2) * 9 + 3] = dx * sx; - a[(i * 3 + 2) * 9 + 4] = dx * sy; - a[(i * 3 + 2) * 9 + 5] = dx; - a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] = - 0; - } - - if (SVD(U, S, V, a, np3, 9)) { - aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 
9; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - - for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini]; - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - static int get_rand_indices(int npoints, int minpts, int *indices, unsigned int *seed) { int i, j; @@ -860,11 +579,6 @@ static int is_degenerate_affine(double *p) { return is_collinear3(p, p + 2, p + 4); } -static int is_degenerate_homography(double *p) { - return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) || - is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6); -} - int ransac_translation(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_desired_motions) { @@ -887,30 +601,3 @@ int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, params_by_motion, num_desired_motions, 3, is_degenerate_affine, find_affine, project_points_double_affine); } - -int ransac_homography(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_homography, - project_points_double_homography); -} - -int ransac_hortrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_hortrapezoid, - project_points_double_hortrapezoid); -} - -int ransac_vertrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - 
params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_vertrapezoid, - project_points_double_vertrapezoid); -} diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h index f611add36..1019055ed 100644 --- a/third_party/aom/av1/encoder/ransac.h +++ b/third_party/aom/av1/encoder/ransac.h @@ -25,17 +25,8 @@ typedef int (*RansacFunc)(int *matched_points, int npoints, /* Each of these functions fits a motion model from a set of corresponding points in 2 frames using RANSAC. */ -int ransac_homography(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); -int ransac_hortrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); -int ransac_vertrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); int ransac_translation(int *matched_points, int npoints, diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index a90cb880e..ac9392fa1 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -44,7 +44,6 @@ #define MAX_BPB_FACTOR 50 #define FRAME_OVERHEAD_BITS 200 -#if CONFIG_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ switch (bit_depth) { \ @@ -58,13 +57,6 @@ name = NULL; \ } \ } while (0) -#else -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - (void)bit_depth; \ - name = name##_8; \ - } while (0) -#endif // Tables relating active max Q to active min Q static int kf_low_motion_minq_8[QINDEX_RANGE]; @@ -74,7 +66,6 @@ static int arfgf_high_motion_minq_8[QINDEX_RANGE]; static int 
inter_minq_8[QINDEX_RANGE]; static int rtc_minq_8[QINDEX_RANGE]; -#if CONFIG_HIGHBITDEPTH static int kf_low_motion_minq_10[QINDEX_RANGE]; static int kf_high_motion_minq_10[QINDEX_RANGE]; static int arfgf_low_motion_minq_10[QINDEX_RANGE]; @@ -87,7 +78,6 @@ static int arfgf_low_motion_minq_12[QINDEX_RANGE]; static int arfgf_high_motion_minq_12[QINDEX_RANGE]; static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; -#endif static int gf_high = 2000; static int gf_low = 400; @@ -97,7 +87,6 @@ static int kf_low = 400; // How many times less pixels there are to encode given the current scaling. // Temporary replacement for rcf_mult and rate_thresh_mult. static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) { - (void)cpi; return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height); } @@ -140,33 +129,27 @@ void av1_rc_init_minq_luts(void) { init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, inter_minq_8, rtc_minq_8, AOM_BITS_8); -#if CONFIG_HIGHBITDEPTH init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, inter_minq_10, rtc_minq_10, AOM_BITS_10); init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, inter_minq_12, rtc_minq_12, AOM_BITS_12); -#endif } // These functions use formulaic calculations to make playing with the // quantizer tables easier. 
If necessary they can be replaced by lookup // tables if and when things settle down in the experimental bitstream double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { -// Convert the index to a real Q value (scaled down to match old Q values) -#if CONFIG_HIGHBITDEPTH + // Convert the index to a real Q value (scaled down to match old Q values) switch (bit_depth) { - case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0; - case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0; - case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0; + case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1.0; } -#else - return av1_ac_quant(qindex, 0, bit_depth) / 4.0; -#endif } int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, @@ -196,12 +179,8 @@ int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const int min_frame_target = AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); -// Clip the frame target to the minimum setup value. -#if CONFIG_EXT_REFS + // Clip the frame target to the minimum setup value. if (cpi->rc.is_src_frame_alt_ref) { -#else - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { -#endif // CONFIG_EXT_REFS // If there is an active ARF at this location use the minimum // bits on this frame even if it is a constructed arf. // The active maximum quantizer insures that an appropriate @@ -239,14 +218,10 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; -// Non-viewable frames are a special case and are treated as pure overhead. 
-#if CONFIG_EXT_REFS + // Non-viewable frames are a special case and are treated as pure overhead. // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME // differently, since it is a no-show frame. if (!cm->show_frame && !rc->is_bwd_ref_frame) -#else - if (!cm->show_frame) -#endif // CONFIG_EXT_REFS rc->bits_off_target -= encoded_frame_size; else rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; @@ -590,11 +565,9 @@ static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { active_worst_quality = curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2; } else { - if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame)) { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 : rc->last_q[INTER_FRAME]; } else { @@ -931,26 +904,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, } int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { - static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { - 1.00, // INTER_NORMAL -#if CONFIG_EXT_REFS - 0.80, // INTER_LOW - 1.50, // INTER_HIGH - 1.25, // GF_ARF_LOW -#else - 1.00, // INTER_HIGH - 1.50, // GF_ARF_LOW -#endif // CONFIG_EXT_REFS - 2.00, // GF_ARF_STD - 2.00, // KF_STD + static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = { + INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; - static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = -#if CONFIG_EXT_REFS - { INTER_FRAME, INTER_FRAME, INTER_FRAME, - INTER_FRAME, INTER_FRAME, KEY_FRAME }; -#else - { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; -#endif // CONFIG_EXT_REFS const AV1_COMMON *const cm = &cpi->common; int qdelta = 
av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, @@ -1020,11 +976,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); } - } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame)) { + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -1044,11 +998,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == AOM_Q) { -#if CONFIG_EXT_REFS if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { -#else - if (!cpi->refresh_alt_ref_frame) { -#endif // CONFIG_EXT_REFS active_best_quality = cq_level; } else { active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); @@ -1080,11 +1030,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if ((cpi->oxcf.rc_mode != AOM_Q) && (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { if (frame_is_intra_only(cm) || - (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame))) { + (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame))) { active_best_quality -= (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); active_worst_quality += (cpi->twopass.extend_maxq / 2); @@ -1106,7 +1054,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, } // Modify active_best_quality for downscaled normal frames. 
- if (!av1_frame_unscaled(cm) && !frame_is_kf_gf_arf(cpi)) { + if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate( rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); active_best_quality = @@ -1193,7 +1141,7 @@ static void rc_set_frame_target(AV1_COMP *cpi, int target, int width, rc->this_frame_target = target; // Modify frame size target when down-scaled. - if (!av1_frame_unscaled(cm)) + if (av1_frame_scaled(cm)) rc->this_frame_target = (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height)); @@ -1217,21 +1165,13 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) { static void update_golden_frame_stats(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if CONFIG_EXT_REFS // Update the Golden frame usage counts. // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame, // only the virtual indices for the reference frame will be // updated and cpi->refresh_golden_frame will still be zero. if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) { -#else // !CONFIG_EXT_REFS - // Update the Golden frame usage counts. 
- if (cpi->refresh_golden_frame) { -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS // We will not use internal overlay frames to replace the golden frame if (!rc->is_src_frame_ext_arf) -#endif // CONFIG_EXT_REFS // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; @@ -1248,11 +1188,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { // Decrement count down till next gf if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; -#if CONFIG_EXT_REFS } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { -#else - } else if (!cpi->refresh_alt_ref_frame) { -#endif // CONFIG_EXT_REFS // Decrement count down till next gf if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; @@ -1282,10 +1218,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { if (!rc->is_src_frame_alt_ref && - !(cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame)) { rc->last_q[INTER_FRAME] = qindex; rc->avg_frame_qindex[INTER_FRAME] = @@ -1307,10 +1240,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) || (!rc->constrained_gf_group && - (cpi->refresh_alt_ref_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } @@ -1320,7 +1250,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // Rolling monitors of whether we are over or underspending used to help // regulate min and Max 
Q in two pass. - if (!av1_frame_unscaled(cm)) + if (av1_frame_scaled(cm)) rc->this_frame_target = (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width, cm->height)); @@ -1337,14 +1267,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // Actual bits spent rc->total_actual_bits += rc->projected_frame_size; -#if CONFIG_EXT_REFS // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME // differently here for rc->avg_frame_bandwidth. rc->total_target_bits += (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0; -#else - rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; -#endif // CONFIG_EXT_REFS rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; @@ -1358,13 +1284,9 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; -#if CONFIG_EXT_REFS // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME // differently here for rc->avg_frame_bandwidth. if (cm->show_frame || rc->is_bwd_ref_frame) { -#else - if (cm->show_frame) { -#endif // CONFIG_EXT_REFS rc->frames_since_key++; rc->frames_to_key--; } @@ -1417,6 +1339,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; + int altref_enabled = is_altref_enabled(cpi); + int sframe_dist = cpi->oxcf.sframe_dist; + int sframe_mode = cpi->oxcf.sframe_mode; + int sframe_enabled = cpi->oxcf.sframe_enabled; // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. 
if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || @@ -1429,6 +1355,37 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { rc->source_alt_ref_active = 0; } else { cm->frame_type = INTER_FRAME; + if (sframe_enabled) { + if (altref_enabled) { + if (sframe_mode == 1) { + // sframe_mode == 1: insert sframe if it matches altref frame. + + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0 && + cpi->refresh_alt_ref_frame) { + cm->frame_type = S_FRAME; + } + } else { + // sframe_mode != 1: if sframe will be inserted at the next available + // altref frame + + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) { + rc->sframe_due = 1; + } + + if (rc->sframe_due && cpi->refresh_alt_ref_frame) { + cm->frame_type = S_FRAME; + rc->sframe_due = 0; + } + } + } else { + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) { + cm->frame_type = S_FRAME; + } + } + } } if (rc->frames_till_gf_update_due == 0) { rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; @@ -1444,6 +1401,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; rc->gfu_boost = DEFAULT_GF_BOOST; } + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + av1_cyclic_refresh_update_parameters(cpi); + if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); else diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 8b410e778..81157ce72 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -28,7 +28,6 @@ extern "C" { #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only -#if CONFIG_EXT_REFS typedef enum { INTER_NORMAL = 0, INTER_LOW = 1, @@ -38,23 +37,20 
@@ typedef enum { KF_STD = 5, RATE_FACTOR_LEVELS = 6 } RATE_FACTOR_LEVEL; -#else -typedef enum { - INTER_NORMAL = 0, - INTER_HIGH = 1, - GF_ARF_LOW = 2, - GF_ARF_STD = 3, - KF_STD = 4, - RATE_FACTOR_LEVELS = 5 -} RATE_FACTOR_LEVEL; -#endif // CONFIG_EXT_REFS + +static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 0.80, // INTER_LOW + 1.50, // INTER_HIGH + 1.25, // GF_ARF_LOW + 2.00, // GF_ARF_STD + 2.00, // KF_STD +}; typedef struct { int resize_width; int resize_height; -#if CONFIG_FRAME_SUPERRES uint8_t superres_denom; -#endif // CONFIG_FRAME_SUPERRES } size_params_type; typedef struct { @@ -88,8 +84,8 @@ typedef struct { int source_alt_ref_pending; int source_alt_ref_active; int is_src_frame_alt_ref; + int sframe_due; -#if CONFIG_EXT_REFS // Length of the bi-predictive frame group interval int bipred_group_interval; @@ -99,7 +95,6 @@ typedef struct { int is_last_bipred_frame; int is_bipred_frame; int is_src_frame_ext_arf; -#endif // CONFIG_EXT_REFS int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c index b9f827528..e69de29bb 100644 --- a/third_party/aom/av1/encoder/ratectrl_xiph.c +++ b/third_party/aom/av1/encoder/ratectrl_xiph.c @@ -1,1244 +0,0 @@ -/* - * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include -#include -#include -#include "av1/common/odintrin.h" -#include "av1/encoder/ratectrl_xiph.h" - -#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57)) -#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45)))) -#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12)))) - -/*A rough lookup table for tan(x), 0 <= x < pi/2. - The values are Q12 fixed-point and spaced at 5 degree intervals. - These decisions are somewhat arbitrary, but sufficient for the 2nd order - Bessel follower below. - Values of x larger than 85 degrees are extrapolated from the last interval, - which is way off, but "good enough".*/ -static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0, 358, 722, 1098, 1491, - 1910, 2365, 2868, 3437, 4096, - 4881, 5850, 7094, 8784, 11254, - 15286, 23230, 46817 }; - -/*alpha is Q24 in the range [0,0.5). - The return values is 5.12.*/ -static int od_warp_alpha(int alpha) { - int i; - int d; - int t0; - int t1; - i = alpha * 36 >> 24; - if (i >= 17) i = 16; - t0 = OD_ROUGH_TAN_LOOKUP[i]; - t1 = OD_ROUGH_TAN_LOOKUP[i + 1]; - d = alpha * 36 - (i << 24); - return (int)((((int64_t)t0 << 32) + ((t1 - t0) << 8) * (int64_t)d) >> 32); -} - -static const int64_t OD_ATANH_LOG2[32] = { - 0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL, - 0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL, - 0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL, - 0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL, - 0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL, - 0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL, - 0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL, - 0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL, - 0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL, - 0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL, - 0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL -}; - -static int od_ilog64(int64_t v) { - static const 
unsigned char OD_DEBRUIJN_IDX64[64] = { - 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, - 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, - 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, - 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58 - }; - int ret; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - ret = (int)v & 1; - v = (v >> 1) + 1; - ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F]; - return ret; -} - -/*Computes the binary exponential of logq57. - input: a log base 2 in Q57 format - output: a 64 bit integer in Q0 (no fraction) */ -static int64_t od_bexp64(int64_t logq57) { - int64_t w; - int64_t z; - int ipart; - ipart = (int)(logq57 >> 57); - if (ipart < 0) return 0; - if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL; - z = logq57 - OD_Q57(ipart); - if (z) { - int64_t mask; - int64_t wlo; - int i; - /*C doesn't give us 64x64->128 muls, so we use CORDIC. - This is not particularly fast, but it's not being used in time-critical - code; it is very accurate.*/ - /*z is the fractional part of the log in Q62 format. - We need 1 bit of headroom since the magnitude can get larger than 1 - during the iteration, and a sign bit.*/ - z <<= 5; - /*w is the exponential in Q61 format (since it also needs headroom and can - get as large as 2.0); we could get another bit if we dropped the sign, - but we'll recover that bit later anyway. 
- Ideally this should start out as - \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} - but in order to guarantee convergence we have to repeat iterations 4, - 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/ - w = 0x26A3D0E401DD846DLL; - for (i = 0;; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z -= (OD_ATANH_LOG2[i] + mask) ^ mask; - /*Repeat iteration 4.*/ - if (i >= 3) break; - z *= 2; - } - for (;; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z -= (OD_ATANH_LOG2[i] + mask) ^ mask; - /*Repeat iteration 13.*/ - if (i >= 12) break; - z *= 2; - } - for (; i < 32; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2; - } - wlo = 0; - /*Skip the remaining iterations unless we really require that much - precision. - We could have bailed out earlier for smaller iparts, but that would - require initializing w from a table, as the limit doesn't converge to - 61-bit precision until n=30.*/ - if (ipart > 30) { - /*For these iterations, we just update the low bits, as the high bits - can't possibly be affected. 
- OD_ATANH_LOG2 has also converged (it actually did so one iteration - earlier, but that's no reason for an extra special case).*/ - for (;; i++) { - mask = -(z < 0); - wlo += ((w >> i) + mask) ^ mask; - z -= (OD_ATANH_LOG2[31] + mask) ^ mask; - /*Repeat iteration 40.*/ - if (i >= 39) break; - z <<= 1; - } - for (; i < 61; i++) { - mask = -(z < 0); - wlo += ((w >> i) + mask) ^ mask; - z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1; - } - } - w = (w << 1) + wlo; - } else { - w = (int64_t)1 << 62; - } - if (ipart < 62) { - w = ((w >> (61 - ipart)) + 1) >> 1; - } - return w; -} - -/*Computes the binary log of w - input: a 64-bit integer in Q0 (no fraction) - output: a 64-bit log in Q57 */ -static int64_t od_blog64(int64_t w) { - int64_t z; - int ipart; - if (w <= 0) return -1; - ipart = od_ilog64(w) - 1; - if (ipart > 61) { - w >>= ipart - 61; - } else { - w <<= 61 - ipart; - } - z = 0; - if (w & (w - 1)) { - int64_t x; - int64_t y; - int64_t u; - int64_t mask; - int i; - /*C doesn't give us 64x64->128 muls, so we use CORDIC. - This is not particularly fast, but it's not being used in time-critical - code; it is very accurate.*/ - /*z is the fractional part of the log in Q61 format.*/ - /*x and y are the cosh() and sinh(), respectively, in Q61 format. 
- We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/ - x = w + ((int64_t)1 << 61); - y = w - ((int64_t)1 << 61); - for (i = 0; i < 4; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 4.*/ - for (i--; i < 13; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 13.*/ - for (i--; i < 32; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*OD_ATANH_LOG2 has converged.*/ - for (; i < 40; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 40.*/ - for (i--; i < 62; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - z = (z + 8) >> 4; - } - return OD_Q57(ipart) + z; -} - -/*Convenience function converts Q57 value to a clamped 32-bit Q24 value - in: input in Q57 format. - Return: same number in Q24 */ -static int32_t od_q57_to_q24(int64_t in) { - int64_t ret; - ret = (in + ((int64_t)1 << 32)) >> 33; - /*0x80000000 is automatically converted to unsigned on 32-bit systems. - -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to - unsigned.*/ - return (int32_t)OD_CLAMPI(-0x7FFFFFFF - 1, ret, 0x7FFFFFFF); -} - -/*Binary exponential of log_scale with 24-bit fractional precision and - saturation. - log_scale: A binary logarithm in Q57 format. 
- Return: The binary exponential in Q24 format, saturated to 2**31-1 if - log_scale was too large.*/ -static int32_t od_bexp64_q24(int64_t log_scale) { - if (log_scale < OD_Q57(8)) { - int64_t ret; - ret = od_bexp64(log_scale + OD_Q57(24)); - return ret < 0x7FFFFFFF ? (int32_t)ret : 0x7FFFFFFF; - } - return 0x7FFFFFFF; -} - -/*Re-initialize Bessel filter coefficients with the specified delay. - This does not alter the x/y state, but changes the reaction time of the - filter. - Altering the time constant of a reactive filter without alterning internal - state is something that has to be done carefuly, but our design operates at - high enough delays and with small enough time constant changes to make it - safe.*/ -static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) { - int alpha; - int64_t one48; - int64_t warp; - int64_t k1; - int64_t k2; - int64_t d; - int64_t a; - int64_t ik2; - int64_t b1; - int64_t b2; - /*This borrows some code from an unreleased version of Postfish. - See the recipe at http://unicorn.us.com/alex/2polefilters.html for details - on deriving the filter coefficients.*/ - /*alpha is Q24*/ - alpha = (1 << 24) / delay; - one48 = (int64_t)1 << 48; - /*warp is 7.12*/ - warp = OD_MAXI(od_warp_alpha(alpha), 1); - /*k1 is 9.12*/ - k1 = 3 * warp; - /*k2 is 16.24.*/ - k2 = k1 * warp; - /*d is 16.15.*/ - d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9; - /*a is 0.32, since d is larger than both 1.0 and k2.*/ - a = (k2 << 23) / d; - /*ik2 is 25.24.*/ - ik2 = one48 / k2; - /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/ - b1 = 2 * a * (ik2 - (1 << 24)); - /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/ - b2 = (one48 << 8) - ((4 * a) << 24) - b1; - /*All of the filter parameters are Q24.*/ - f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32); - f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32); - f->g = (int32_t)((a + 128) >> 8); -} - -/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay 
- and initial value. - value is Q24.*/ -static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) { - od_iir_bessel2_reinit(f, delay); - f->y[1] = f->y[0] = f->x[1] = f->x[0] = value; -} - -static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) { - int64_t c0; - int64_t c1; - int64_t g; - int64_t x0; - int64_t x1; - int64_t y0; - int64_t y1; - int64_t ya; - c0 = f->c[0]; - c1 = f->c[1]; - g = f->g; - x0 = f->x[0]; - x1 = f->x[1]; - y0 = f->y[0]; - y1 = f->y[1]; - ya = ((x + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1 << 23)) >> 24; - f->x[1] = (int32_t)x0; - f->x[0] = x; - f->y[1] = (int32_t)y0; - f->y[0] = (int32_t)ya; - return ya; -} - -static void od_enc_rc_reset(od_rc_state *rc) { - int64_t npixels; - int64_t ibpp; - rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); - /*Insane framerates or frame sizes mean insane bitrates. - Let's not get carried away.*/ - if (rc->bits_per_frame > 0x400000000000LL) { - rc->bits_per_frame = (int64_t)0x400000000000LL; - } else { - if (rc->bits_per_frame < 32) { - rc->bits_per_frame = 32; - } - } - rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); - rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; - /*Start with a buffer fullness and fullness target of 50% */ - rc->reservoir_target = (rc->reservoir_max + 1) >> 1; - rc->reservoir_fullness = rc->reservoir_target; - /*Pick exponents and initial scales for quantizer selection.*/ - npixels = rc->frame_width * (int64_t)rc->frame_height; - rc->log_npixels = od_blog64(npixels); - ibpp = npixels / rc->bits_per_frame; - /*All of these initial scale/exp values are from Theora, and have not yet - been adapted to Daala, so they're certainly wrong. 
- The B-frame values especially are simply copies of the P-frame values.*/ - if (ibpp < 1) { - rc->exp[OD_I_FRAME] = 59; - rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT); - } else if (ibpp < 2) { - rc->exp[OD_I_FRAME] = 55; - rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT); - } else { - rc->exp[OD_I_FRAME] = 48; - rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT); - } - if (ibpp < 4) { - rc->exp[OD_P_FRAME] = 100; - rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT); - } else if (ibpp < 8) { - rc->exp[OD_P_FRAME] = 95; - rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT); - } else { - rc->exp[OD_P_FRAME] = 73; - rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT); - } - /*Golden P-frames both use the same log_scale and exp modeling - values as regular P-frames and the same scale follower. - For convenience in the rate calculation code, we maintain a copy of - the scale and exp values in OD_GOLDEN_P_FRAME.*/ - rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME]; - rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME]; - rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME]; - rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME]; - /*We clamp the actual I and B frame delays to a minimum of 10 to work within - the range of values where later incrementing the delay works as designed. - 10 is not an exact choice, but rather a good working trade-off.*/ - rc->inter_p_delay = 10; - rc->inter_delay_target = rc->reservoir_frame_delay >> 1; - memset(rc->frame_count, 0, sizeof(rc->frame_count)); - /*Drop-frame tracking is concerned with more than just the basic three frame - types. - It needs to track boosted and cut subtypes (of which there is only one - right now, OD_GOLDEN_P_FRAME). 
*/ - rc->prev_drop_count[OD_I_FRAME] = 0; - rc->log_drop_scale[OD_I_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_P_FRAME] = 0; - rc->log_drop_scale[OD_P_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_GOLDEN_P_FRAME] = 0; - rc->log_drop_scale[OD_GOLDEN_P_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_ALTREF_P_FRAME] = 0; - rc->log_drop_scale[OD_ALTREF_P_FRAME] = OD_Q57(0); - /*Set up second order followers, initialized according to corresponding - time constants.*/ - od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4, - od_q57_to_q24(rc->log_scale[OD_I_FRAME])); - od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay, - od_q57_to_q24(rc->log_scale[OD_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_I_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_I_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_GOLDEN_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_GOLDEN_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_ALTREF_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_ALTREF_P_FRAME])); -} - -int od_enc_rc_resize(od_rc_state *rc) { - /*If encoding has not yet begun, reset the buffer state.*/ - if (rc->cur_frame == 0) { - od_enc_rc_reset(rc); - } else { - int idt; - /*Otherwise, update the bounds on the buffer, but not the current - fullness.*/ - rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); - /*Insane framerates or frame sizes mean insane bitrates. 
- Let's not get carried away.*/ - if (rc->bits_per_frame > 0x400000000000LL) { - rc->bits_per_frame = (int64_t)0x400000000000LL; - } else { - if (rc->bits_per_frame < 32) { - rc->bits_per_frame = 32; - } - } - rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); - rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; - rc->reservoir_target = - ((rc->reservoir_max + 1) >> 1) + - ((rc->bits_per_frame + 2) >> 2) * - OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay); - /*Update the INTER-frame scale filter delay. - We jump to it immediately if we've already seen enough frames; otherwise - it is simply set as the new target.*/ - rc->inter_delay_target = idt = OD_MAXI(rc->reservoir_frame_delay >> 1, 10); - if (idt < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) { - od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], idt, - rc->scalefilter[OD_P_FRAME].y[0]); - rc->inter_p_delay = idt; - } - } - return 0; -} - -int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) { - if (rc->framerate <= 0) return 1; - if (rc->target_bitrate > 0) { - /*State has already been initialized; rather than reinitialize, - adjust the buffering for the new target rate. */ - rc->target_bitrate = bitrate; - return od_enc_rc_resize(rc); - } - rc->target_quantizer = 0; - rc->target_bitrate = bitrate; - rc->rate_bias = 0; - if (bitrate > 0) { - /* The buffer size is clamped between [12, 256], this interval is short - enough to - allow reaction, but long enough to allow looking into the next GOP - (avoiding - the case where the last frames before an I-frame get starved). - The 12 frame minimum gives us some chance to distribute bit estimation - errors in the worst case. 
The 256 frame maximum means we'll require 8-10 - seconds - of pre-buffering at 24-30 fps, which is not unreasonable.*/ - rc->reservoir_frame_delay = - (int)OD_MINI((delay_ms / 1000) * rc->framerate, 256); - rc->drop_frames = 1; - rc->cap_overflow = 1; - rc->cap_underflow = 0; - rc->twopass_state = 0; - od_enc_rc_reset(rc); - } - return 0; -} - -/*Scale the number of frames by the number of expected drops/duplicates.*/ -static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) { - if (rc->prev_drop_count[frame_type] > 0 || - rc->log_drop_scale[frame_type] > OD_Q57(0)) { - int64_t dup_scale; - dup_scale = od_bexp64(((rc->log_drop_scale[frame_type] + - od_blog64(rc->prev_drop_count[frame_type] + 1)) >> - 1) + - OD_Q57(8)); - if (dup_scale < nframes << 8) { - int dup_scalei; - dup_scalei = (int)dup_scale; - if (dup_scalei > 0) { - nframes = ((nframes << 8) + dup_scalei - 1) / dup_scalei; - } - } else { - nframes = !!nframes; - } - } - return nframes; -} - -/*Closed form version of frame determination code. - Used by rate control to predict frame types and subtypes into the future. - No side effects, may be called any number of times. - Note that it ignores end-of-file conditions; one-pass planning *should* - ignore end-of-file. */ -int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, - int *is_altref, int64_t *ip_count) { - int frame_type; - if (coding_frame_count == 0) { - *is_golden = 1; - *is_altref = 1; - *ip_count = 0; - frame_type = OD_I_FRAME; - } else { - int keyrate = rc->keyframe_rate; - if (rc->closed_gop) { - int ip_per_gop; - int gop_n; - int gop_i; - ip_per_gop = (keyrate - 1) / 2; - gop_n = coding_frame_count / keyrate; - gop_i = coding_frame_count - gop_n * keyrate; - *ip_count = gop_n * ip_per_gop + (gop_i > 0) + (gop_i - 1); - frame_type = gop_i == 0 ? 
OD_I_FRAME : OD_P_FRAME; - } else { - int ip_per_gop; - int gop_n; - int gop_i; - ip_per_gop = (keyrate); - gop_n = (coding_frame_count - 1) / keyrate; - gop_i = coding_frame_count - gop_n * keyrate - 1; - *ip_count = (coding_frame_count > 0) + gop_n * ip_per_gop + (gop_i); - frame_type = gop_i / 1 < ip_per_gop - 1 ? OD_P_FRAME : OD_I_FRAME; - } - } - *is_golden = - (*ip_count % rc->goldenframe_rate) == 0 || frame_type == OD_I_FRAME; - *is_altref = (*ip_count % rc->altref_rate) == 0 || frame_type == OD_I_FRAME; - return frame_type; -} - -/*Count frames types forward from the current frame up to but not including - the last I-frame in reservoir_frame_delay. - If reservoir_frame_delay contains no I-frames (or the current frame is the - only I-frame), count all reservoir_frame_delay frames. - Returns the number of frames counted. - Right now, this implementation is simple, brute-force, and expensive. - It is also easy to understand and debug. - TODO: replace with a virtual FIFO that keeps running totals as - repeating the counting over-and-over will have a performance impact on - whole-file 2pass usage.*/ -static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) { - int i; - int j; - int acc[OD_FRAME_NSUBTYPES]; - int count; - int reservoir_frames; - int reservoir_frame_delay; - memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes)); - memset(acc, 0, sizeof(acc)); - count = 0; - reservoir_frames = 0; -#if 1 - /*Go ahead and count past end-of-stream. - We won't nail the exact bitrate on short files that end with a partial - GOP, but we also won't [potentially] destroy the quality of the last few - frames in that same case when we suddenly find out the stream is ending - before the original planning horizon.*/ - reservoir_frame_delay = rc->reservoir_frame_delay; -#else - /*Don't count past the end of the stream (once we know where end-of-stream - is).*/ - reservoir_frame_delay = - rc->end_of_input ? 
rc->input_size + 1 : rc->reservoir_frame_delay; -#endif - for (i = 0; i < reservoir_frame_delay; i++) { - int frame_type; - int is_golden; - int is_altref; - int64_t dummy; - frame_type = - od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref, &dummy); - switch (frame_type) { - case OD_I_FRAME: { - for (j = 0; j < OD_FRAME_NSUBTYPES; j++) nframes[j] += acc[j]; - reservoir_frames += count; - memset(acc, 0, sizeof(acc)); - acc[OD_I_FRAME] = 1; - count = 1; - break; - } - case OD_P_FRAME: { - if (is_golden) { - ++acc[OD_GOLDEN_P_FRAME]; - ++count; - } else if (is_altref) { - ++acc[OD_ALTREF_P_FRAME]; - ++count; - } else { - ++acc[OD_P_FRAME]; - ++count; - } - break; - } - } - } - /*If there were no I-frames at all, or only the first frame was an I-frame, - the accumulators never flushed and still contain the counts for the - entire buffer. - In both these cases, we return these counts. - Otherwise, we discard what remains in the accumulators as they contain - the counts from and past the last I-frame.*/ - if (reservoir_frames == 0) { - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) nframes[i] = acc[i]; - reservoir_frames += count; - } - return reservoir_frames; -} - -static int convert_to_ac_quant(int q, int bit_depth) { - return lrint(av1_convert_qindex_to_q(q, bit_depth)); -} - -int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, - int is_golden_frame, - int is_altref_frame, int frame_type, - int *bottom_idx, int *top_idx) { - int frame_subtype; - int64_t log_cur_scale; - int lossy_quantizer_min; - int lossy_quantizer_max; - double mqp_i = OD_MQP_I; - double mqp_p = OD_MQP_P; - double mqp_gp = OD_MQP_GP; - double mqp_ap = OD_MQP_AP; - int reservoir_frames; - int nframes[OD_FRAME_NSUBTYPES]; - int32_t mqp_Q12[OD_FRAME_NSUBTYPES]; - int64_t dqp_Q45[OD_FRAME_NSUBTYPES]; - /*Verify the closed-form frame type determination code matches what the - input queue set.*/ - /*One pseudo-non-closed-form caveat: - Once we've seen end-of-input, the batched frame 
determination code - suppresses the last open-GOP's I-frame (since it would only be - useful for the next GOP, which doesn't exist). - Thus, don't check one the input queue is drained.*/ - if (!rc->end_of_input) { - int closed_form_type; - int closed_form_golden; - int closed_form_altref; - int64_t closed_form_cur_frame; - closed_form_type = - od_frame_type(rc, rc->cur_frame, &closed_form_golden, - &closed_form_altref, &closed_form_cur_frame); - OD_UNUSED(closed_form_type); - OD_UNUSED(is_altref_frame); - assert(closed_form_type == frame_type); - assert(closed_form_cur_frame == rc->cur_frame); - assert(closed_form_altref == is_altref_frame); - assert(closed_form_golden == is_golden_frame); - } - - log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33; - - /*Count the various types and classes of frames.*/ - reservoir_frames = frame_type_count(rc, nframes); - nframes[OD_I_FRAME] = od_rc_scale_drop(rc, OD_I_FRAME, nframes[OD_I_FRAME]); - nframes[OD_P_FRAME] = od_rc_scale_drop(rc, OD_P_FRAME, nframes[OD_P_FRAME]); - nframes[OD_GOLDEN_P_FRAME] = - od_rc_scale_drop(rc, OD_GOLDEN_P_FRAME, nframes[OD_GOLDEN_P_FRAME]); - nframes[OD_ALTREF_P_FRAME] = - od_rc_scale_drop(rc, OD_ALTREF_P_FRAME, nframes[OD_ALTREF_P_FRAME]); - - switch (rc->twopass_state) { - default: break; - case 1: { - /*Pass 1 mode: use a fixed qi value.*/ - return rc->firstpass_quant; - } break; - case 2: { - int i; - int64_t scale_sum[OD_FRAME_NSUBTYPES]; - int qti; - /*Pass 2 mode: we know exactly how much of each frame type there is in - the current buffer window, and have estimates for the scales.*/ - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - nframes[i] = rc->nframes[i]; - nframes[i] = rc->nframes[i]; - scale_sum[i] = rc->scale_sum[i]; - } - /*If we're not using the same frame type as in pass 1 (because someone - changed the keyframe interval), remove that scale estimate. 
- We'll add in a replacement for the correct frame type below.*/ - qti = rc->cur_metrics.frame_type; - if (qti != frame_type) { - nframes[qti]--; - scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale); - } - /*Compute log_scale estimates for each frame type from the pass-1 scales - we measured in the current window.*/ - for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) { - rc->log_scale[qti] = nframes[qti] > 0 - ? od_blog64(scale_sum[qti]) - - od_blog64(nframes[qti]) - OD_Q57(24) - : -rc->log_npixels; - } - /*If we're not using the same frame type as in pass 1, add a scale - estimate for the corresponding frame using the current low-pass - filter value. - This is mostly to ensure we have a valid estimate even when pass 1 had - no frames of this type in the buffer window. - TODO: We could also plan ahead and figure out how many keyframes we'll - be forced to add in the current buffer window.*/ - qti = rc->cur_metrics.frame_type; - if (qti != frame_type) { - int64_t scale; - scale = rc->log_scale[frame_type] < OD_Q57(23) - ? 
od_bexp64(rc->log_scale[frame_type] + OD_Q57(24)) - : 0x7FFFFFFFFFFFLL; - scale *= nframes[frame_type]; - nframes[frame_type]++; - scale += od_bexp64_q24(log_cur_scale >> 33); - rc->log_scale[frame_type] = - od_blog64(scale) - od_blog64(nframes[qti]) - OD_Q57(24); - } else { - log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33; - } - } break; - } - - /*Quantizer selection sticks to the codable, lossy portion of the quantizer - range.*/ - lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth); - lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth); - frame_subtype = frame_type; - /*Stash quantizer modulation by frame type.*/ - mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); - mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); - mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); - mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); - dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I); - dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P); - dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP); - dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP); - /*Is rate control active?*/ - if (rc->target_bitrate <= 0) { - /*Rate control is not active; derive quantizer directly from - quality parameter and frame type. */ - /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit, - and we've not set it yet.*/ - if (rc->quality == 0) { - /*Lossless coding requested.*/ - rc->base_quantizer = 0; - rc->target_quantizer = 0; - } else { - int64_t log_quantizer; - - /* Adjust the modulation constants using the last frame's quantizer. 
*/ - double mqp_delta = (255 - rc->target_quantizer) / 2000.0f; - mqp_i -= mqp_delta; - mqp_p += mqp_delta; - mqp_gp -= mqp_delta; - mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); - mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); - mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); - mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); - - if (rc->quality == -1) { - /*A quality of -1 means quality was unset; use a default.*/ - rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth); - } else { - rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth); - } - - if (rc->periodic_boosts && !is_golden_frame) { - int pattern_rate = (rc->goldenframe_rate >> 1); - int dist_to_golden = rc->cur_frame % pattern_rate; - int dist_away_golden = pattern_rate - dist_to_golden; - int boost = dist_to_golden; - if (dist_away_golden > dist_to_golden) boost = dist_away_golden; - boost -= pattern_rate; - boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV; - rc->base_quantizer = rc->base_quantizer + boost; - } - - /*As originally written, qp modulation is applied to the coded quantizer. - Because we now have and use a more precise target quantizer for various - calculation, that needs to be modulated as well. - Calculate what is, effectively, a fractional coded quantizer. 
*/ - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[frame_subtype]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[frame_subtype]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/ - rc->target_quantizer = od_bexp64(log_quantizer); - } - } else { - int clamp; - int64_t rate_bias; - int64_t rate_total; - int base_quantizer; - int64_t log_quantizer; - int qlo; - int qhi; - int i; - /*We clamp the allowed amount of qi change (after initialization).*/ - clamp = rc->cur_frame > 0; - /*Figure out how to re-distribute bits so that we hit our fullness target - before the last keyframe in our current buffer window (after the current - frame), or the end of the buffer window, whichever comes first.*/ - /*Single pass only right now.*/ - /*If we've been missing our target, add a penalty term.*/ - rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames; - /*rate_total is the total bits available over the next - reservoir_frames frames.*/ - rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias + - reservoir_frames * rc->bits_per_frame; - /*Find a target quantizer that meets our rate target for the specific mix - of frame types we'll have over the next frame_delay frames. 
- We model the rate<->quantizer relationship as: - rate = scale*(quantizer**-exp) - In this case, we have our desired rate, an exponent selected in setup, - and a scale that's been measured over our frame history, so we're - solving for the quantizer. - Exponentiation with arbitrary exponents is expensive, so we work in - the binary log domain (binary exp and log aren't too bad): - rate = e2(log2_scale - log2_quantizer * exp) - There's no easy closed form solution, so we bisection search for it.*/ - /*We do not currently allow rate control to select lossless encoding.*/ - qlo = 1; - /*If there's a quality specified, it's used to select the - coarsest base quantizer we can select. - Otherwise we can use up to and including the coarsest codable - quantizer.*/ - if (rc->quality > 0) - qhi = convert_to_ac_quant(rc->quality, rc->bit_depth); - else - qhi = lossy_quantizer_max; - base_quantizer = (qlo + qhi) >> 1; - while (qlo < qhi) { - volatile int64_t log_base_quantizer; - int64_t diff; - int64_t bits; - /*Count bits contributed by each frame type using the model.*/ - bits = 0; - log_base_quantizer = od_blog64(base_quantizer); - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - /*Modulate base quantizer by frame type.*/ - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[i]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[i]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Clamp modulated quantizer values.*/ - 
log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, - od_blog64(lossy_quantizer_max)); - /* All the fields here are Q57 except for the exponent which is Q6.*/ - bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels - - (log_quantizer >> 6) * rc->exp[i]); - } - diff = bits - rate_total; - if (diff > 0) { - qlo = base_quantizer + 1; - } else if (diff < 0) { - qhi = base_quantizer - 1; - } else { - break; - } - base_quantizer = (qlo + qhi) >> 1; - } - /*If this was not one of the initial frames, limit the change in base - quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's - base quantizer.*/ - if (clamp) { - base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16, - base_quantizer, - (rc->base_quantizer * 0x13333 + 0x8000) >> 16); - } - /*Modulate chosen base quantizer to produce target quantizer.*/ - log_quantizer = od_blog64(base_quantizer); - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer -= OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[frame_subtype]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[frame_subtype]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Clamp modulated quantizer values.*/ - log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, - od_blog64(lossy_quantizer_max)); - /*The above allocation looks only at the total rate we'll accumulate in - the next reservoir_frame_delay frames. 
- However we could overflow the bit reservoir on the very next frame, so - check for that here if we're not using a soft target.*/ - if (rc->cap_overflow) { - int64_t margin; - int64_t soft_limit; - int64_t log_soft_limit; - int64_t log_scale_pixels; - int64_t exp; - int64_t log_qexp; - /*Allow 3% of the buffer for prediction error. - This should be plenty, and we don't mind if we go a bit over; we only - want to keep these bits from being completely wasted.*/ - margin = (rc->reservoir_max + 31) >> 5; - /*We want to use at least this many bits next frame.*/ - soft_limit = rc->reservoir_fullness + rc->bits_per_frame - - (rc->reservoir_max - margin); - log_soft_limit = od_blog64(soft_limit); - /*If we're predicting we won't use that many bits...*/ - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - exp = rc->exp[frame_subtype]; - log_qexp = (log_quantizer >> 6) * exp; - if (log_scale_pixels - log_qexp < log_soft_limit) { - /*Scale the adjustment based on how far into the margin we are.*/ - log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) * - (OD_MINI(margin, soft_limit) << 32) / margin; - log_quantizer = (((log_qexp + (exp >> 1)) / exp) << 6); - } - } - /*We just checked we don't overflow the reservoir next frame, now check - we don't underflow and bust the budget (when not using a soft target). - Disabled when a quality bound is set; if we saturate quantizer to the - maximum possible size when we have a limiting max quality, the - resulting lambda can cause strange behavior.*/ - if (rc->quality == -1) { - int64_t exp; - int64_t log_qexp; - int64_t log_scale_pixels; - int64_t log_hard_limit; - /*Compute the maximum number of bits we can use in the next frame. - Allow 50% of the rate for a single frame for prediction error. 
- This may not be enough for keyframes or sudden changes in - complexity.*/ - log_hard_limit = - od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1)); - /*If we're predicting we'll use more than this...*/ - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - exp = rc->exp[frame_subtype]; - log_qexp = (log_quantizer >> 6) * exp; - if (log_scale_pixels - log_qexp > log_hard_limit) { - /*Force the target to hit our limit exactly.*/ - log_qexp = log_scale_pixels - log_hard_limit; - log_quantizer = (log_qexp + (exp >> 1)) / exp << 6; - /*If that target is unreasonable, oh well; we'll have to drop.*/ - log_quantizer = OD_MAXI(log_quantizer, od_blog64(lossy_quantizer_max)); - } - } - /*Compute a final estimate of the number of bits we plan to use, update - the running rate bias measurement.*/ - { - int64_t log_qexp; - int64_t log_scale_pixels; - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype]; - rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp); - } - rc->target_quantizer = od_bexp64(log_quantizer); - /*The various cappings and adjustments may have altered the log_quantizer - target significantly. - We can either update the base quantizer to be consistent with the - target or let it track separately. - Theora behavior effectively keeps them consistent, as it regenerates - the effective base quantizer from the target each frame rather than - saving both. - For Daala, it's easier to allow them to track separately. 
- For now, allow them to track separately and see how it behaves.*/ - rc->base_quantizer = base_quantizer; - } - *bottom_idx = lossy_quantizer_min; - *top_idx = lossy_quantizer_max; - rc->target_quantizer = av1_qindex_from_ac( - OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max), - rc->bit_depth); - return rc->target_quantizer; -} - -int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, - int is_altref_frame, int frame_type, int droppable) { - int dropped; - dropped = 0; - /*Update rate control only if rate control is active.*/ - if (rc->target_bitrate > 0) { - int64_t log_scale; - int frame_subtype; - frame_subtype = frame_type; - /*Track non-golden and golden P frame drops separately.*/ - if (is_golden_frame && frame_type == OD_P_FRAME) - frame_subtype = OD_GOLDEN_P_FRAME; - else if (is_altref_frame && frame_type == OD_P_FRAME) - frame_subtype = OD_ALTREF_P_FRAME; - if (bits <= 0) { - /*We didn't code any blocks in this frame.*/ - log_scale = OD_Q57(-64); - bits = 0; - ++rc->prev_drop_count[frame_subtype]; - } else { - int64_t log_bits; - int64_t log_qexp; - /*Compute the estimated scale factor for this frame type.*/ - log_bits = od_blog64(bits); - log_qexp = od_blog64(rc->target_quantizer); - log_qexp = (log_qexp >> 6) * (rc->exp[frame_type]); - log_scale = OD_MINI(log_bits - rc->log_npixels + log_qexp, OD_Q57(16)); - } - - switch (rc->twopass_state) { - case 1: { - int golden, altref; - int64_t ipc; - rc->cur_metrics.frame_type = - od_frame_type(rc, rc->cur_frame, &golden, &altref, &ipc); - /*Pass 1 mode: save the metrics for this frame.*/ - rc->cur_metrics.log_scale = od_q57_to_q24(log_scale); - } break; - case 2: { - /*Pass 2 mode:*/ - int m_frame_type = rc->cur_metrics.frame_type; - rc->nframes[m_frame_type]--; - rc->scale_sum[m_frame_type] -= od_bexp64_q24(rc->cur_metrics.log_scale); - } break; - } - - if (bits > 0) { - od_iir_bessel2 *f; - /*If this is the first example of the given frame type we've - seen, 
we immediately replace the default scale factor guess - with the estimate we just computed using the first frame.*/ - if (rc->frame_count[frame_type] == 0) { - f = rc->scalefilter + frame_type; - f->y[1] = f->y[0] = f->x[1] = f->x[0] = od_q57_to_q24(log_scale); - rc->log_scale[frame_type] = log_scale; - } else { - /*Lengthen the time constant for the inter filters as we collect more - frame statistics, until we reach our target.*/ - if (frame_type != OD_I_FRAME && - rc->inter_p_delay < rc->inter_delay_target && - rc->frame_count[frame_type] >= rc->inter_p_delay) { - od_iir_bessel2_reinit(&rc->scalefilter[frame_type], - ++rc->inter_p_delay); - } - /*Update the low-pass scale filter for this frame type - regardless of whether or not we drop this frame.*/ - rc->log_scale[frame_type] = - od_iir_bessel2_update(rc->scalefilter + frame_type, - od_q57_to_q24(log_scale)) - << 33; - } - /*If this frame busts our budget, it must be dropped.*/ - if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) { - ++rc->prev_drop_count[frame_subtype]; - bits = 0; - dropped = 1; - } else { - uint32_t drop_count; - /*Update a low-pass filter to estimate the "real" frame rate taking - drops into account. - This is only done if the frame is coded, as it needs the final - count of dropped frames.*/ - drop_count = rc->prev_drop_count[frame_subtype] + 1; - if (drop_count > 0x7F) { - drop_count = 0x7FFFFFFF; - } else { - drop_count <<= 24; - } - rc->log_drop_scale[frame_subtype] = - od_blog64(od_iir_bessel2_update(rc->vfrfilter + frame_subtype, - drop_count)) - - OD_Q57(24); - /*Zero the drop count for this frame. 
- It will be increased if we drop frames.*/ - rc->prev_drop_count[frame_subtype] = 0; - } - /*Increment the frame count for filter adaptation purposes.*/ - if (!rc->twopass_state) rc->frame_count[frame_type]++; - } - rc->reservoir_fullness += rc->bits_per_frame - bits; - /*If we're too quick filling the buffer and overflow is capped, - that rate is lost forever.*/ - if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) { - rc->reservoir_fullness = rc->reservoir_max; - } - /*If we're too quick draining the buffer and underflow is capped, - don't try to make up that rate later.*/ - if (rc->cap_underflow && rc->reservoir_fullness < 0) { - rc->reservoir_fullness = 0; - } - /*Adjust the bias for the real bits we've used.*/ - rc->rate_bias -= bits; - } - return dropped; -} - -static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) { - while (bytes-- > 0) { - rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF); - val >>= 8; - } -} - -static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) { - int64_t ret = 0; - int shift = 0; - while (bytes-- > 0) { - ret |= ((int64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift; - shift += 8; - } - return ret; -} - -int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, - int summary) { - int i; - struct aom_codec_cx_pkt pkt; - rc->twopass_buffer = rc->firstpass_buffer; - rc->twopass_buffer_bytes = 0; - if (!rc->twopass_state) { - rc->twopass_state = 1; - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - rc->frame_count[i] = 0; - rc->exp[i] = 0; - rc->scale_sum[i] = 0; - } - } - if (summary) { - od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4); - od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1); - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - od_rc_buffer_val(rc, rc->frame_count[i], 4); - od_rc_buffer_val(rc, rc->exp[i], 4); - od_rc_buffer_val(rc, rc->scale_sum[i], 8); - } - } else { - int frame_type = rc->cur_metrics.frame_type; - 
rc->scale_sum[frame_type] += od_bexp64_q24(rc->cur_metrics.log_scale); - rc->frame_count[frame_type]++; - od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1); - od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4); - } - pkt.data.twopass_stats.buf = rc->firstpass_buffer; - pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes; - pkt.kind = AOM_CODEC_STATS_PKT; - aom_codec_pkt_list_add(pkt_list, &pkt); - return 0; -} - -int od_enc_rc_2pass_in(od_rc_state *rc) { - /* Enable pass 2 mode if this is the first call. */ - if (rc->twopass_state == 0) { - uint32_t i, total_frames = 0; - - if (!rc->twopass_allframes_buf || - rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN) - return -1; - - /* Find summary packet at the end */ - rc->twopass_buffer = rc->twopass_allframes_buf; - rc->twopass_buffer += - rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ; - rc->twopass_buffer_bytes = 0; - - if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1; - if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1; - - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - rc->frame_count[i] = od_rc_unbuffer_val(rc, 4); - rc->exp[i] = od_rc_unbuffer_val(rc, 4); - rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8); - rc->nframes[i] = rc->frame_count[i]; - total_frames += rc->frame_count[i]; - } - - if (total_frames < 1) return -1; - - if (total_frames * OD_RC_2PASS_PACKET_SZ > rc->twopass_allframes_buf_size) - return -1; - - od_enc_rc_reset(rc); - - /* Everything looks ok */ - rc->twopass_buffer = rc->twopass_allframes_buf; - rc->twopass_state = 2; - rc->twopass_buffer_bytes = 0; - } - - rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1); - rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4); - - return 0; -} diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h index a4a9052fa..e69de29bb 100644 --- a/third_party/aom/av1/encoder/ratectrl_xiph.h +++ b/third_party/aom/av1/encoder/ratectrl_xiph.h @@ -1,200 +0,0 @@ -/* - * Copyright (c) 
2001-2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#if !defined(_ratectrl_xiph_H) -#define _ratectrl_xiph_H (1) - -#include "av1/encoder/ratectrl.h" -#include "aom/internal/aom_codec_internal.h" - -/*Frame types.*/ -#define OD_I_FRAME (0) -#define OD_P_FRAME (1) -#define OD_GOLDEN_P_FRAME (2) -#define OD_ALTREF_P_FRAME (3) - -#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1) - -/* Periodic boost (in between golden frames) strength - lower is more */ -#define OD_PERIODIC_BOOST_DIV (10) - -/* Constants for frame QP modulation <- tweak these - * Adjusts how the rate control system decides the quantizers per frame - * (sub)type */ -#define OD_MQP_I (0.98) -#define OD_MQP_P (1.06) -#define OD_MQP_GP (0.99) -#define OD_MQP_AP (0.92) -#define OD_DQP_I (-2) -#define OD_DQP_P (0) -#define OD_DQP_GP (-2) -#define OD_DQP_AP (-2) - -/*Fractional_coded_quantizer ~= - log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/ -/*Base/scale factor for linear quantizer to fractional coded quantizer - conversion (6.307 * 2^12) */ -#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB) -/*Inverse of above scale factor.*/ -#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289) -/*Offset for linear quantizer to fractional coded quantizer - conversion (6.235 * 2^45) */ -#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL) - -#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */ -#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES) -#define OD_RC_2PASS_PACKET_SZ (1 + 4) -#define OD_RC_2PASS_MIN 
(OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ) -#define OD_RC_2PASS_VERSION (1) - -/*A 2nd order low-pass Bessel follower. - We use this for rate control because it has fast reaction time, but is - critically damped.*/ -typedef struct od_iir_bessel2 { - int32_t c[2]; - int64_t g; - int32_t x[2]; - int32_t y[2]; -} od_iir_bessel2; - -/* The 2-pass metrics associated with a single frame. */ -typedef struct od_frame_metrics { - /*The log base 2 of the scale factor for this frame in Q24 format.*/ - int64_t log_scale; - /*The frame type from pass 1.*/ - unsigned frame_type : 1; -} od_frame_metrics; - -/*Rate control setup and working state information.*/ -typedef struct od_rc_state { - /* Image format */ - int frame_width; - int frame_height; - int bit_depth; - - /* Framerate */ - double framerate; - /* Keyframe rate */ - int keyframe_rate; - /* Golden frame period */ - int goldenframe_rate; - /* Altref frame period */ - int altref_rate; - /*The target bit-rate in bits per second.*/ - int64_t target_bitrate; - /* Quality level for non-bitrate-targeting */ - int quality; - /* Copied from oxcf->frame_periodic_boost */ - int periodic_boosts; - /* Max Q */ - int maxq; - /* Min Q */ - int minq; - /* Quantizer to use for the first pass */ - int firstpass_quant; - - /* 2-pass metrics */ - od_frame_metrics cur_metrics; - - /* 2-pass state */ - int64_t scale_sum[OD_FRAME_NSUBTYPES]; - int nframes[OD_FRAME_NSUBTYPES]; - - /* 2-pass bytestream reader/writer context */ - uint8_t *twopass_buffer; - int twopass_buffer_bytes; - - /* Pass 1 stats packet storage */ - uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ]; - - /* Every state packet from the first pass in a single buffer */ - uint8_t *twopass_allframes_buf; - size_t twopass_allframes_buf_size; - - /* Actual returned quantizer */ - int target_quantizer; - /*The full-precision, unmodulated quantizer upon which - our modulated quantizers are based.*/ - int base_quantizer; - - /* Increments by 1 for each frame. 
*/ - int64_t cur_frame; - - /* End of input flag */ - int end_of_input; - /* Closed GOP flag */ - int closed_gop; - /*The number of frames over which to distribute the reservoir usage.*/ - int reservoir_frame_delay; - /*Will we drop frames to meet bitrate target?*/ - unsigned char drop_frames; - /*Do we respect the maximum reservoir fullness?*/ - unsigned char cap_overflow; - /*Can the reservoir go negative?*/ - unsigned char cap_underflow; - /*Two-pass mode state. - 0 => 1-pass encoding. - 1 => 1st pass of 2-pass encoding. - 2 => 2nd pass of 2-pass encoding.*/ - int twopass_state; - /*The log of the number of pixels in a frame in Q57 format.*/ - int64_t log_npixels; - /*The target average bits per frame.*/ - int64_t bits_per_frame; - /*The current bit reservoir fullness (bits available to be used).*/ - int64_t reservoir_fullness; - /*The target buffer fullness. - This is where we'd like to be by the last keyframe the appears in the next - buf_delay frames.*/ - int64_t reservoir_target; - /*The maximum buffer fullness (total size of the buffer).*/ - int64_t reservoir_max; - /*The log of estimated scale factor for the rate model in Q57 format.*/ - int64_t log_scale[OD_FRAME_NSUBTYPES]; - /*The exponent used in the rate model in Q8 format.*/ - unsigned exp[OD_FRAME_NSUBTYPES]; - /*The log of an estimated scale factor used to obtain the real framerate, for - VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/ - int64_t log_drop_scale[OD_FRAME_NSUBTYPES]; - /*The total drop count from the previous frame.*/ - uint32_t prev_drop_count[OD_FRAME_NSUBTYPES]; - /*Second-order lowpass filters to track scale and VFR/drops.*/ - od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES]; - od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES]; - int frame_count[OD_FRAME_NSUBTYPES]; - int inter_p_delay; - int inter_delay_target; - /*The total accumulated estimation bias.*/ - int64_t rate_bias; -} od_rc_state; - -int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms); - -int 
od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, - int is_golden_frame, - int is_altref_frame, int frame_type, - int *bottom_idx, int *top_idx); - -/* Returns 1 if the frame should be dropped */ -int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, - int is_altref_frame, int frame_type, int droppable); - -int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, - int *is_altref, int64_t *ip_count); - -int od_enc_rc_resize(od_rc_state *rc); - -int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, - int summary); - -int od_enc_rc_2pass_in(od_rc_state *rc); - -#endif diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index 5dd485334..17f23e5ec 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -13,7 +13,7 @@ #include #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -36,9 +36,7 @@ #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/mcomp.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" @@ -54,114 +52,96 @@ // This table is used to correct for block size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2, 2, 2, -#endif - 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, -#if CONFIG_EXT_PARTITION - 48, 48, 64, -#endif // CONFIG_EXT_PARTITION - 4, 4, 8, 8, 16, 16, -#if CONFIG_EXT_PARTITION - 32, 32 -#endif // CONFIG_EXT_PARTITION + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16 }; -#if CONFIG_EXT_TX static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - { 1, 1, 1, 1, 1 }, // unused - { 0, 1, 1, 0, 0 }, - { 0, 0, 0, 1, 0 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#else // CONFIG_CHROMA_2X2 { 1, 1, 1, 1 }, // unused { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#endif // CONFIG_CHROMA_2X2 }; static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - { 1, 1, 1, 1, 1 }, // unused - { 0, 1, 1, 0, 0 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 0, 1 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#else // CONFIG_CHROMA_2X2 { 1, 1, 1, 1 }, // unused - { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, -#if CONFIG_MRC_TX + { 1, 1, 0, 0 }, + { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#endif // CONFIG_CHROMA_2X2 }; -#endif // CONFIG_EXT_TX + +static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA, + EXT_TX_SETS_INTER)] = { + { + // Intra + EXT_TX_SET_DCTONLY, + EXT_TX_SET_DTT4_IDTX_1DDCT, + EXT_TX_SET_DTT4_IDTX, + }, + { + // Inter + EXT_TX_SET_DCTONLY, + EXT_TX_SET_ALL16, + EXT_TX_SET_DTT9_IDTX_1DDCT, + EXT_TX_SET_DCT_IDTX, + }, +}; void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, FRAME_CONTEXT *fc) { int i, j; - if (cm->frame_type == KEY_FRAME) { - for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) - av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], + for (i = 0; i < PARTITION_CONTEXTS; ++i) + 
av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL); + + if (cm->skip_mode_flag) { + for (i = 0; i < SKIP_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i], NULL); -#if CONFIG_UNPOISON_PARTITION_CTX - for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { - aom_prob p = fc->partition_prob[i][PARTITION_VERT]; - assert(p > 0); - x->partition_cost[i][PARTITION_NONE] = INT_MAX; - x->partition_cost[i][PARTITION_HORZ] = INT_MAX; - x->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0); - x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); - } - for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { - aom_prob p = fc->partition_prob[i][PARTITION_HORZ]; - assert(p > 0); - x->partition_cost[i][PARTITION_NONE] = INT_MAX; - x->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0); - x->partition_cost[i][PARTITION_VERT] = INT_MAX; - x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); } - x->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0; -#endif // CONFIG_UNPOISON_PARTITION_CTX } -#if CONFIG_KF_CTX + for (i = 0; i < SKIP_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL); + } + for (i = 0; i < KF_MODE_CONTEXTS; ++i) for (j = 0; j < KF_MODE_CONTEXTS; ++j) av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL); -#else - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) - av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL); -#endif for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL); - for (i = 0; i < INTRA_MODES; ++i) - av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i], fc->uv_mode_cdf[i], - NULL); + for (i = 
0; i < CFL_ALLOWED_TYPES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j], + fc->uv_mode_cdf[i][j], NULL); + + av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf, + NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_filter_intra_allowed_bsize(cm, i)) + av1_cost_tokens_from_cdf(x->filter_intra_cost[i], + fc->filter_intra_cdfs[i], NULL); + } for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) av1_cost_tokens_from_cdf(x->switchable_interp_costs[i], fc->switchable_interp_cdf[i], NULL); - for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) { + for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) { av1_cost_tokens_from_cdf(x->palette_y_size_cost[i], fc->palette_y_size_cdf[i], NULL); av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i], fc->palette_uv_size_cdf[i], NULL); + for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); } for (i = 0; i < PALETTE_SIZES; ++i) { @@ -172,60 +152,38 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, fc->palette_uv_color_index_cdf[i][j], NULL); } } -#if CONFIG_MRC_TX - for (i = 0; i < PALETTE_SIZES; ++i) { - for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { - av1_cost_tokens_from_cdf(x->mrc_mask_inter_cost[i][j], - fc->mrc_mask_inter_cdf[i][j], NULL); - av1_cost_tokens_from_cdf(x->mrc_mask_intra_cost[i][j], - fc->mrc_mask_intra_cdf[i][j], NULL); - } - } -#endif // CONFIG_MRC_TX -#if CONFIG_CFL int sign_cost[CFL_JOINT_SIGNS]; av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { - const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; - const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; int *cost_u = 
x->cfl_cost[joint_sign][CFL_PRED_U]; int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V]; - if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); - else + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); - if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); - else + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } for (int u = 0; u < CFL_ALPHABET_SIZE; u++) cost_u[u] += sign_cost[joint_sign]; } -#endif // CONFIG_CFL - for (i = 0; i < MAX_TX_DEPTH; ++i) + for (i = 0; i < MAX_TX_CATS; ++i) for (j = 0; j < TX_SIZE_CONTEXTS; ++j) av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j], NULL); -#if CONFIG_EXT_TX -#if CONFIG_LGT_FROM_PRED - if (LGT_FROM_PRED_INTRA) { - for (i = 0; i < LGT_SIZES; ++i) { - for (j = 0; j < INTRA_MODES; ++j) { - x->intra_lgt_cost[i][j][0] = av1_cost_bit(fc->intra_lgt_prob[i][j], 0); - x->intra_lgt_cost[i][j][1] = av1_cost_bit(fc->intra_lgt_prob[i][j], 1); - } - } - } - if (LGT_FROM_PRED_INTER) { - for (i = 0; i < LGT_SIZES; ++i) { - x->inter_lgt_cost[i][0] = av1_cost_bit(fc->inter_lgt_prob[i], 0); - x->inter_lgt_cost[i][1] = av1_cost_bit(fc->inter_lgt_prob[i], 1); - } + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); } -#endif // CONFIG_LGT_FROM_PRED + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { int s; for (s = 1; s < EXT_TX_SETS_INTER; ++s) { @@ -245,125 +203,124 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, } } } -#else - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - for (j = 0; j < TX_TYPES; ++j) - av1_cost_tokens_from_cdf(x->intra_tx_type_costs[i][j], - 
fc->intra_ext_tx_cdf[i][j], av1_ext_tx_inv); - } - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - av1_cost_tokens_from_cdf(x->inter_tx_type_costs[i], fc->inter_ext_tx_cdf[i], - av1_ext_tx_inv); - } -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - for (i = 0; i < INTRA_FILTERS + 1; ++i) - av1_cost_tokens_from_cdf(x->intra_filter_cost[i], fc->intra_filter_cdf[i], + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i], NULL); -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_LOOP_RESTORATION - av1_cost_tokens(x->switchable_restore_cost, fc->switchable_restore_prob, - av1_switchable_restore_tree); -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_INTRABC + } + av1_cost_tokens_from_cdf(x->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf, + NULL); av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL); -#endif // CONFIG_INTRABC if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i], + NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j], + NULL); + } + } + + for (i = 0; i < 
REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i], + NULL); + } + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL); -#else - x->newmv_mode_cost[i][0] = av1_cost_bit(fc->newmv_prob[i], 0); - x->newmv_mode_cost[i][1] = av1_cost_bit(fc->newmv_prob[i], 1); -#endif } - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL); -#else - x->zeromv_mode_cost[i][0] = av1_cost_bit(fc->zeromv_prob[i], 0); - x->zeromv_mode_cost[i][1] = av1_cost_bit(fc->zeromv_prob[i], 1); -#endif } for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL); -#else - x->refmv_mode_cost[i][0] = av1_cost_bit(fc->refmv_prob[i], 0); - x->refmv_mode_cost[i][1] = av1_cost_bit(fc->refmv_prob[i], 1); -#endif } for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL); -#else - x->drl_mode_cost0[i][0] = av1_cost_bit(fc->drl_prob[i], 0); - x->drl_mode_cost0[i][1] = av1_cost_bit(fc->drl_prob[i], 1); -#endif } for (i = 0; i < INTER_MODE_CONTEXTS; ++i) av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i], fc->inter_compound_mode_cdf[i], NULL); -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT for (i = 0; i < BLOCK_SIZES_ALL; ++i) av1_cost_tokens_from_cdf(x->compound_type_cost[i], fc->compound_type_cdf[i], NULL); -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - 
av1_cost_tokens_from_cdf(x->inter_singleref_comp_mode_cost[i], - fc->inter_singleref_comp_mode_cdf[i], NULL); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (get_interinter_wedge_bits(i)) { + av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i], + NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i], + NULL); av1_cost_tokens_from_cdf(x->interintra_mode_cost[i], fc->interintra_mode_cdf[i], NULL); -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i], NULL); } -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { -#if CONFIG_NCOBMC_ADAPT_WEIGHT - av1_cost_tokens_from_cdf(x->motion_mode_cost2[i], fc->ncobmc_cdf[i], - NULL); -#endif -#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL); -#else - x->motion_mode_cost1[i][0] = av1_cost_bit(fc->obmc_prob[i], 0); - x->motion_mode_cost1[i][1] = av1_cost_bit(fc->obmc_prob[i], 1); -#endif } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT - for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) { - av1_cost_tokens_from_cdf(x->ncobmc_mode_cost[i], fc->ncobmc_mode_cdf[i], + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i], NULL); } -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + 
av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } } } // Values are now correlated to quantizer. static int sad_per_bit16lut_8[QINDEX_RANGE]; static int sad_per_bit4lut_8[QINDEX_RANGE]; - -#if CONFIG_HIGHBITDEPTH static int sad_per_bit16lut_10[QINDEX_RANGE]; static int sad_per_bit4lut_10[QINDEX_RANGE]; static int sad_per_bit16lut_12[QINDEX_RANGE]; static int sad_per_bit4lut_12[QINDEX_RANGE]; -#endif static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, aom_bit_depth_t bit_depth) { @@ -381,31 +338,26 @@ static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, void av1_init_me_luts(void) { init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, AOM_BITS_8); -#if CONFIG_HIGHBITDEPTH init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE, AOM_BITS_10); init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE, AOM_BITS_12); -#endif } static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, 128, 144, -#if CONFIG_EXT_REFS // TODO(zoeliu): To adjust further following factor values. 
128, 128, 128, // TODO(weitinglin): We should investigate if the values should be the same // as the value used by OVERLAY frame 144, // INTNL_OVERLAY_UPDATE 128 // INTNL_ARF_UPDATE -#endif // CONFIG_EXT_REFS }; int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { - const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth); -#if CONFIG_HIGHBITDEPTH + const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth); int64_t rdmult = 0; switch (cpi->common.bit_depth) { case AOM_BITS_8: rdmult = 88 * q * q / 24; break; @@ -415,9 +367,6 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - int64_t rdmult = 88 * q * q / 24; -#endif // CONFIG_HIGHBITDEPTH if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -432,25 +381,19 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { double q; -#if CONFIG_HIGHBITDEPTH switch (bit_depth) { - case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break; - case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break; - case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break; + case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break; + case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; -#endif // CONFIG_HIGHBITDEPTH // TODO(debargha): Adjust the function below. 
return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); } void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { -#if CONFIG_HIGHBITDEPTH switch (cpi->common.bit_depth) { case AOM_BITS_8: x->sadperbit16 = sad_per_bit16lut_8[qindex]; @@ -467,11 +410,6 @@ void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); } -#else - (void)cpi; - x->sadperbit16 = sad_per_bit16lut_8[qindex]; - x->sadperbit4 = sad_per_bit4lut_8[qindex]; -#endif // CONFIG_HIGHBITDEPTH } static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { @@ -490,195 +428,89 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { const int t = q * rd_thresh_block_size_factor[bsize]; const int thresh_max = INT_MAX / t; -#if CONFIG_CB4X4 for (i = 0; i < MAX_MODES; ++i) rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max ? rd->thresh_mult[i] * t / 4 : INT_MAX; -#else - if (bsize >= BLOCK_8X8) { - for (i = 0; i < MAX_MODES; ++i) - rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max - ? rd->thresh_mult[i] * t / 4 - : INT_MAX; - } else { - for (i = 0; i < MAX_REFS; ++i) - rd->threshes[segment_id][bsize][i] = - rd->thresh_mult_sub8x8[i] < thresh_max - ? 
rd->thresh_mult_sub8x8[i] * t / 4 - : INT_MAX; - } -#endif } } } -void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, - int ref_mv_idx) { - MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; - int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx); - (void)ref_frame; - x->mvcost = x->mv_cost_stack[nmv_ctx]; - x->nmvjointcost = x->nmv_vec_cost[nmv_ctx]; +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx) { + (void)ref; + (void)ref_mv_idx; + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; } -#if CONFIG_LV_MAP -#if !LV_MAP_PROB -static void get_rate_cost(aom_prob p, int cost[2]) { - cost[0] = av1_cost_bit(p, 0); - cost[1] = av1_cost_bit(p, 1); -} -#endif // !LV_MAP_PROB - -void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc) { +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (int plane = 0; plane < PLANE_TYPES; ++plane) { + for (int plane = 0; plane < nplanes; ++plane) { LV_MAP_COEFF_COST *pcost = 
&x->coeff_costs[tx_size][plane]; -#if LV_MAP_PROB for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], fc->txb_skip_cdf[tx_size][ctx], NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->nz_map_cost[ctx], - fc->nz_map_cdf[tx_size][plane][ctx], NULL); + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], - fc->eob_flag_cdf[tx_size][plane][ctx], NULL); + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], fc->dc_sign_cdf[plane][ctx], NULL); - for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer) - for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->base_cost[layer][ctx], - fc->coeff_base_cdf[tx_size][plane][layer][ctx], NULL); - -#if BR_NODE - for (int br = 0; br < BASE_RANGE_SETS; ++br) - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->br_cost[br][ctx], - fc->coeff_br_cdf[tx_size][plane][br][ctx], - NULL); - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - int lps_rate[2]; - av1_cost_tokens_from_cdf(lps_rate, - fc->coeff_lps_cdf[tx_size][plane][ctx], NULL); - - for (int base_range = 0; base_range < COEFF_BASE_RANGE + 1; - ++base_range) { - int br_set_idx = base_range < COEFF_BASE_RANGE - ? 
coeff_to_br_index[base_range] - : BASE_RANGE_SETS; - - pcost->lps_cost[ctx][base_range] = 0; - - for (int idx = 0; idx < BASE_RANGE_SETS; ++idx) { - if (idx == br_set_idx) { - pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][1]; - - int br_base = br_index_to_coeff[br_set_idx]; - int br_offset = base_range - br_base; - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (tok == br_offset) { - pcost->lps_cost[ctx][base_range] += lps_rate[1]; - break; - } else { - pcost->lps_cost[ctx][base_range] += lps_rate[0]; - } - } - break; - } else { - pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][0]; - } - } - // load the base range cost - } - } -#else // BR_NODE - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->lps_cost[ctx], - fc->coeff_lps_cdf[tx_size][plane][ctx], NULL); -#endif // BR_NODE -#if CONFIG_CTX1D - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - av1_cost_tokens_from_cdf(pcost->eob_mode_cost[tx_class], - fc->eob_mode_cdf[tx_size][plane][tx_class], + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx], NULL); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->empty_line_cost[tx_class][ctx], - fc->empty_line_cdf[tx_size][plane][tx_class][ctx], NULL); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->hv_eob_cost[tx_class][ctx], - fc->hv_eob_cdf[tx_size][plane][tx_class][ctx], NULL); -#endif // CONFIG_CTX1D -#else // LV_MAP_PROB - for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) - get_rate_cost(fc->txb_skip[tx_size][ctx], pcost->txb_skip_cost[ctx]); - - for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) - get_rate_cost(fc->nz_map[tx_size][plane][ctx], 
pcost->nz_map_cost[ctx]); - - for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) - get_rate_cost(fc->eob_flag[tx_size][plane][ctx], pcost->eob_cost[ctx]); - - for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - get_rate_cost(fc->dc_sign[plane][ctx], pcost->dc_sign_cost[ctx]); - - for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer) - for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - get_rate_cost(fc->coeff_base[tx_size][plane][layer][ctx], - pcost->base_cost[layer][ctx]); - - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - get_rate_cost(fc->coeff_lps[tx_size][plane][ctx], pcost->lps_cost[ctx]); - -#if CONFIG_CTX1D - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - get_rate_cost(fc->eob_mode[tx_size][plane][tx_class], - pcost->eob_mode_cost[tx_class]); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) - get_rate_cost(fc->empty_line[tx_size][plane][tx_class][ctx], - pcost->empty_line_cost[tx_class][ctx]); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) - get_rate_cost(fc->hv_eob[tx_size][plane][tx_class][ctx], - pcost->hv_eob_cost[tx_class][ctx]); -#endif // CONFIG_CTX1D -#endif // LV_MAP_PROB - } - } -} -#endif // CONFIG_LV_MAP - -void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost, - coeff_cdf_model (*cdf)[PLANE_TYPES]) { - for (int tx = 0; tx < TX_SIZES; ++tx) { - for (int pt = 0; pt < PLANE_TYPES; ++pt) { - for (int rt = 0; rt < REF_TYPES; ++rt) { - for (int band = 0; band < COEF_BANDS; ++band) { - for (int ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) { - av1_cost_tokens_from_cdf(cost[tx][pt][rt][band][ctx], - cdf[tx][pt][rt][band][ctx], NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; 
} + prev_cost += br_rate[j]; } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); } } } @@ -688,7 +520,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; RD_OPT *const rd = &cpi->rd; - int nmv_ctx; aom_clear_system_state(); @@ -698,56 +529,35 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { set_block_thresholds(cm, rd); - for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level) { - av1_build_nmv_cost_table(x->nmv_vec_cost[nmv_ctx], x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], MV_SUBPEL_NONE); - } else { - av1_build_nmv_cost_table( - x->nmv_vec_cost[nmv_ctx], - cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx] - : x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv); - } - -#else + if (cm->cur_frame_force_integer_mv) { + av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc, + MV_SUBPEL_NONE); + } else { av1_build_nmv_cost_table( - x->nmv_vec_cost[nmv_ctx], - cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx] - : x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv); -#endif + x->nmv_vec_cost, + cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc, + cm->allow_high_precision_mv); } - x->mvcost = x->mv_cost_stack[0]; - x->nmvjointcost = x->nmv_vec_cost[0]; -#if CONFIG_INTRABC + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; + if (frame_is_intra_only(cm) && cm->allow_screen_content_tools && cpi->oxcf.pass != 1) { - av1_build_nmv_cost_table( - x->nmv_vec_cost[0], - cm->allow_high_precision_mv ? 
x->nmvcost_hp[0] : x->nmvcost[0], - &cm->fc->ndvc, MV_SUBPEL_NONE); + int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] }; + av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc, + MV_SUBPEL_NONE); } -#endif -#if CONFIG_GLOBAL_MOTION if (cpi->oxcf.pass != 1) { for (int i = 0; i < TRANS_TYPES; ++i) -#if GLOBAL_TRANS_TYPES > 4 - cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0)) - << AV1_PROB_COST_SHIFT; -#else // IDENTITY: 1 bit // TRANSLATION: 3 bits // ROTZOOM: 2 bits // AFFINE: 3 bits cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0)) << AV1_PROB_COST_SHIFT; -#endif // GLOBAL_TRANS_TYPES > 4 } -#endif // CONFIG_GLOBAL_MOTION } static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { @@ -840,288 +650,32 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, } } -static void get_entropy_contexts_plane( - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0]; const ENTROPY_CONTEXT *const above = pd->above_context; const ENTROPY_CONTEXT *const left = pd->left_context; -#if CONFIG_LV_MAP memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - return; -#endif // CONFIG_LV_MAP - - int i; - -#if CONFIG_CHROMA_2X2 - switch (tx_size) { - case TX_2X2: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_4X4: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 
0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; -#if CONFIG_TX64X64 - case TX_32X64: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 32) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] | - *(const uint64_t *)&left[i + 16] | - *(const uint64_t *)&left[i + 24]); - break; - case TX_64X32: - for (i = 0; i < num_4x4_w; i += 32) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] | - *(const uint64_t *)&above[i + 16] | - *(const uint64_t *)&above[i + 24]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_64X64: - for (i = 0; i < num_4x4_w; i += 32) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] | - *(const uint64_t *)&above[i + 16] | - *(const uint64_t *)&above[i + 24]); - for (i = 0; i < num_4x4_h; i += 32) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] | - *(const uint64_t *)&left[i + 16] | - *(const uint64_t *)&left[i + 24]); - break; -#endif // CONFIG_TX64X64 - case TX_4X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i 
= 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_8X4: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_16X8: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_32X16: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_16X4: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X32: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_32X8: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t 
*)&left[i]; - break; -#endif - - default: assert(0 && "Invalid transform size."); break; - } - return; -#endif // CONFIG_CHROMA_2X2 - - switch (tx_size) { - case TX_4X4: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; -#if CONFIG_TX64X64 - case TX_32X64: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_64X32: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_64X64: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; -#endif // CONFIG_TX64X64 - case TX_4X8: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X4: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - 
case TX_8X16: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X8: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X32: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X16: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X4: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X32: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X8: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; -#endif - default: assert(0 && "Invalid transform size."); break; - } } -void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, +void av1_get_entropy_contexts(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { -#if CONFIG_CHROMA_SUB8X8 + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { const BLOCK_SIZE plane_bsize = 
- AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif - get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left); + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); } void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { int i; int zero_seen = 0; - int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; int max_mv = 0; @@ -1129,11 +683,15 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *ref_y_ptr; MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; int num_mv_refs = 0; - - pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; - if (x->mbmi_ext->ref_mvs[ref_frame][0].as_int != - x->mbmi_ext->ref_mvs[ref_frame][1].as_int) { - pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext); + + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; } if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size) pred_mv[num_mv_refs++] = x->pred_mv[ref_frame]; @@ -1158,12 +716,10 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, // Note if it is the best so far. if (this_sad < best_sad) { best_sad = this_sad; - best_index = i; } } // Note the index of the mv that worked best in the reference list. 
- x->mv_best_ref_index[ref_frame] = best_index; x->max_mv_context[ref_frame] = max_mv; x->pred_mv_sad[ref_frame] = best_sad; } @@ -1172,7 +728,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *scale, - const struct scale_factors *scale_uv) { + const struct scale_factors *scale_uv, + const int num_planes) { int i; dst[0].buf = src->y_buffer; @@ -1181,8 +738,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; - for (i = 0; i < MAX_MB_PLANE; ++i) { - setup_pred_plane(dst + i, xd->mi[0]->mbmi.sb_type, dst[i].buf, + for (i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf, i ? src->uv_crop_width : src->y_crop_width, i ? src->uv_crop_height : src->y_crop_height, dst[i].stride, mi_row, mi_col, i ? scale_uv : scale, @@ -1192,7 +749,7 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, int stride) { - const int bw = b_width_log2_lookup[plane_bsize]; + const int bw = mi_size_wide_log2[plane_bsize]; const int y = 4 * (raster_block >> bw); const int x = 4 * (raster_block & ((1 << bw) - 1)); return y * stride + x; @@ -1214,43 +771,24 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, : NULL; } -#if CONFIG_DUAL_FILTER int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, const MACROBLOCKD *xd) { if (cm->interp_filter == SWITCHABLE) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; int inter_filter_cost = 0; int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - const InterpFilter filter = - 
av1_extract_interp_filter(mbmi->interp_filters, dir); - inter_filter_cost += x->switchable_interp_costs[ctx][filter]; - } + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->switchable_interp_costs[ctx][filter]; } return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; } else { return 0; } } -#else -int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, - const MACROBLOCKD *xd) { - if (cm->interp_filter == SWITCHABLE) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_pred_context_switchable_interp(xd); - const InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, 0); - return SWITCHABLE_INTERP_RATE_FACTOR * - x->switchable_interp_costs[ctx][filter]; - } - return 0; -} -#endif void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { int i; @@ -1262,22 +800,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { if (sf->adaptive_rd_thresh) { rd->thresh_mult[THR_NEARESTMV] = 300; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTL2] = 300; rd->thresh_mult[THR_NEARESTL3] = 300; rd->thresh_mult[THR_NEARESTB] = 300; rd->thresh_mult[THR_NEARESTA2] = 300; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTA] = 300; rd->thresh_mult[THR_NEARESTG] = 300; } else { rd->thresh_mult[THR_NEARESTMV] = 0; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTL2] = 0; rd->thresh_mult[THR_NEARESTL3] = 0; rd->thresh_mult[THR_NEARESTB] = 0; rd->thresh_mult[THR_NEARESTA2] = 0; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTA] = 0; rd->thresh_mult[THR_NEARESTG] = 0; } @@ -1285,92 +819,35 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_DC] += 1000; rd->thresh_mult[THR_NEWMV] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEWL2] += 1000; rd->thresh_mult[THR_NEWL3] += 1000; rd->thresh_mult[THR_NEWB] += 1000; rd->thresh_mult[THR_NEWA2] = 1000; -#endif // CONFIG_EXT_REFS 
rd->thresh_mult[THR_NEWA] += 1000; rd->thresh_mult[THR_NEWG] += 1000; rd->thresh_mult[THR_NEARMV] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARL2] += 1000; rd->thresh_mult[THR_NEARL3] += 1000; rd->thresh_mult[THR_NEARB] += 1000; rd->thresh_mult[THR_NEARA2] = 1000; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARA] += 1000; rd->thresh_mult[THR_NEARG] += 1000; - rd->thresh_mult[THR_ZEROMV] += 2000; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_ZEROL2] += 2000; - rd->thresh_mult[THR_ZEROL3] += 2000; - rd->thresh_mult[THR_ZEROB] += 2000; - rd->thresh_mult[THR_ZEROA2] = 2000; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_ZEROG] += 2000; - rd->thresh_mult[THR_ZEROA] += 2000; - - rd->thresh_mult[THR_TM] += 1000; - -#if CONFIG_COMPOUND_SINGLEREF - rd->thresh_mult[THR_SR_NEAREST_NEARMV] += 1200; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEARL2] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARL3] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARB] += 1200; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEARA] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARG] += 1200; - - /* - rd->thresh_mult[THR_SR_NEAREST_NEWMV] += 1200; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEWL2] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWL3] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWB] += 1200; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEWA] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWG] += 1200;*/ - - rd->thresh_mult[THR_SR_NEAR_NEWMV] += 1500; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAR_NEWL2] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWL3] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWB] += 1500; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAR_NEWA] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWG] += 1500; - - rd->thresh_mult[THR_SR_ZERO_NEWMV] += 2000; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_ZERO_NEWL2] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWL3] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWB] += 2000; -#endif // 
CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_ZERO_NEWA] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWG] += 2000; - - rd->thresh_mult[THR_SR_NEW_NEWMV] += 1700; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEW_NEWL2] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWL3] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWB] += 1700; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEW_NEWA] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWG] += 1700; -#endif // CONFIG_COMPOUND_SINGLEREF + rd->thresh_mult[THR_GLOBALMV] += 2000; + rd->thresh_mult[THR_GLOBALL2] += 2000; + rd->thresh_mult[THR_GLOBALL3] += 2000; + rd->thresh_mult[THR_GLOBALB] += 2000; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] += 2000; + rd->thresh_mult[THR_GLOBALA] += 2000; + + rd->thresh_mult[THR_PAETH] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000; @@ -1380,13 +857,10 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000; -#if CONFIG_EXT_COMP_REFS - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 1000; -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000; rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200; 
rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500; @@ -1394,16 +868,15 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500; @@ -1411,8 +884,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500; -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500; @@ -1420,16 +892,15 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500; + 
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500; @@ -1437,7 +908,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500; @@ -1445,7 +916,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500; @@ -1453,7 +924,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500; @@ -1461,7 +932,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLA2] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500; @@ -1469,7 +940,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700; 
rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2A2] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500; @@ -1477,7 +948,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3A2] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500; @@ -1485,124 +956,55 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGA2] += 2500; - -#if CONFIG_EXT_COMP_REFS - rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLL2] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLL3] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLG] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2000; - 
rd->thresh_mult[THR_COMP_ZERO_ZEROLG] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROBA] += 2500; -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500; rd->thresh_mult[THR_H_PRED] += 2000; rd->thresh_mult[THR_V_PRED] += 2000; rd->thresh_mult[THR_D135_PRED] += 2500; - rd->thresh_mult[THR_D207_PRED] += 2500; - rd->thresh_mult[THR_D153_PRED] += 2500; - rd->thresh_mult[THR_D63_PRED] += 2500; - rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D203_PRED] += 2500; + rd->thresh_mult[THR_D157_PRED] += 2500; + rd->thresh_mult[THR_D67_PRED] += 2500; + rd->thresh_mult[THR_D113_PRED] += 2500; rd->thresh_mult[THR_D45_PRED] += 2500; - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000; - -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000; - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000; -#endif // CONFIG_EXT_REFS - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000; - -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 
1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000; - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWA2] += 2000; -#endif // CONFIG_EXT_REFS - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400; + 
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200; } void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { - static const int thresh_mult[MAX_REFS] = { -#if CONFIG_EXT_REFS - 2500, - 2500, - 2500, - 2500, - 2500, - 2500, - 2500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 2500 -#else // !CONFIG_EXT_REFS - 2500, - 2500, - 2500, - 4500, - 4500, - 2500 -#endif // CONFIG_EXT_REFS - }; + static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500, + 2500, 2500, 4500, 4500, 4500, + 4500, 4500, 4500, 4500, 4500, + 4500, 4500, 4500, 4500, 2500 }; RD_OPT *const rd = &cpi->rd; memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult)); } @@ -1611,15 +1013,12 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index) { if (rd_thresh > 0) { -#if CONFIG_CB4X4 const int top_mode = MAX_MODES; -#else - const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; -#endif int mode; for (mode = 0; mode < top_mode; ++mode) { const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4); - const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size); + const BLOCK_SIZE max_size = + AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); BLOCK_SIZE bs; for (bs = min_size; bs <= max_size; ++bs) { int *const fact = &factor_buf[bs][mode]; @@ -1635,8 +1034,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int av1_get_intra_cost_penalty(int qindex, int qdelta, aom_bit_depth_t bit_depth) { - const int q = av1_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_HIGHBITDEPTH + const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth); switch (bit_depth) { case AOM_BITS_8: return 20 * q; case AOM_BITS_10: return 5 * q; @@ -1645,7 +1043,4 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta, assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - return 20 * q; -#endif // CONFIG_HIGHBITDEPTH } diff 
--git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 35ada8e6c..281b676b0 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -14,9 +14,6 @@ #include -#if CONFIG_ANS -#include "aom_dsp/ans.h" -#endif // CONFIG_ANS #include "av1/common/blockd.h" #include "av1/encoder/block.h" @@ -30,9 +27,9 @@ extern "C" { #define RDDIV_BITS 7 #define RD_EPB_SHIFT 6 -#define RDCOST(RM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + \ - (D << RDDIV_BITS)) +#define RDCOST(RM, R, D) \ + (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \ + ((D) * (1 << RDDIV_BITS))) #define RDCOST_DBL(RM, R, D) \ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ @@ -50,102 +47,43 @@ extern "C" { // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code. typedef enum { THR_NEARESTMV, -#if CONFIG_EXT_REFS THR_NEARESTL2, THR_NEARESTL3, THR_NEARESTB, THR_NEARESTA2, -#endif // CONFIG_EXT_REFS THR_NEARESTA, THR_NEARESTG, THR_DC, THR_NEWMV, -#if CONFIG_EXT_REFS THR_NEWL2, THR_NEWL3, THR_NEWB, THR_NEWA2, -#endif // CONFIG_EXT_REFS THR_NEWA, THR_NEWG, THR_NEARMV, -#if CONFIG_EXT_REFS THR_NEARL2, THR_NEARL3, THR_NEARB, THR_NEARA2, -#endif // CONFIG_EXT_REFS THR_NEARA, THR_NEARG, - THR_ZEROMV, -#if CONFIG_EXT_REFS - THR_ZEROL2, - THR_ZEROL3, - THR_ZEROB, - THR_ZEROA2, -#endif // CONFIG_EXT_REFS - THR_ZEROA, - THR_ZEROG, - -#if CONFIG_COMPOUND_SINGLEREF - THR_SR_NEAREST_NEARMV, -#if CONFIG_EXT_REFS - THR_SR_NEAREST_NEARL2, - THR_SR_NEAREST_NEARL3, - THR_SR_NEAREST_NEARB, -#endif // CONFIG_EXT_REFS - THR_SR_NEAREST_NEARG, - THR_SR_NEAREST_NEARA, - - /* - THR_SR_NEAREST_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEAREST_NEWL2, - THR_SR_NEAREST_NEWL3, - THR_SR_NEAREST_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_NEAREST_NEWG, - THR_SR_NEAREST_NEWA,*/ - - THR_SR_NEAR_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEAR_NEWL2, - THR_SR_NEAR_NEWL3, - THR_SR_NEAR_NEWB, -#endif // CONFIG_EXT_REFS - 
THR_SR_NEAR_NEWG, - THR_SR_NEAR_NEWA, - - THR_SR_ZERO_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_ZERO_NEWL2, - THR_SR_ZERO_NEWL3, - THR_SR_ZERO_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_ZERO_NEWG, - THR_SR_ZERO_NEWA, - - THR_SR_NEW_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEW_NEWL2, - THR_SR_NEW_NEWL3, - THR_SR_NEW_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_NEW_NEWG, - THR_SR_NEW_NEWA, -#endif // CONFIG_COMPOUND_SINGLEREF + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, THR_COMP_NEAREST_NEARESTLA, -#if CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTL2A, THR_COMP_NEAREST_NEARESTL3A, -#endif // CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTGA, -#if CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTLB, THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL3B, @@ -154,21 +92,16 @@ typedef enum { THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTGA2, -#if CONFIG_EXT_COMP_REFS THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS - THR_TM, + THR_PAETH, THR_SMOOTH, -#if CONFIG_SMOOTH_HV THR_SMOOTH_V, THR_SMOOTH_H, -#endif // CONFIG_SMOOTH_HV THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEARESTLA, @@ -176,16 +109,15 @@ typedef enum { THR_COMP_NEW_NEARLA, THR_COMP_NEAR_NEWLA, THR_COMP_NEW_NEWLA, - THR_COMP_ZERO_ZEROLA, + THR_COMP_GLOBAL_GLOBALLA, -#if CONFIG_EXT_REFS THR_COMP_NEAR_NEARL2A, THR_COMP_NEW_NEARESTL2A, THR_COMP_NEAREST_NEWL2A, THR_COMP_NEW_NEARL2A, THR_COMP_NEAR_NEWL2A, THR_COMP_NEW_NEWL2A, - THR_COMP_ZERO_ZEROL2A, + THR_COMP_GLOBAL_GLOBALL2A, THR_COMP_NEAR_NEARL3A, THR_COMP_NEW_NEARESTL3A, @@ -193,8 +125,7 @@ typedef enum { THR_COMP_NEW_NEARL3A, THR_COMP_NEAR_NEWL3A, THR_COMP_NEW_NEWL3A, - THR_COMP_ZERO_ZEROL3A, -#endif // CONFIG_EXT_REFS + THR_COMP_GLOBAL_GLOBALL3A, THR_COMP_NEAR_NEARGA, THR_COMP_NEW_NEARESTGA, @@ -202,16 +133,15 @@ typedef enum { THR_COMP_NEW_NEARGA, THR_COMP_NEAR_NEWGA, 
THR_COMP_NEW_NEWGA, - THR_COMP_ZERO_ZEROGA, + THR_COMP_GLOBAL_GLOBALGA, -#if CONFIG_EXT_REFS THR_COMP_NEAR_NEARLB, THR_COMP_NEW_NEARESTLB, THR_COMP_NEAREST_NEWLB, THR_COMP_NEW_NEARLB, THR_COMP_NEAR_NEWLB, THR_COMP_NEW_NEWLB, - THR_COMP_ZERO_ZEROLB, + THR_COMP_GLOBAL_GLOBALLB, THR_COMP_NEAR_NEARL2B, THR_COMP_NEW_NEARESTL2B, @@ -219,7 +149,7 @@ typedef enum { THR_COMP_NEW_NEARL2B, THR_COMP_NEAR_NEWL2B, THR_COMP_NEW_NEWL2B, - THR_COMP_ZERO_ZEROL2B, + THR_COMP_GLOBAL_GLOBALL2B, THR_COMP_NEAR_NEARL3B, THR_COMP_NEW_NEARESTL3B, @@ -227,7 +157,7 @@ typedef enum { THR_COMP_NEW_NEARL3B, THR_COMP_NEAR_NEWL3B, THR_COMP_NEW_NEWL3B, - THR_COMP_ZERO_ZEROL3B, + THR_COMP_GLOBAL_GLOBALL3B, THR_COMP_NEAR_NEARGB, THR_COMP_NEW_NEARESTGB, @@ -235,7 +165,7 @@ typedef enum { THR_COMP_NEW_NEARGB, THR_COMP_NEAR_NEWGB, THR_COMP_NEW_NEWGB, - THR_COMP_ZERO_ZEROGB, + THR_COMP_GLOBAL_GLOBALGB, THR_COMP_NEAR_NEARLA2, THR_COMP_NEW_NEARESTLA2, @@ -243,7 +173,7 @@ typedef enum { THR_COMP_NEW_NEARLA2, THR_COMP_NEAR_NEWLA2, THR_COMP_NEW_NEWLA2, - THR_COMP_ZERO_ZEROLA2, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_NEAR_NEARL2A2, THR_COMP_NEW_NEARESTL2A2, @@ -251,7 +181,7 @@ typedef enum { THR_COMP_NEW_NEARL2A2, THR_COMP_NEAR_NEWL2A2, THR_COMP_NEW_NEWL2A2, - THR_COMP_ZERO_ZEROL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_NEAR_NEARL3A2, THR_COMP_NEW_NEARESTL3A2, @@ -259,7 +189,7 @@ typedef enum { THR_COMP_NEW_NEARL3A2, THR_COMP_NEAR_NEWL3A2, THR_COMP_NEW_NEWL3A2, - THR_COMP_ZERO_ZEROL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_NEAR_NEARGA2, THR_COMP_NEW_NEARESTGA2, @@ -267,16 +197,24 @@ typedef enum { THR_COMP_NEW_NEARGA2, THR_COMP_NEAR_NEWGA2, THR_COMP_NEW_NEWGA2, - THR_COMP_ZERO_ZEROGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, -#if CONFIG_EXT_COMP_REFS THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, THR_COMP_NEW_NEARLL2, THR_COMP_NEAR_NEWLL2, THR_COMP_NEW_NEWLL2, - 
THR_COMP_ZERO_ZEROLL2, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_NEAR_NEARLL3, THR_COMP_NEW_NEARESTLL3, @@ -284,7 +222,7 @@ typedef enum { THR_COMP_NEW_NEARLL3, THR_COMP_NEAR_NEWLL3, THR_COMP_NEW_NEWLL3, - THR_COMP_ZERO_ZEROLL3, + THR_COMP_GLOBAL_GLOBALLL3, THR_COMP_NEAR_NEARLG, THR_COMP_NEW_NEARESTLG, @@ -292,7 +230,7 @@ typedef enum { THR_COMP_NEW_NEARLG, THR_COMP_NEAR_NEWLG, THR_COMP_NEW_NEWLG, - THR_COMP_ZERO_ZEROLG, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_NEAR_NEARBA, THR_COMP_NEW_NEARESTBA, @@ -300,79 +238,25 @@ typedef enum { THR_COMP_NEW_NEARBA, THR_COMP_NEAR_NEWBA, THR_COMP_NEW_NEWBA, - THR_COMP_ZERO_ZEROBA, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + THR_COMP_GLOBAL_GLOBALBA, - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D207_PRED, - THR_D153_PRED, - THR_D63_PRED, - THR_D117_PRED, - THR_D45_PRED, - - THR_COMP_INTERINTRA_ZEROL, - THR_COMP_INTERINTRA_NEARESTL, - THR_COMP_INTERINTRA_NEARL, - THR_COMP_INTERINTRA_NEWL, - -#if CONFIG_EXT_REFS - THR_COMP_INTERINTRA_ZEROL2, - THR_COMP_INTERINTRA_NEARESTL2, - THR_COMP_INTERINTRA_NEARL2, - THR_COMP_INTERINTRA_NEWL2, - - THR_COMP_INTERINTRA_ZEROL3, - THR_COMP_INTERINTRA_NEARESTL3, - THR_COMP_INTERINTRA_NEARL3, - THR_COMP_INTERINTRA_NEWL3, -#endif // CONFIG_EXT_REFS - - THR_COMP_INTERINTRA_ZEROG, - THR_COMP_INTERINTRA_NEARESTG, - THR_COMP_INTERINTRA_NEARG, - THR_COMP_INTERINTRA_NEWG, - -#if CONFIG_EXT_REFS - THR_COMP_INTERINTRA_ZEROB, - THR_COMP_INTERINTRA_NEARESTB, - THR_COMP_INTERINTRA_NEARB, - THR_COMP_INTERINTRA_NEWB, - - THR_COMP_INTERINTRA_ZEROA2, - THR_COMP_INTERINTRA_NEARESTA2, - THR_COMP_INTERINTRA_NEARA2, - THR_COMP_INTERINTRA_NEWA2, -#endif // CONFIG_EXT_REFS - - THR_COMP_INTERINTRA_ZEROA, - THR_COMP_INTERINTRA_NEARESTA, - THR_COMP_INTERINTRA_NEARA, - THR_COMP_INTERINTRA_NEWA, MAX_MODES } THR_MODES; typedef enum { THR_LAST, -#if CONFIG_EXT_REFS THR_LAST2, THR_LAST3, THR_BWDR, THR_ALTR2, -#endif // CONFIG_EXT_REFS THR_GOLD, THR_ALTR, THR_COMP_LA, -#if CONFIG_EXT_REFS THR_COMP_L2A, 
THR_COMP_L3A, -#endif // CONFIG_EXT_REFS THR_COMP_GA, -#if CONFIG_EXT_REFS THR_COMP_LB, THR_COMP_L2B, THR_COMP_L3B, @@ -382,7 +266,6 @@ typedef enum { THR_COMP_L2A2, THR_COMP_L3A2, THR_COMP_GA2, -#endif // CONFIG_EXT_REFS THR_INTRA, @@ -399,7 +282,7 @@ typedef struct RD_OPT { int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; - int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES]; + int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES]; int RDMULT; } RD_OPT; @@ -417,16 +300,16 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { rd_stats->invalid_rate = 0; rd_stats->ref_rdcost = INT64_MAX; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; -#if CONFIG_VAR_TX { int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) rd_stats->txb_coeff_cost_map[plane][r][c] = 0; } -#endif } #endif } @@ -444,16 +327,16 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { rd_stats->invalid_rate = 1; rd_stats->ref_rdcost = INT64_MAX; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; -#if CONFIG_VAR_TX { int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX; } -#endif } #endif } @@ -464,14 +347,17 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, int plane; #endif rd_stats_dst->rate += rd_stats_src->rate; + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; rd_stats_dst->dist += rd_stats_src->dist; rd_stats_dst->sse += rd_stats_src->sse; rd_stats_dst->skip &= rd_stats_src->skip; rd_stats_dst->invalid_rate &= 
rd_stats_src->invalid_rate; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; -#if CONFIG_VAR_TX { // TODO(angiebird): optimize this part int r, c; @@ -484,21 +370,10 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, } assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); } -#endif } #endif } -static INLINE int av1_get_coeff_token_cost(int token, int eob_val, int is_first, - const int *head_cost_table, - const int *tail_cost_table) { - if (eob_val == LAST_EOB) return av1_cost_zero(128); - const int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + is_first; - int cost = head_cost_table[comb_symb]; - if (token > ONE_TOKEN) cost += tail_cost_table[token - TWO_TOKEN]; - return cost; -} - struct TileInfo; struct TileDataEnc; struct AV1_COMP; @@ -528,13 +403,12 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, void av1_init_me_luts(void); -void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, - int ref_mv_idx); +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); -void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, +void av1_get_entropy_contexts(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]); + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); @@ -562,7 +436,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *scale, - const struct scale_factors *scale_uv); + const struct scale_factors *scale_uv, + const int num_planes); int av1_get_intra_cost_penalty(int qindex, int qdelta, 
aom_bit_depth_t bit_depth); @@ -570,12 +445,8 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta, void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, FRAME_CONTEXT *fc); -#if CONFIG_LV_MAP -void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc); -#endif - -void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost, - coeff_cdf_model (*cdf)[PLANE_TYPES]); +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 607db9b86..6f4fced87 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -12,18 +12,17 @@ #include #include -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" -#if CONFIG_CFL #include "av1/common/cfl.h" -#endif #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/entropy.h" @@ -37,12 +36,8 @@ #include "av1/common/reconintra.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" -#if CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif -#if CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_WARPED_MOTION #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" @@ -50,105 +45,37 @@ #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/mcomp.h" +#include "av1/encoder/ml.h" #include "av1/encoder/palette.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" 
#include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/encoder/pvq_encoder.h" -#include "av1/common/pvq.h" -#endif // CONFIG_PVQ -#if CONFIG_DUAL_FILTER +#include "av1/encoder/tx_prune_model_weights.h" + +// Set this macro as 1 to collect data about tx size selection. +#define COLLECT_TX_SIZE_DATA 0 +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) -#if USE_EXTRA_FILTER -static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 }, - { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, - { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }, +static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { + 0x00000000, 0x00010000, 0x00020000, // y = 0 + 0x00000001, 0x00010001, 0x00020001, // y = 1 + 0x00000002, 0x00010002, 0x00020002, // y = 2 }; -#else // USE_EXTRA_FILTER -static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 }, - { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, -}; -#endif // USE_EXTRA_FILTER -#endif // CONFIG_DUAL_FILTER - -#if CONFIG_EXT_REFS - -#define LAST_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define LAST2_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define LAST3_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define GOLDEN_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - 
(1 << ALTREF_FRAME)) -#define BWDREF_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define ALTREF2_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \ - (1 << ALTREF_FRAME)) -#define ALTREF_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \ - (1 << ALTREF2_FRAME)) - -#else // !CONFIG_EXT_REFS - -#define LAST_FRAME_MODE_MASK \ - ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) -#define GOLDEN_FRAME_MODE_MASK \ - ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) -#define ALTREF_FRAME_MODE_MASK \ - ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME)) - -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS -#if CONFIG_EXT_COMP_REFS + #define SECOND_REF_FRAME_MASK \ ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \ (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01) -#else // !CONFIG_EXT_COMP_REFS -#define SECOND_REF_FRAME_MASK \ - ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01) -#endif // CONFIG_EXT_COMP_REFS -#else // !CONFIG_EXT_REFS -#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) -#endif // CONFIG_EXT_REFS - -#define MIN_EARLY_TERM_INDEX 3 -#define NEW_MV_DISCOUNT_FACTOR 8 -#if CONFIG_EXT_INTRA #define ANGLE_SKIP_THRESH 10 -#define FILTER_FAST_SEARCH 1 -#endif // CONFIG_EXT_INTRA - -// Setting this to 1 will disable trellis optimization within the -// transform search. Trellis optimization will still be applied -// in the final encode. 
-#ifndef DISABLE_TRELLISQ_SEARCH -#define DISABLE_TRELLISQ_SEARCH 0 -#endif static const double ADST_FLIP_SVM[8] = { /* vertical */ @@ -162,122 +89,72 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; + +typedef enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} FAST_TX_SEARCH_MODE; struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; RD_STATS rd_stats; int64_t this_rd; int64_t best_rd; int exit_early; int use_fast_coef_costing; + FAST_TX_SEARCH_MODE ftxs_mode; }; #define LAST_NEW_MV_INDEX 6 static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, { NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, { NEWMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, { NEARMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEARMV, { LAST2_FRAME, NONE_FRAME } }, { NEARMV, { LAST3_FRAME, NONE_FRAME } }, { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEARMV, { 
ALTREF_FRAME, NONE_FRAME } }, { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { ZEROMV, { LAST2_FRAME, NONE_FRAME } }, - { ZEROMV, { LAST3_FRAME, NONE_FRAME } }, - { ZEROMV, { BWDREF_FRAME, NONE_FRAME } }, - { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } }, - { ZEROMV, { ALTREF_FRAME, NONE_FRAME } }, - -// TODO(zoeliu): May need to reconsider the order on the modes to check - -#if CONFIG_COMPOUND_SINGLEREF - // Single ref comp mode - { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } }, - - /* - { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/ - - { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, - - { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { ALTREF_FRAME, 
NONE_FRAME } }, - - { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_COMPOUND_SINGLEREF + { GLOBALMV, { LAST_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST2_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST3_FRAME, NONE_FRAME } }, + { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } }, + { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } }, + + // TODO(zoeliu): May need to reconsider the order on the modes to check { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_REFS { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, @@ -287,21 +164,16 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, -#if CONFIG_EXT_COMP_REFS { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS - { TM_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, -#if CONFIG_SMOOTH_HV { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, -#endif // 
CONFIG_SMOOTH_HV { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -309,16 +181,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, @@ -326,8 +197,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_REFS + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, @@ -335,16 +205,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { 
LAST_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, @@ -352,7 +221,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, @@ -360,7 +229,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, @@ -368,7 +237,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, @@ -376,7 +245,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, @@ -384,7 +253,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = 
{ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, @@ -392,7 +261,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, @@ -400,16 +269,24 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_COMP_REFS { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, @@ -417,7 +294,7 @@ 
static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, @@ -425,7 +302,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, @@ -433,89 +310,400 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, +}; - { H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D207_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D153_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D63_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D117_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { + 7, // DC_PRED, + 134, // V_PRED, + 133, // H_PRED, + 140, // D45_PRED, + 135, // D135_PRED, + 139, // D113_PRED, + 137, // D157_PRED, + 136, // D203_PRED, + 138, // D67_PRED, + 46, // SMOOTH_PRED, + 47, // SMOOTH_V_PRED, + 48, // SMOOTH_H_PRED, + 45, // PAETH_PRED, +}; + +/* clang-format off */ +static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + 
[REF_FRAMES] = { + // NEARESTMV, + { -1, 0, 1, 2, 6, 3, 4, 5, }, + // NEARMV, + { -1, 15, 16, 17, 21, 18, 19, 20, }, + // GLOBALMV, + { -1, 22, 23, 24, 27, 25, 26, 28, }, + // NEWMV, + { -1, 8, 9, 10, 14, 11, 12, 13, }, +}; +/* clang-format on */ - { ZEROMV, { LAST_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST_FRAME, INTRA_FRAME } }, - -#if CONFIG_EXT_REFS - { ZEROMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST2_FRAME, INTRA_FRAME } }, - - { ZEROMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST3_FRAME, INTRA_FRAME } }, -#endif // CONFIG_EXT_REFS - - { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } }, - -#if CONFIG_EXT_REFS - { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEARMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEWMV, { BWDREF_FRAME, INTRA_FRAME } }, - - { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } }, -#endif // CONFIG_EXT_REFS - - { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEARMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEWMV, { ALTREF_FRAME, INTRA_FRAME } }, +/* clang-format off */ +static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 41, 42, 43, 33, 37, 29, }, + { -1, -1, -1, -1, -1, 34, 38, 30, }, + { -1, -1, -1, -1, -1, 35, 39, 31, }, + { -1, -1, -1, -1, -1, 36, 40, 32, }, + { -1, -1, -1, -1, -1, -1, -1, 44, }, 
+ { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAR_NEARMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 141, 148, 155, 77, 105, 49, }, + { -1, -1, -1, -1, -1, 84, 112, 56, }, + { -1, -1, -1, -1, -1, 91, 119, 63, }, + { -1, -1, -1, -1, -1, 98, 126, 70, }, + { -1, -1, -1, -1, -1, -1, -1, 162, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAREST_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 143, 150, 157, 79, 107, 51, }, + { -1, -1, -1, -1, -1, 86, 114, 58, }, + { -1, -1, -1, -1, -1, 93, 121, 65, }, + { -1, -1, -1, -1, -1, 100, 128, 72, }, + { -1, -1, -1, -1, -1, -1, -1, 164, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEARESTMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 142, 149, 156, 78, 106, 50, }, + { -1, -1, -1, -1, -1, 85, 113, 57, }, + { -1, -1, -1, -1, -1, 92, 120, 64, }, + { -1, -1, -1, -1, -1, 99, 127, 71, }, + { -1, -1, -1, -1, -1, -1, -1, 163, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAR_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 145, 152, 159, 81, 109, 53, }, + { -1, -1, -1, -1, -1, 88, 116, 60, }, + { -1, -1, -1, -1, -1, 95, 123, 67, }, + { -1, -1, -1, -1, -1, 102, 130, 74, }, + { -1, -1, -1, -1, -1, -1, -1, 166, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEARMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 144, 151, 158, 80, 108, 52, }, + { -1, -1, -1, -1, -1, 87, 115, 59, }, + { -1, -1, -1, -1, -1, 94, 122, 66, }, + { -1, -1, -1, -1, -1, 101, 129, 73, }, + { -1, -1, -1, -1, -1, -1, -1, 165, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // GLOBAL_GLOBALMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 147, 154, 161, 83, 111, 55, }, + { -1, -1, -1, -1, -1, 90, 118, 62, }, + { -1, -1, -1, -1, -1, 97, 
125, 69, }, + { -1, -1, -1, -1, -1, 104, 132, 76, }, + { -1, -1, -1, -1, -1, -1, -1, 168, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 146, 153, 160, 82, 110, 54, }, + { -1, -1, -1, -1, -1, 89, 117, 61, }, + { -1, -1, -1, -1, -1, 96, 124, 68, }, + { -1, -1, -1, -1, -1, 103, 131, 75, }, + { -1, -1, -1, -1, -1, -1, -1, 167, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, }; +/* clang-format on */ + +static int get_prediction_mode_idx(PREDICTION_MODE this_mode, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert(second_ref_frame == NONE_FRAME); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return -1; +} static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { - DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, TM_PRED, -#if CONFIG_SMOOTH_HV - SMOOTH_V_PRED, SMOOTH_H_PRED, -#endif // CONFIG_SMOOTH_HV - D135_PRED, D207_PRED, D153_PRED, D63_PRED, D117_PRED, D45_PRED, + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, }; -#if CONFIG_CFL static const UV_PREDICTION_MODE 
uv_rd_search_mode_order[UV_INTRA_MODES] = { - UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, - UV_V_PRED, UV_SMOOTH_PRED, UV_TM_PRED, -#if CONFIG_SMOOTH_HV - UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, -#endif // CONFIG_SMOOTH_HV - UV_D135_PRED, UV_D207_PRED, UV_D153_PRED, - UV_D63_PRED, UV_D117_PRED, UV_D45_PRED, + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, }; -#else -#define uv_rd_search_mode_order intra_rd_search_mode_order -#endif // CONFIG_CFL + +typedef struct InterModeSearchState { + int64_t best_rd; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + int best_mode_index; + int skip_intra_modes; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + PREDICTION_MODE best_intra_mode; + int64_t best_intra_rd; + int angle_stats_ready; + uint8_t directional_mode_skip_mask[INTRA_MODES]; + unsigned int best_pred_sse; + int rate_uv_intra[TX_SIZES_ALL]; + int rate_uv_tokenonly[TX_SIZES_ALL]; + int64_t dist_uvs[TX_SIZES_ALL]; + int skip_uvs[TX_SIZES_ALL]; + UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; + PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; + int8_t uv_angle_delta[TX_SIZES_ALL]; + int64_t best_pred_rd[REFERENCE_MODES]; + int64_t best_pred_diff[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. 
+ int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; +} InterModeSearchState; + +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + +typedef struct InterModeRdModel { + int ready; + double a; + double b; + double dist_mean; + int skip_count; + int non_skip_count; + int fp_skip_count; + int bracket_idx; +} InterModeRdModel; + +InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + +#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 +static int inter_mode_data_idx[4]; +static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; + +int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_8X8) return 1; + if (bsize == BLOCK_16X16) return 2; + if (bsize == BLOCK_32X32) return 3; + return -1; +} + +void av1_inter_mode_data_init() { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + const int block_idx = inter_mode_data_block_idx(i); + if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; + InterModeRdModel *md = &inter_mode_rd_models[i]; + md->ready = 0; + md->skip_count = 0; + md->non_skip_count = 0; + md->fp_skip_count = 0; + md->bracket_idx = 0; + } +} + +void av1_inter_mode_data_show(const AV1_COMMON *cm) { + printf("frame_offset %d\n", cm->frame_offset); + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + const int block_idx = inter_mode_data_block_idx(i); + if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; + InterModeRdModel *md = &inter_mode_rd_models[i]; + if (md->ready) { + printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, + md->non_skip_count, md->skip_count, 
md->fp_skip_count); + } + } +} + +static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, + int curr_cost) { + aom_clear_system_state(); + InterModeRdModel *md = &inter_mode_rd_models[bsize]; + if (md->ready) { + const double est_ld = md->a * sse + md->b; + const double est_residue_cost = (sse - md->dist_mean) / est_ld; + const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; + const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); + const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); + return est_rd; + } + return 0; +} + +#define DATA_BRACKETS 7 +static const int data_num_threshold[DATA_BRACKETS] = { + 200, 400, 800, 1600, 3200, 6400, INT32_MAX +}; + +void av1_inter_mode_data_fit(int rdmult) { + aom_clear_system_state(); + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + int data_num = inter_mode_data_idx[block_idx]; + if (data_num < data_num_threshold[md->bracket_idx]) { + continue; + } + double my = 0; + double mx = 0; + double dx = 0; + double dxy = 0; + double dist_mean = 0; + const int train_num = data_num; + for (int i = 0; i < train_num; ++i) { + const double sse = (double)inter_mode_data_sse[block_idx][i]; + const double dist = (double)inter_mode_data_dist[block_idx][i]; + const double residue_cost = inter_mode_data_residue_cost[block_idx][i]; + const double ld = (sse - dist) / residue_cost; + dist_mean += dist; + my += ld; + mx += sse; + dx += sse * sse; + dxy += sse * ld; + } + dist_mean = dist_mean / data_num; + my = my / train_num; + mx = mx / train_num; + dx = sqrt(dx / train_num); + dxy = dxy / train_num; + + md->dist_mean = dist_mean; + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + ++md->bracket_idx; + md->ready = 1; + assert(md->bracket_idx < DATA_BRACKETS); + + (void)rdmult; +#if 0 + int skip_count = 0; + int 
fp_skip_count = 0; + double avg_error = 0; + const int test_num = data_num; + for (int i = 0; i < data_num; ++i) { + const int64_t sse = inter_mode_data_sse[block_idx][i]; + const int64_t dist = inter_mode_data_dist[block_idx][i]; + const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; + const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; + const int64_t est_rd = + get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); + const int64_t real_rd = RDCOST(rdmult, all_cost, dist); + const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; + if (est_rd > ref_best_rd) { + ++skip_count; + if (real_rd < ref_best_rd) { + ++fp_skip_count; + } + } + avg_error += abs(est_rd - real_rd) * 100. / real_rd; + } + avg_error /= test_num; + printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", + test_num, bsize, avg_error, skip_count, fp_skip_count); +#endif + } +} + +static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, + int residue_cost, int all_cost, + int64_t ref_best_rd) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { + const int data_idx = inter_mode_data_idx[block_idx]; + inter_mode_data_sse[block_idx][data_idx] = sse; + inter_mode_data_dist[block_idx][data_idx] = dist; + inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; + inter_mode_data_all_cost[block_idx][data_idx] = all_cost; + inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; + ++inter_mode_data_idx[block_idx]; + } +} +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return 0; if (v < m) - return (l - 1) * av1_cost_bit(128, 0); + return av1_cost_literal(l - 1); else - return l * av1_cost_bit(128, 0); + return 
av1_cost_literal(l); +} + +// Similar to store_cfl_required(), but for use during the RDO process, +// where we haven't yet determined whether this block uses CfL. +static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + + if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED; + + if (!xd->cfl.is_chroma_reference) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. + return CFL_ALLOWED; + } + + // For chroma reference blocks, we should store data in the encoder iff we're + // allowed to try out CfL. + return is_cfl_allowed(xd); } // constants for prune 1 and prune 2 decision boundaries @@ -524,6 +712,10 @@ static INLINE int write_uniform_cost(int n, int v) { #define FAST_EXT_TX_CORR_MARGIN 0.5 #define FAST_EXT_TX_EDST_MARGIN 0.3 +static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode); + static unsigned pixel_dist_visible_only( const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, const int src_stride, const uint8_t *dst, const int dst_stride, @@ -531,15 +723,10 @@ static unsigned pixel_dist_visible_only( int visible_cols) { unsigned sse; - if (txb_rows == visible_rows && txb_cols == visible_cols -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - && tx_bsize < BLOCK_SIZES -#endif - ) { + if (txb_rows == visible_rows && txb_cols == visible_cols) { cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; } -#if CONFIG_HIGHBITDEPTH const MACROBLOCKD *xd = &x->e_mbd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -547,9 +734,6 @@ static unsigned pixel_dist_visible_only( 
visible_cols, visible_rows); return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); } -#else - (void)x; -#endif // CONFIG_HIGHBITDEPTH sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, visible_rows); return sse; @@ -588,10 +772,9 @@ static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, const uint64_t c1 = (400 * a << 2 * coeff_shift); const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift); - dist = - (uint64_t)floor(.5 + - (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) / - (sqrt(svar * (double)dvar + c2))); + dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * + (svar + dvar + c1) / + (sqrt(svar * (double)dvar + c2))); // Calibrate dist to have similar rate for the same QP with MSE only // distortion (as in master branch) @@ -729,11 +912,9 @@ static double od_compute_dist_common(int activity_masking, uint16_t *x, static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, int bsize_h, int qindex) { assert(bsize_w >= 8 && bsize_h >= 8); -#if CONFIG_PVQ - int activity_masking = 1; -#else + int activity_masking = 0; -#endif + int i, j; DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); @@ -760,11 +941,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, int bsize_h, int qindex) { assert(bsize_w >= 8 && bsize_h >= 8); -#if CONFIG_PVQ - int activity_masking = 1; -#else + int activity_masking = 0; -#endif + DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); @@ -806,7 +985,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (j = 0; j < bsh; j++) for (i = 0; 
i < bsw; i++) @@ -834,7 +1012,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, } } } else { -#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; @@ -858,9 +1035,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, rec[j * bsw + i] = src[j * src_stride + i]; } } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH } if (x->tune_metric == AOM_TUNE_DAALA_DIST) { @@ -874,10 +1049,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, bsw, coeff_shift); } } -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) d = ((uint64_t)d) >> 2 * coeff_shift; -#endif } else { // Otherwise, MSE by default d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride, @@ -887,10 +1060,10 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, return d; } -static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, - int src_stride, const int16_t *diff, - int diff_stride, int bsw, int bsh, - int visible_w, int visible_h, int qindex) { +static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, + int src_stride, const int16_t *diff, + int diff_stride, int bsw, int bsh, int visible_w, + int visible_h, int qindex) { int64_t d = 0; int i, j; const MACROBLOCKD *xd = &x->e_mbd; @@ -905,18 +1078,14 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; } else { -#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH if ((bsw == visible_w) && (bsh == visible_h)) { for (j = 0; j < bsh; j++) @@ -971,7 
+1140,8 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - double *hordist, double *verdist) { + int need_4th, double *hordist, + double *verdist) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -980,7 +1150,6 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, if (f_index < 0) { const int w_shift = bw == 8 ? 1 : 2; const int h_shift = bh == 8 ? 1 : 2; -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); @@ -992,17 +1161,13 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, (src16[j + i * src_stride] - dst16[j + i * dst_stride]); } } else { -#endif // CONFIG_HIGHBITDEPTH - for (int i = 0; i < bh; ++i) for (int j = 0; j < bw; ++j) { const int index = (j >> w_shift) + ((i >> h_shift) << 2); esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * (src[j + i * src_stride] - dst[j + i * dst_stride]); } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH } else { cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]); cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, @@ -1051,13 +1216,22 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; verdist[1] = ((double)esq[4] + 
esq[5] + esq[6] + esq[7]) * e_recip; verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } } else { hordist[0] = verdist[0] = 0.25; hordist[1] = verdist[1] = 0.25; hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } } } @@ -1067,7 +1241,7 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, int prune_bitmask = 0; double svm_proj_h = 0, svm_proj_v = 0; double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 }; - get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, + get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0, hdist, vdist); svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] + @@ -1087,7 +1261,6 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, return prune_bitmask; } -#if CONFIG_EXT_TX static void get_horver_correlation(const int16_t *diff, int stride, int w, int h, double *hcorr, double *vcorr) { // Returns hor/ver correlation coefficient @@ -1132,7 +1305,7 @@ static void get_horver_correlation(const int16_t *diff, int stride, int w, } } -int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { +static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { double hcorr, vcorr; int prune_bitmask = 0; get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr); @@ -1164,14 +1337,13 @@ static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, if (dct_idtx) { av1_subtract_plane(x, bsize, 0); const struct macroblock_plane *const p = &x->plane[0]; - const int bw = 4 << (b_width_log2_lookup[bsize]); - const int bh = 4 << (b_height_log2_lookup[bsize]); + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; prune |= dct_vs_idtx(p->src_diff, bw, bw, bh); } return prune; } -#endif // CONFIG_EXT_TX // Performance drop: 0.3%, Speed improvement: 5% static int 
prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, @@ -1182,61 +1354,342 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, pd->dst.stride); } -#if CONFIG_EXT_TX // 1D Transforms used in inter set, this needs to be changed if // ext_tx_used_inter is changed static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = { - { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 }, -#if CONFIG_MRC_TX + { 1, 0, 0, 0 }, + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, { 1, 0, 0, 1 }, -#endif // CONFIG_MRC_TX }; -#endif // CONFIG_EXT_TX -static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - const MACROBLOCKD *const xd, int tx_set) { -#if CONFIG_EXT_TX - const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL; -#else - const int tx_set_1D[TX_TYPES_1D] = { 0 }; -#endif // CONFIG_EXT_TX +static void get_energy_distribution_finer(const int16_t *diff, int stride, + int bw, int bh, float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 0 : 1; + const int esq_w = bw <= 8 ? bw : bw / 2; + const int esq_h = bh <= 8 ? 
bh : bh / 2; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j]; + } + } + + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. 
+static void get_horver_correlation_full(const int16_t *diff, int stride, int w, + int h, float *hcorr, float *vcorr) { + const float num_hor = (float)(h * (w - 1)); + const float num_ver = (float)((h - 1) * w); + int i, j; + + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0; + int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0; + + int16_t x, y, z; + for (j = 1; j < w; ++j) { + x = diff[j]; + y = diff[j - 1]; + xy_sum += x * y; + xhor_sum += x; + y_sum += y; + x2hor_sum += x * x; + y2_sum += y * y; + } + for (i = 1; i < h; ++i) { + x = diff[i * stride]; + z = diff[(i - 1) * stride]; + xz_sum += x * z; + xver_sum += x; + z_sum += z; + x2ver_sum += x * x; + z2_sum += z * z; + for (j = 1; j < w; ++j) { + x = diff[i * stride + j]; + y = diff[i * stride + j - 1]; + z = diff[(i - 1) * stride + j]; + xy_sum += x * y; + xz_sum += x * z; + xhor_sum += x; + xver_sum += x; + y_sum += y; + z_sum += z; + x2hor_sum += x * x; + x2ver_sum += x * x; + y2_sum += y * y; + z2_sum += z * z; + } + } + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + *hcorr = *vcorr = 1; + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 
0 : *vcorr; + } +} + +// Transforms raw scores into a probability distribution across 16 TX types +static void score_2D_transform_pow8(float *scores_2D, float shift) { + float sum = 0.0f; + int i; + + for (i = 0; i < 16; i++) { + float v, v2, v4; + v = AOMMAX(scores_2D[i] + shift, 0.0f); + v2 = v * v; + v4 = v2 * v2; + scores_2D[i] = v4 * v4; + sum += scores_2D[i]; + } + for (i = 0; i < 16; i++) scores_2D[i] /= sum; +} + +// These thresholds were calibrated to provide a certain number of TX types +// pruned by the model on average, i.e. selecting a threshold with index i +// will lead to pruning i+1 TX types on average +static const float *prune_2D_adaptive_thresholds[] = { + // TX_4X4 + (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, + 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, + 0.08606f, 0.09827f }, + // TX_8X8 + (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, + 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, + 0.09363f, 0.11682f }, + // TX_16X16 + (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, + 0.06897f, 0.07629f, 0.08875f, 0.11169f }, + // TX_32X32 + NULL, + // TX_64X64 + NULL, + // TX_4X8 + (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, + 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, + 0.09119f, 0.10828f }, + // TX_8X4 + (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, + 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, + 0.09167f, 0.10974f }, + // TX_8X16 + (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, + 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, + 0.09509f, 0.12097f }, + // TX_16X8 + (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, + 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, + 0.09485f, 0.12048f }, + // TX_16X32 + (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, + 0.06506f, 0.07385f, 
0.08606f, 0.10925f }, + // TX_32X16 + (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, + 0.06531f, 0.07336f, 0.08582f, 0.11072f }, + // TX_32X64 + NULL, + // TX_64X32 + NULL, + // TX_4X16 + NULL, + // TX_16X4 + NULL, + // TX_8X32 + NULL, + // TX_32X8 + NULL, + // TX_16X64 + NULL, + // TX_64X16 + NULL, +}; + +static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode) { + static const int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) + return 0; + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; + if (!nn_config_hor || !nn_config_ver) return 0; // Model not established yet. + + aom_clear_system_state(); + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? 
bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + av1_nn_predict(hfeatures, nn_config_hor, hscores); + av1_nn_predict(vfeatures, nn_config_ver, vscores); + + float score_2D_average = 0.0f; + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] + + cur_scores_2D[3]; + } + score_2D_average /= 16; + score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. 
+ int max_score_i = 0; + float max_score = 0.0f; + for (int i = 0; i < 16; i++) { + if (scores_2D[i] > max_score && + av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) { + max_score = scores_2D[i]; + max_score_i = i; + } + } + + int pruning_aggressiveness = 0; + if (prune_mode == PRUNE_2D_ACCURATE) { + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = 6; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = 4; + } else if (prune_mode == PRUNE_2D_FAST) { + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = 10; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = 7; + } + const float score_thresh = + prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; + + int prune_bitmask = 0; + for (int i = 0; i < 16; i++) { + if (scores_2D[i] < score_thresh && i != max_score_i) + prune_bitmask |= (1 << tx_type_table_2D[i]); + } + return prune_bitmask; +} +static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, + const MACROBLOCKD *const xd, int tx_set_type) { + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; + const MB_MODE_INFO *mbmi = xd->mi[0]; + if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || + x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] || + x->cb_partition_scan) + return; + int tx_set = ext_tx_set_index[1][tx_set_type]; + assert(tx_set >= 0); + const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; switch (cpi->sf.tx_type_search.prune_mode) { - case NO_PRUNE: return 0; break; + case NO_PRUNE: return; case PRUNE_ONE: - if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) - return 0; - return prune_one_for_sby(cpi, bsize, x, xd); + if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return; + x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd); break; -#if CONFIG_EXT_TX case PRUNE_TWO: - if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { - if 
(!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0; - return prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { + if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return; + x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + } + if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) { + x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 1, 0); } - if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) - return prune_two_for_sby(cpi, bsize, x, xd, 1, 0); - return prune_two_for_sby(cpi, bsize, x, xd, 1, 1); + x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 1, 1); break; -#endif // CONFIG_EXT_TX + case PRUNE_2D_ACCURATE: + case PRUNE_2D_FAST: break; + default: assert(0); } - assert(0); - return 0; } -static int do_tx_type_search(TX_TYPE tx_type, int prune) { -// TODO(sarahparker) implement for non ext tx -#if CONFIG_EXT_TX - return !(((prune >> vtx_tab[tx_type]) & 1) | - ((prune >> (htx_tab[tx_type] + 8)) & 1)); -#else - // temporary to avoid compiler warnings - (void)vtx_tab; - (void)htx_tab; - (void)tx_type; - (void)prune; - return 1; -#endif // CONFIG_EXT_TX +static int do_tx_type_search(TX_TYPE tx_type, int prune, + TX_TYPE_PRUNE_MODE mode) { + // TODO(sarahparker) implement for non ext tx + if (mode >= PRUNE_2D_ACCURATE) { + return !((prune >> tx_type) & 1); + } else { + return !(((prune >> vtx_tab[tx_type]) & 1) | + ((prune >> (htx_tab[tx_type] + 8)) & 1)); + } } static void model_rd_from_sse(const AV1_COMP *const cpi, @@ -1245,16 +1698,12 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, int64_t *dist) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dequant_shift = -#if CONFIG_HIGHBITDEPTH - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : -#endif // CONFIG_HIGHBITDEPTH - 3; + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; // Fast approximate the modelling function. 
if (cpi->sf.simple_model_rd_from_var) { const int64_t square_error = sse; - int quantizer = (pd->dequant[1] >> dequant_shift); - + int quantizer = (pd->dequant_Q3[1] >> dequant_shift); if (quantizer < 120) *rate = (int)((square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT)); @@ -1263,22 +1712,48 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, *dist = (square_error * quantizer) >> 8; } else { av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], - pd->dequant[1] >> dequant_shift, rate, dist); + pd->dequant_Q3[1] >> dequant_shift, rate, + dist); } - *dist <<= 4; } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} +#endif + static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
int plane; - const int ref = xd->mi[0]->mbmi.ref_frame[0]; + const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; @@ -1289,19 +1764,13 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); -#endif // CONFIG_CHROMA_SUB8X8 - + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); unsigned int sse; int rate; int64_t dist; -#if CONFIG_CB4X4 if (x->skip_chroma_rd && plane) continue; -#endif // CONFIG_CB4X4 // TODO(geza): Write direct sse functions that do not compute // variance as well. @@ -1316,14 +1785,54 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, rate_sum += rate; dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; } - *skip_txfm_sb = total_sse == 0; - *skip_sse_sb = total_sse << 4; + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } +static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int *skip_txfm_sb) { + *skip_txfm_sb = 1; + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + // Since fast HBD variance functions scale down sse by 4 bit, we first use + 
// fast vf implementation to rule out blocks with non-zero scaled sse. Then, + // only if the source is HBD and the scaled sse is 0, accurate sse + // computation is applied to determine if the sse is really 0. This step is + // necessary for HBD lossless coding. + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + if (sse) { + *skip_txfm_sb = 0; + return; + } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint64_t sse64 = aom_highbd_sse_odd_size( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + block_size_wide[bs], block_size_high[bs]); + + if (sse64) { + *skip_txfm_sb = 0; + return; + } + } + } + return; +} + int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; @@ -1339,20 +1848,6 @@ int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, return error; } -int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - const int diff = coeff[i] - dqcoeff[i]; - error += diff * diff; - } - - return error; -} - -#if CONFIG_HIGHBITDEPTH int64_t av1_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { @@ -1373,236 +1868,13 @@ int64_t av1_highbd_block_error_c(const tran_low_t *coeff, *ssz = sqcoeff; return error; } -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_PVQ -// Without PVQ, av1_block_error_c() return two kind of errors, -// 1) reconstruction (i.e. decoded) error and -// 2) Squared sum of transformed residue (i.e. 'coeff') -// However, if PVQ is enabled, coeff does not keep the transformed residue -// but instead a transformed original is kept. -// Hence, new parameter ref vector (i.e. transformed predicted signal) -// is required to derive the residue signal, -// i.e. coeff - ref = residue (all transformed). 
- -#if CONFIG_HIGHBITDEPTH -static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - const tran_low_t *ref, - intptr_t block_size, int64_t *ssz, - int bd) { - int64_t error; - int64_t sqcoeff; - int shift = 2 * (bd - 8); - int rounding = shift > 0 ? 1 << (shift - 1) : 0; - // Use the existing sse codes for calculating distortion of decoded signal: - // i.e. (orig - decoded)^2 - // For high bit depth, throw away ssz until a 32-bit version of - // av1_block_error_fp is written. - int64_t ssz_trash; - error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash); - // prediction residue^2 = (orig - ref)^2 - sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash); - error = (error + rounding) >> shift; - sqcoeff = (sqcoeff + rounding) >> shift; - *ssz = sqcoeff; - return error; -} -#else -// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into -// a separate function that does not do the extra computations for ssz. -static int64_t av1_block_error2_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - const tran_low_t *ref, intptr_t block_size, - int64_t *ssz) { - int64_t error; - int64_t ssz_trash; - // Use the existing sse codes for calculating distortion of decoded signal: - // i.e. 
(orig - decoded)^2 - error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash); - // prediction residue^2 = (orig - ref)^2 - *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash); - return error; -} -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_PVQ - -#if !CONFIG_PVQ || CONFIG_VAR_TX -#if !CONFIG_LV_MAP -static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblock_plane *p = &x->plane[plane]; - const struct macroblockd_plane *pd = &xd->plane[plane]; - const PLANE_TYPE type = pd->plane_type; - const uint16_t *band_count = &band_count_table[tx_size][1]; - const int eob = p->eobs[block]; - const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size]; - uint8_t token_cache[MAX_TX_SQUARE]; - int pt = combine_entropy_contexts(*a, *l); - int c, cost; - const int16_t *scan = scan_order->scan; - const int16_t *nb = scan_order->neighbors; - const int ref = is_inter_block(mbmi); - int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - x->token_head_costs[tx_size_ctx][type][ref]; - int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - x->token_tail_costs[tx_size_ctx][type][ref]; - const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size); - int eob_val; - -#if CONFIG_HIGHBITDEPTH - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); -#else - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); -#endif // CONFIG_HIGHBITDEPTH - -#if !CONFIG_VAR_TX && !CONFIG_SUPERTX - // Check for consistency of tx_size with mode info - assert(tx_size == av1_get_tx_size(plane, xd)); -#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX - (void)cm; - - if (eob == 0) { - // block zero - cost = (*head_token_costs)[pt][0]; - } else { - 
if (use_fast_coef_costing) { - int band_left = *band_count++; - - // dc token - int v = qcoeff[0]; - int16_t prev_t; - cost = av1_get_token_cost(v, &prev_t, cat6_bits); - eob_val = (eob == 1) ? EARLY_EOB : NO_EOB; - cost += av1_get_coeff_token_cost( - prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]); - - token_cache[0] = av1_pt_energy_class[prev_t]; - ++head_token_costs; - ++tail_token_costs; - - // ac tokens - for (c = 1; c < eob; c++) { - const int rc = scan[c]; - int16_t t; - - v = qcoeff[rc]; - cost += av1_get_token_cost(v, &t, cat6_bits); - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - cost += av1_get_coeff_token_cost(t, eob_val, 0, - (*head_token_costs)[!prev_t], - (*tail_token_costs)[!prev_t]); - prev_t = t; - if (!--band_left) { - band_left = *band_count++; - ++head_token_costs; - ++tail_token_costs; - } - } - } else { // !use_fast_coef_costing - int band_left = *band_count++; - - // dc token - int v = qcoeff[0]; - int16_t tok; - cost = av1_get_token_cost(v, &tok, cat6_bits); - eob_val = (eob == 1) ? EARLY_EOB : NO_EOB; - cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt], - (*tail_token_costs)[pt]); - - token_cache[0] = av1_pt_energy_class[tok]; - ++head_token_costs; - ++tail_token_costs; - - // ac tokens - for (c = 1; c < eob; c++) { - const int rc = scan[c]; - - v = qcoeff[rc]; - cost += av1_get_token_cost(v, &tok, cat6_bits); - pt = get_coef_context(nb, token_cache, c); - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? 
LAST_EOB : EARLY_EOB) : NO_EOB; - cost += av1_get_coeff_token_cost( - tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]); - token_cache[rc] = av1_pt_energy_class[tok]; - if (!--band_left) { - band_left = *band_count++; - ++head_token_costs; - ++tail_token_costs; - } - } - } - } - - return cost; -} -#endif // !CONFIG_LV_MAP - -int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int use_fast_coef_costing) { - const AV1_COMMON *const cm = &cpi->common; -#if !CONFIG_LV_MAP - (void)blk_row; - (void)blk_col; -#if CONFIG_MRC_TX - const MACROBLOCKD *xd = &x->e_mbd; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd, - blk_row, blk_col, block, tx_size); - const int is_inter = is_inter_block(mbmi); - if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) || - (!is_inter && SIGNAL_MRC_MASK_INTRA))) { - const int mrc_mask_cost = - av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP); - return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, - use_fast_coef_costing) + - mrc_mask_cost; - } -#endif - return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, - use_fast_coef_costing); -#else // !CONFIG_LV_MAP - (void)scan_order; - (void)use_fast_coef_costing; - const MACROBLOCKD *xd = &x->e_mbd; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblockd_plane *pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else // CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif // CONFIG_CB4X4 - - TXB_CTX txb_ctx; - 
get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size, - &txb_ctx); -#endif // !CONFIG_LV_MAP -} -#endif // !CONFIG_PVQ || CONFIG_VAR_TX // Get transform block visible dimensions cropped to the MI units. static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { -#if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)) assert(tx_bsize <= plane_bsize); -#endif int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; const int block_height = block_size_high[plane_bsize]; @@ -1659,1332 +1931,1468 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, // Compute the pixel domain distortion from diff on all visible 4x4s in the // transform block. -static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; #if CONFIG_DIST_8X8 int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; - const int src_stride = x->plane[plane].src.stride; - const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; - const uint8_t *src = &x->plane[plane].src.buf[src_idx]; -#endif - - get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, - NULL, &visible_cols, 
&visible_rows); - -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) - return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, - txb_height, visible_cols, visible_rows, x->qindex); - else + if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) { + const int src_stride = x->plane[plane].src.stride; + const int src_idx = (blk_row * src_stride + blk_col) + << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, + txb_height, visible_cols, visible_rows, x->qindex); + } #endif - return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, - visible_rows); + diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]); + return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); } -int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { - int val_count[256]; - memset(val_count, 0, sizeof(val_count)); +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { - ++val_count[src[r * stride + c]]; + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; } } int n = 0; - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } return n; } -#if CONFIG_HIGHBITDEPTH int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth) { + int bit_depth, int *val_count) { assert(bit_depth <= 12); + const int max_pix_val = 1 << bit_depth; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - int val_count[1 << 12]; - memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < 
rows; ++r) { for (int c = 0; c < cols; ++c) { - ++val_count[src[r * stride + c]]; + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + if (this_val >= max_pix_val) return 0; + ++val_count[this_val]; } } int n = 0; - for (int i = 0; i < (1 << bit_depth); ++i) { + for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } return n; } -#endif // CONFIG_HIGHBITDEPTH -void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, - TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, - OUTPUT_STATUS output_status) { +static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane, + int block, int blk_row, int blk_col, + int eob, int reduced_tx_set) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, reduced_tx_set); + const int dst_stride = pd->dst.stride; + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash); + +static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size) { + int16_t tmp_data[64 * 64]; + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + uint8_t *hash_data = (uint8_t *)cur_diff_row; + if (txb_w != diff_stride) { + int16_t *cur_hash_row = tmp_data; + for (int i = 0; i < txb_h; 
i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w); + cur_hash_row += txb_w; + cur_diff_row += diff_stride; + } + hash_data = (uint8_t *)tmp_data; + } + CRC32C *crc = &x->mb_rd_record.crc_calculator; + const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h); + return (hash << 5) + tx_size; +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, int64_t *out_dist, + int64_t *out_sse) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; -#if CONFIG_DIST_8X8 - struct macroblockd_plane *const pd = &xd->plane[plane]; -#else // CONFIG_DIST_8X8 const struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif // CONFIG_DIST_8X8 + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. + const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - if (cpi->sf.use_transform_domain_distortion -#if CONFIG_DIST_8X8 - && !x->using_dist_8x8 -#endif - ) { - // Transform domain distortion computation is more efficient as it does - // not involve an inverse transform, but it is less accurate. - const int buffer_length = tx_size_2d[tx_size]; - int64_t this_sse; - int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_PVQ - tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); - -#if CONFIG_HIGHBITDEPTH - const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd : 8; - *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff, - buffer_length, &this_sse, bd); -#else - *out_dist = - av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse); -#endif // CONFIG_HIGHBITDEPTH -#else // !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, - &this_sse, xd->bd); - else -#endif - *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); -#endif // CONFIG_PVQ - *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); - *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); - } else { - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; -#if !CONFIG_PVQ || CONFIG_DIST_8X8 - const int bsw = block_size_wide[tx_bsize]; - const int bsh = block_size_high[tx_bsize]; -#endif - const int src_stride = x->plane[plane].src.stride; - const int dst_stride = xd->plane[plane].dst.stride; - // Scale the transform block index to pixel unit. - const int src_idx = (blk_row * src_stride + blk_col) - << tx_size_wide_log2[0]; - const int dst_idx = (blk_row * dst_stride + blk_col) - << tx_size_wide_log2[0]; - const uint8_t *src = &x->plane[plane].src.buf[src_idx]; - const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; - const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t eob = p->eobs[block]; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); - assert(cpi != NULL); - assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} - { - const int diff_stride = block_size_wide[plane_bsize]; - const int diff_idx = (blk_row * diff_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *diff = &p->src_diff[diff_idx]; - *out_sse = 
pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH - } - *out_sse *= 16; +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. + const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; + const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - if (eob) { - if (output_status == OUTPUT_HAS_DECODED_PIXELS) { - *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride, - blk_row, blk_col, plane_bsize, tx_bsize); - } else { -#if CONFIG_HIGHBITDEPTH - uint8_t *recon; - DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - recon = CONVERT_TO_BYTEPTR(recon16); - else - recon = (uint8_t *)recon16; -#else - DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - -#if !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, - NULL, 0, 
bsw, bsh, xd->bd); - } else { -#endif // CONFIG_HIGHBITDEPTH - aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, - 0, bsw, bsh); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -#else - (void)dst; -#endif // !CONFIG_PVQ - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - const PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, recon, MAX_TX_SIZE, eob); + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon = CONVERT_TO_BYTEPTR(recon16); + av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, + bsh, NULL, NULL, 0, 0, NULL, xd->bd); + } else { + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); + } + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, + cpi->common.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.reduced_tx_set_used); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. 
- const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; - } else { -#endif - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_DIST_8X8 - *out_dist = - pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, tx_bsize); - } - *out_dist *= 16; + if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { + // Save decoded pixels for inter block in pd->pred to avoid + // block_8x8_rd_txfm_daala_dist() need to produce them + // by calling av1_inverse_transform_block() again. 
+ const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &x->pred_luma[pred_idx]; + int i, j; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; } else { - *out_dist = *out_sse; + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; } } -} - -static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct rdcost_block_args *args = arg; - MACROBLOCK *const x = args->x; - MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const AV1_COMP *cpi = args->cpi; - ENTROPY_CONTEXT *a = args->t_above + blk_col; - ENTROPY_CONTEXT *l = args->t_left + blk_row; - const AV1_COMMON *cm = &cpi->common; - int64_t rd1, rd2, rd; - RD_STATS this_rd_stats; - -#if CONFIG_DIST_8X8 - // If sub8x8 tx, 8x8 or larger partition, and luma channel, - // dist-8x8 disables early skip, because the distortion metrics for - // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition - // (new distortion metric) are different. - // Exception is: dist-8x8 is enabled but still MSE is used, - // i.e. "--tune=" encoder option is not used. 
- int disable_early_skip = - x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; #endif // CONFIG_DIST_8X8 + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} -#if !CONFIG_SUPERTX && !CONFIG_VAR_TX - assert(tx_size == av1_get_tx_size(plane, xd)); -#endif // !CONFIG_SUPERTX - - av1_init_rd_stats(&this_rd_stats); - - if (args->exit_early) return; +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} - if (!is_inter_block(mbmi)) { - av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row, - tx_size); - av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } } + assert(w > 0 && h > 0); + return sum / (w * h); +} -#if !CONFIG_TXK_SEL - // full forward transform and quantization - const int coeff_ctx = combine_entropy_contexts(*a, *l); -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_B); -#else - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} - const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = 
BLOCK_OFFSET(x->plane[plane].coeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); - const int buffer_length = tx_size_2d[tx_size]; - int64_t tmp_dist; - int64_t tmp; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd); - else -#endif - tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp); - tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift); +static void get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; - if ( -#if CONFIG_DIST_8X8 - disable_early_skip || -#endif - 
RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) { - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - } else { - args->exit_early = 1; - return; - } -#endif // DISABLE_TRELLISQ_SEARCH + unsigned int this_sse; + cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; -#if CONFIG_MRC_TX - if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) { - args->exit_early = 1; - return; + const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } } -#endif // CONFIG_MRC_TX +} - if (!is_inter_block(mbmi)) { - struct macroblock_plane *const p = &x->plane[plane]; - av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, - p->eobs[block]); - av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &this_rd_stats.dist, &this_rd_stats.sse, - OUTPUT_HAS_DECODED_PIXELS); - } else { - av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &this_rd_stats.dist, &this_rd_stats.sse, - OUTPUT_HAS_PREDICTED_PIXELS); - } -#if CONFIG_CFL - if (plane == AOM_PLANE_Y && xd->cfl->store_y) { -#if CONFIG_CHROMA_SUB8X8 - assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8); -#else - assert(!is_inter_block(mbmi)); -#endif // CONFIG_CHROMA_SUB8X8 - cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); - } -#endif // CONFIG_CFL - rd = RDCOST(x->rdmult, 0, this_rd_stats.dist); - if (args->this_rd + rd > args->best_rd) { - args->exit_early = 1; - return; - } -#if !CONFIG_PVQ - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); - this_rd_stats.rate = - av1_cost_coeffs(cpi, x, plane, 
blk_row, blk_col, block, tx_size, - scan_order, a, l, args->use_fast_coef_costing); -#else // !CONFIG_PVQ - this_rd_stats.rate = x->rate; -#endif // !CONFIG_PVQ -#else // !CONFIG_TXK_SEL - av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, a, l, args->use_fast_coef_costing, - &this_rd_stats); -#endif // !CONFIG_TXK_SEL - -#if !CONFIG_PVQ -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, - this_rd_stats.rate); -#endif // CONFIG_RD_DEBUG - av1_set_txb_context(x, plane, block, tx_size, a, l); -#endif // !CONFIG_PVQ +#if CONFIG_COLLECT_RD_STATS +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, + const RD_STATS *const rd_stats, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, TX_TYPE tx_type) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; - rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); + // Generate small sample to restrict output size. 
+ static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 100 > 0) return; - // TODO(jingning): temporarily enabled only for luma component - rd = AOMMIN(rd1, rd2); + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; -#if !CONFIG_PVQ - this_rd_stats.skip &= !x->plane[plane].eobs[block]; -#else - this_rd_stats.skip &= x->pvq_skip[plane]; -#endif // !CONFIG_PVQ - av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; + const double num_samples = txw * txh; - args->this_rd += rd; + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; -#if CONFIG_DIST_8X8 - if (!disable_early_skip) -#endif - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } -} + fprintf(fout, "%g %g", rate_norm, dist_norm); -#if CONFIG_DIST_8X8 -static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, - struct rdcost_block_args *args) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[0]; - const struct macroblock_plane *const p = &x->plane[0]; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; const int dst_stride = pd->dst.stride; - const uint8_t *src = &p->src.buf[0]; - const uint8_t *dst = &pd->dst.buf[0]; - const int16_t *pred = &pd->pred[0]; - int bw = 
block_size_wide[bsize]; - int bh = block_size_high[bsize]; - int visible_w = bw; - int visible_h = bh; - - int i, j; - int64_t rd, rd1, rd2; - unsigned int tmp1, tmp2; - int qindex = x->qindex; - - assert((bw & 0x07) == 0); - assert((bh & 0x07) == 0); + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + unsigned int sse; + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; - get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, - &visible_h); + const unsigned int sad = + cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; -#if CONFIG_HIGHBITDEPTH - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]); + fprintf(fout, " %g %g", sse_norm, sad_norm); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred16); - else - pred8 = (uint8_t *)pred16; -#else - DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; - } else { -#endif - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; -#if CONFIG_HIGHBITDEPTH + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); } -#endif // CONFIG_HIGHBITDEPTH - tmp1 = (unsigned)av1_dist_8x8(cpi, x, 
src, src_stride, pred8, bw, bsize, bw, - bh, visible_w, visible_h, qindex); - tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, - bw, bh, visible_w, visible_h, qindex); + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; - if (!is_inter_block(mbmi)) { - if (x->tune_metric == AOM_TUNE_PSNR) { - assert(args->rd_stats.sse == tmp1 * 16); - assert(args->rd_stats.dist == tmp2 * 16); - } - args->rd_stats.sse = (int64_t)tmp1 * 16; - args->rd_stats.dist = (int64_t)tmp2 * 16; - } else { - // For inter mode, the decoded pixels are provided in pd->pred, - // while the predicted pixels are in dst. - if (x->tune_metric == AOM_TUNE_PSNR) { - assert(args->rd_stats.sse == tmp2 * 16); - assert(args->rd_stats.dist == tmp1 * 16); - } - args->rd_stats.sse = (int64_t)tmp2 * 16; - args->rd_stats.dist = (int64_t)tmp1 * 16; - } + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); - rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); - rd = AOMMIN(rd1, rd2); + int model_rate; + int64_t model_dist; + model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); - args->rd_stats.rdcost = rd; - args->this_rd = rd; + const double mean = get_mean(src_diff, diff_stride, txw, txh); + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); - if (args->this_rd > args->best_rd) args->exit_early = 1; + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g 
%g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, "\n"); + fclose(fout); } -#endif // CONFIG_DIST_8X8 -static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, - RD_STATS *rd_stats, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - struct rdcost_block_args args; - av1_zero(args); - args.x = x; - args.cpi = cpi; - args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; - av1_init_rd_stats(&args.rd_stats); +#if CONFIG_COLLECT_RD_STATS == 2 +static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->invalid_rate) return; + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; - if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; + // Generate small sample to restrict output size. + static unsigned int seed = 95014; + if (lcg_rand16(&seed) % 100 > 0) return; - av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; - av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, - &args); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && !args.exit_early && plane == 0 && - bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); -#endif + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; + const double num_samples = bw * bh; - if (args.exit_early) { - av1_invalid_rd_stats(rd_stats); - } else { - *rd_stats = args.rd_stats; - } -} + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; -#if CONFIG_SUPERTX -void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, - int64_t *distortion, int *skippable, - int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - struct rdcost_block_args args; - av1_zero(args); - args.cpi = cpi; - args.x = x; - args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; + fprintf(fout, "%g %g", rate_norm, dist_norm); -#if CONFIG_EXT_TX - assert(tx_size < TX_SIZES); -#endif // CONFIG_EXT_TX + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + unsigned int sse; + cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; - if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; + const unsigned int sad = + cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; - av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + fprintf(fout, " %g %g", sse_norm, sad_norm); - block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size, - &args); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = p->src_diff; - if (args.exit_early) { - *rate = INT_MAX; - *distortion = INT64_MAX; - *sse = INT64_MAX; - *skippable = 0; - } else { - *distortion = args.rd_stats.dist; - *rate = args.rd_stats.rate; - 
*sse = args.rd_stats.sse; - *skippable = !x->plane[plane].eobs[0]; + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); } -} -#endif // CONFIG_SUPERTX -static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, - BLOCK_SIZE bsize, TX_SIZE tx_size) { - const AV1_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + fprintf(fout, " %d %d %d", q_step, bw, bh); - if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) { - const int is_inter = is_inter_block(mbmi); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); - const int tx_size_ctx = get_tx_size_context(xd); - int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) - r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob, - tx_size == quarter_txsize_lookup[bsize]); -#endif - return r_tx_size; - } else { - return 0; - } -} + int model_rate; + int64_t model_dist; + model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); -#if CONFIG_LGT_FROM_PRED -int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, int use_lgt) { - if 
(plane > 0) return 0; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); + const double mean = get_mean(src_diff, diff_stride, bw, bh); + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); - assert(is_lgt_allowed(mbmi->mode, tx_size)); - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 && - ALLOW_INTRA_EXT_TX) - return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt]; - if (LGT_FROM_PRED_INTRA && is_inter && ext_tx_set > 0) - return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt]; - } - return 0; + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, "\n"); + fclose(fout); } -#endif // CONFIG_LGT_FROM_PRED +#endif // CONFIG_COLLECT_RD_STATS == 2 +#endif // CONFIG_COLLECT_RD_STATS -// TODO(angiebird): use this function whenever it's possible -int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, TX_TYPE tx_type) { - if (plane > 0) return 0; +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int plane, unsigned int *rsse, int *rate, + int64_t *dist) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int log_numpels = 
num_pels_log2_lookup[plane_bsize]; + const int num_samples = (1 << log_numpels); -#if CONFIG_LGT_FROM_PRED - assert(!xd->mi[0]->mbmi.use_lgt); -#endif -#if CONFIG_VAR_TX - tx_size = get_min_tx_size(tx_size); -#endif + const struct macroblock_plane *const p = &x->plane[plane]; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (is_inter) { - if (ext_tx_set > 0) - return x - ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type]; - } else { - if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) - return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]] - [mbmi->mode][tx_type]; - } - } -#else - (void)bsize; - (void)cm; - if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - !FIXED_TX_TYPE) { - if (is_inter) { - return x->inter_tx_type_costs[tx_size][tx_type]; - } else { - return x->intra_tx_type_costs[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]] - [tx_type]; - } - } -#endif // CONFIG_EXT_TX - return 0; + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + unsigned int sse; + cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = p->src_diff; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, 
src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + const double mean = get_mean(src_diff, diff_stride, bw, bh); + const double variance = sse_norm - mean * mean; + const double q_sqr = (double)(q_step * q_step); + const double q_sqr_by_variance = q_sqr / variance; + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + + float features[20]; + features[0] = (float)hdist[0]; + features[1] = (float)hdist[1]; + features[2] = (float)hdist[2]; + features[3] = (float)hdist[3]; + features[4] = (float)hor_corr; + features[5] = (float)log_numpels; + features[6] = (float)mean; + features[7] = (float)q_sqr; + features[8] = (float)q_sqr_by_variance; + features[9] = (float)sse_norm_arr[0]; + features[10] = (float)sse_norm_arr[1]; + features[11] = (float)sse_norm_arr[2]; + features[12] = (float)sse_norm_arr[3]; + features[13] = (float)sse_norm_arr[3]; + features[14] = (float)variance; + features[15] = (float)vdist[0]; + features[16] = (float)vdist[1]; + features[17] = (float)vdist[2]; + features[18] = (float)vdist[3]; + features[19] = (float)vert_corr; + + float rate_f, dist_f; + av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f); + av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f); + const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5); + const int64_t dist_i = + (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5); + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; + if (rsse) *rsse = sse; + return; } -static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, - RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, - TX_TYPE tx_type, TX_SIZE tx_size) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = 
&xd->mi[0]->mbmi; - int64_t rd = INT64_MAX; - aom_prob skip_prob = av1_get_skip_prob(cm, xd); - int s0, s1; - const int is_inter = is_inter_block(mbmi); - const int tx_select = - cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; - const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size); +void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; -#if CONFIG_PVQ - assert(tx_size >= TX_4X4); -#endif // CONFIG_PVQ - assert(skip_prob > 0); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; - s0 = av1_cost_bit(skip_prob, 0); - s1 = av1_cost_bit(skip_prob, 1); + x->pred_sse[ref] = 0; - mbmi->tx_type = tx_type; - mbmi->tx_size = tx_size; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - if (rd_stats->rate == INT_MAX) return INT64_MAX; -#if !CONFIG_TXK_SEL - int plane = 0; -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, tx_size)) - rd_stats->rate += - av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt); - if (!mbmi->use_lgt) - rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type); -#else - rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type); -#endif // CONFIG_LGT_FROM_PRED -#endif + for (int plane = plane_from; plane <= plane_to; ++plane) { + unsigned int sse; + int rate; + int64_t dist; - if (rd_stats->skip) { - 
if (is_inter) { - rd = RDCOST(x->rdmult, s1, rd_stats->sse); - } else { - rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse); - } - } else { - rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select, - rd_stats->dist); - } + if (x->skip_chroma_rd && plane) continue; - if (tx_select) rd_stats->rate += r_tx_size; + model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist); - if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); + if (plane == 0) x->pred_sse[ref] = sse; - return rd; -} + total_sse += sse; + rate_sum += rate; + dist_sum += dist; -static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, - TX_TYPE tx_type, TX_SIZE tx_size) { - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - const int is_inter = is_inter_block(mbmi); - int prune = 0; - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) - // passing -1 in for tx_type indicates that all 1D - // transforms should be considered for pruning - prune = prune_tx_types(cpi, bs, x, xd, -1); - -#if CONFIG_MRC_TX - // MRC_DCT only implemented for TX_32X32 so only include this tx in - // the search for TX_32X32 - if (tx_type == MRC_DCT && - ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) || - tx_size != TX_32X32)) - return 1; -#endif // CONFIG_MRC_TX -#if CONFIG_LGT_FROM_PRED - if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1; -#endif // CONFIG_LGT_FROM_PRED - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; - if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (!is_inter && x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (max_tx_size >= TX_32X32 
&& tx_size == TX_4X4) return 1; -#if CONFIG_EXT_TX - const AV1_COMMON *const cm = &cpi->common; - const TxSetType tx_set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1; - if (is_inter) { - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { - if (!do_tx_type_search(tx_type, prune)) return 1; - } - } else { - if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { - if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1; - } + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; } -#else // CONFIG_EXT_TX - if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1; - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && - !do_tx_type_search(tx_type, prune)) - return 1; -#endif // CONFIG_EXT_TX - return 0; -} -#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA) -static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, - MACROBLOCK *x, int *r, int64_t *d, int *s, - int64_t *sse, int64_t ref_best_rd) { - RD_STATS rd_stats; - int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT, - max_txsize_lookup[bs]); - *r = rd_stats.rate; - *d = rd_stats.dist; - *s = rd_stats.skip; - *sse = rd_stats.sse; - return rd; + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; } -#endif // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) -static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, - RD_STATS *rd_stats, int64_t ref_best_rd, - BLOCK_SIZE bs) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TX_TYPE tx_type, best_tx_type = DCT_DCT; - int64_t this_rd, best_rd = INT64_MAX; - aom_prob skip_prob = av1_get_skip_prob(cm, xd); - int s0 = av1_cost_bit(skip_prob, 0); - int 
s1 = av1_cost_bit(skip_prob, 1); +static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, + int use_fast_coef_costing, int64_t ref_best_rd, + RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); - int prune = 0; - const int plane = 0; -#if CONFIG_LGT_FROM_PRED - int is_lgt_best = 0; - int search_lgt = is_inter - ? LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type && - !cpi->sf.tx_type_search.prune_mode > NO_PRUNE - : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type && - ALLOW_INTRA_EXT_TX; -#endif // CONFIG_LGT_FROM_PRED - av1_invalid_rd_stats(rd_stats); + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + TX_TYPE last_tx_type = TX_TYPES; + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff + // of the best tx_type + DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); + tran_low_t *orig_dqcoeff = pd->dqcoeff; + tran_low_t *best_dqcoeff = this_dqcoeff; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + av1_invalid_rd_stats(best_rd_stats); + + TXB_RD_INFO *intra_txb_rd_info = NULL; + uint16_t cur_joint_ctx = 0; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) && + 
!is_inter && plane == 0 && + tx_size_wide[tx_size] == tx_size_high[tx_size]) { + const uint32_t intra_hash = + get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); + const int intra_hash_idx = + find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); + intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + + cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; + if (intra_hash_idx > 0 && + intra_txb_rd_info->entropy_context == cur_joint_ctx && + x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { + mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, + tx_size, cpi->common.reduced_tx_set_used); + if (ref_tx_type == intra_txb_rd_info->tx_type) { + best_rd_stats->rate = intra_txb_rd_info->rate; + best_rd_stats->dist = intra_txb_rd_info->dist; + best_rd_stats->sse = intra_txb_rd_info->sse; + best_rd_stats->skip = intra_txb_rd_info->eob == 0; + x->plane[plane].eobs[block] = intra_txb_rd_info->eob; + x->plane[plane].txb_entropy_ctx[block] = + intra_txb_rd_info->txb_entropy_ctx; + best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); + best_eob = intra_txb_rd_info->eob; + best_tx_type = intra_txb_rd_info->tx_type; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + best_tx_type); + goto RECON_INTRA; + } + } + } + + int rate_cost = 0; + TX_TYPE txk_start = DCT_DCT; + TX_TYPE txk_end = TX_TYPES - 1; + if (!(!is_inter && x->use_default_intra_tx_type) && + !(is_inter && x->use_default_inter_tx_type)) + if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) + if (plane == 0) txk_end = DCT_DCT; - mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX -#if CONFIG_EXT_TX - int ext_tx_set = - get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used); + 
uint8_t best_txb_ctx = 0; const TxSetType tx_set_type = - get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) -#if CONFIG_EXT_TX - prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set); -#else - prune = prune_tx_types(cpi, bs, x, xd, 0); -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_TX - if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) > - 1 && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { - if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; - RD_STATS this_rd_stats; - if (is_inter) { - if (x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { - if (!do_tx_type_search(tx_type, prune)) continue; - } - } else { - if (x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { - if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; - } - } - - mbmi->tx_type = tx_type; + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + int prune = 0; + const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT && + !(!is_inter && x->use_default_intra_tx_type) && + !(is_inter && x->use_default_inter_tx_type) && + cpi->sf.tx_type_search.prune_mode > NO_PRUNE; + if (do_prune && is_inter) { + if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) { + prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, + tx_set_type, cpi->sf.tx_type_search.prune_mode); + } else { + prune = x->tx_search_prune[tx_set_type]; + } + } - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - 
mbmi->tx_size, cpi->sf.use_fast_coef_costing); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - if (this_rd_stats.rate == INT_MAX) continue; - av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type); + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_start = txk_end = + av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + } + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + txk_start = txk_end = DCT_DCT; + } - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && - !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); - - if (this_rd < best_rd) { - best_rd = this_rd; - best_tx_type = mbmi->tx_type; - *rd_stats = this_rd_stats; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - } - } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ -#if CONFIG_LGT_FROM_PRED - // search LGT - if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) && - !cm->reduced_tx_set_used) { - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - mbmi->tx_size, cpi->sf.use_fast_coef_costing); - if (this_rd_stats.rate != INT_MAX) { - av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1); - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && - !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); - if (this_rd < best_rd) { - best_rd = this_rd; - 
is_lgt_best = 1; - *rd_stats = this_rd_stats; - } - } - mbmi->use_lgt = 0; - } -#endif // CONFIG_LGT_FROM_PRED + int8_t allowed_tx_mask[TX_TYPES] = { 0 }; // 1: allow; 0: skip. + int allowed_tx_num = 0; + if (fast_tx_search) { + allowed_tx_mask[DCT_DCT] = 1; + allowed_tx_mask[H_DCT] = 1; + allowed_tx_mask[V_DCT] = 1; } else { - mbmi->tx_type = DCT_DCT; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - } -#else // CONFIG_EXT_TX - if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) { - for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) { - RD_STATS this_rd_stats; - if (!is_inter && x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - mbmi->tx_type = tx_type; - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - mbmi->tx_size, cpi->sf.use_fast_coef_costing); - if (this_rd_stats.rate == INT_MAX) continue; - - av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type); - if (is_inter) { - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE && - !do_tx_type_search(tx_type, prune)) - continue; - } - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); + memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1); + } + for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { + if (do_prune) { + if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode)) + allowed_tx_mask[tx_type] = 0; + } + if (plane == 0 && allowed_tx_mask[tx_type]) { + if (!av1_ext_tx_used[tx_set_type][tx_type]) + allowed_tx_mask[tx_type] = 0; + else if (!is_inter && 
x->use_default_intra_tx_type && + tx_type != get_default_tx_type(0, xd, tx_size)) + allowed_tx_mask[tx_type] = 0; + else if (is_inter && x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, tx_size)) + allowed_tx_mask[tx_type] = 0; + } + allowed_tx_num += allowed_tx_mask[tx_type]; + } + // Need to have at least one transform type allowed. + if (allowed_tx_num == 0) { + allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1; + } + + int use_transform_domain_distortion = + (cpi->sf.use_transform_domain_distortion > 0) && + // Any 64-pt transforms only preserves half the coefficients. + // Therefore transform domain distortion is not valid for these + // transform sizes. + txsize_sqr_up_map[tx_size] != TX_64X64; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) use_transform_domain_distortion = 0; +#endif - if (this_rd < best_rd) { - best_rd = this_rd; - best_tx_type = mbmi->tx_type; - *rd_stats = this_rd_stats; - } - } - } else { - mbmi->tx_type = DCT_DCT; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - } -#endif // CONFIG_EXT_TX - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = is_lgt_best; -#endif // CONFIG_LGT_FROM_PRED -} + int calc_pixel_domain_distortion_final = + cpi->sf.use_transform_domain_distortion == 1 && + use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && + !x->cb_partition_scan; + if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1) + calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; -static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, - RD_STATS *rd_stats, int64_t ref_best_rd, - BLOCK_SIZE bs) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const uint16_t *eobs_ptr = x->plane[plane].eobs; - mbmi->tx_size = TX_4X4; - mbmi->tx_type = DCT_DCT; -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(TX_4X4); -#endif // CONFIG_VAR_TX + const 
BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + int64_t block_sse = + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); + block_sse *= 16; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); -} + for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { + if (!allowed_tx_mask[tx_type]) continue; + if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type; + RD_STATS this_rd_stats; + av1_invalid_rd_stats(&this_rd_stats); + + if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx, use_fast_coef_costing); + } else { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX && + eobs_ptr[block] >= 4) { + // Calculate distortion quickly in transform domain. + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx, use_fast_coef_costing); + const int64_t rd_estimate = + AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist), + RDCOST(x->rdmult, 0, this_rd_stats.sse)); + if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd)) + continue; + } + av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1, + &rate_cost); + } + if (eobs_ptr[block] == 0) { + // When eob is 0, pixel domain distortion is more efficient and accurate. 
+ this_rd_stats.dist = this_rd_stats.sse = block_sse; + } else if (use_transform_domain_distortion) { + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + } else { + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + this_rd_stats.sse = block_sse; + } -#if CONFIG_TXK_SEL || CONFIG_VAR_TX -static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { - int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]); - return num_blk; -} -#endif // CONFIG_TXK_SEL || CONFIG_VAR_TX + this_rd_stats.rate = rate_cost; -static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, - MACROBLOCK *x, RD_STATS *rd_stats, - int64_t ref_best_rd, BLOCK_SIZE bs) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int64_t rd = INT64_MAX; - int n; - int start_tx, end_tx; - int64_t best_rd = INT64_MAX, last_rd = INT64_MAX; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE best_tx_size = max_tx_size; - TX_TYPE best_tx_type = DCT_DCT; -#if CONFIG_LGT_FROM_PRED - int breakout = 0; - int is_lgt_best = 0; - mbmi->use_lgt = 0; -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_TXK_SEL - TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; -#endif // CONFIG_TXK_SEL - const int tx_select = cm->tx_mode == TX_MODE_SELECT; - const int is_inter = is_inter_block(mbmi); -#if CONFIG_PVQ - od_rollback_buffer buf; - od_encode_checkpoint(&x->daala_enc, &buf); -#endif // CONFIG_PVQ + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); - av1_invalid_rd_stats(rd_stats); + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + last_tx_type = best_tx_type; -#if CONFIG_EXT_TX && CONFIG_RECT_TX - int evaluate_rect_tx = 0; - if 
(tx_select) { - evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi); - } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - evaluate_rect_tx = is_rect_tx(chosen_tx_size); - assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi))); - } - if (evaluate_rect_tx) { - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; - const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; - RD_STATS this_rd_stats; - const TxSetType tx_set_type = get_ext_tx_set_type( - rect_tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (av1_ext_tx_used[tx_set_type][tx_type]) { - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, - rect_tx_size); - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); -#endif - best_tx_type = tx_type; - best_tx_size = rect_tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - } -#if CONFIG_LGT_FROM_PRED - const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; - if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) { - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = rect_tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - mbmi->use_lgt = 0; - } -#endif // CONFIG_LGT_FROM_PRED - } - -#if CONFIG_RECT_TX_EXT - // test 
1:4/4:1 tx - int evaluate_quarter_tx = 0; - if (is_quarter_tx_allowed(xd, mbmi, is_inter)) { - if (tx_select) { - evaluate_quarter_tx = 1; - } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs]; - } - } - if (evaluate_quarter_tx) { - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; - const TX_SIZE tx_size = quarter_txsize_lookup[bs]; - RD_STATS this_rd_stats; - const TxSetType tx_set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (av1_ext_tx_used[tx_set_type][tx_type]) { - rd = - txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); -#endif - best_tx_type = tx_type; -#if CONFIG_LGT_FROM_PRED - is_lgt_best = 0; -#endif - best_tx_size = tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + // Swap qcoeff and dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = pd->dqcoeff; + pd->dqcoeff = tmp_dqcoeff; } -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) { - const TX_SIZE tx_size = quarter_txsize_lookup[bs]; - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = 
tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - mbmi->use_lgt = 0; + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type); } -#endif // CONFIG_LGT_FROM_PRED - } -#endif // CONFIG_RECT_TX_EXT -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX +#endif // CONFIG_COLLECT_RD_STATS == 1 - if (tx_select) { - start_tx = max_tx_size; - end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4; - } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - start_tx = chosen_tx_size; - end_tx = chosen_tx_size; - } - - last_rd = INT64_MAX; - for (n = start_tx; n >= end_tx; --n) { -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (is_rect_tx(n)) break; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - RD_STATS this_rd_stats; - if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &buf); -#endif // CONFIG_PVQ - // Early termination in transform size search. 
- if (cpi->sf.tx_size_search_breakout && - (rd == INT64_MAX || - (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) || - (n < (int)max_tx_size && rd > last_rd))) { -#if CONFIG_LGT_FROM_PRED - breakout = 1; -#endif + if (cpi->sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) > + ref_best_rd) { break; } + } - last_rd = rd; - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); -#endif - best_tx_type = tx_type; -#if CONFIG_LGT_FROM_PRED - is_lgt_best = 0; -#endif - best_tx_size = n; - best_rd = rd; - *rd_stats = this_rd_stats; - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - } -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 1; - if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) && - !breakout) { - RD_STATS this_rd_stats; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = n; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } - mbmi->use_lgt = 0; -#endif // CONFIG_LGT_FROM_PRED + // Skip transform type search when we found the block has been quantized to + // all zero and at the same time, it has better rdcost than doing transform. 
+ if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break; } - mbmi->tx_size = best_tx_size; - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = is_lgt_best; - assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, mbmi->tx_size)); -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_TXK_SEL - memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256); -#endif -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX - -#if !CONFIG_EXT_TX - if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT); -#endif // !CONFIG_EXT_TX -#if CONFIG_PVQ - if (best_rd != INT64_MAX) { - txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size); - } -#endif // CONFIG_PVQ -} + assert(best_rd != INT64_MAX); -static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bs, - int64_t ref_best_rd) { - MACROBLOCKD *xd = &x->e_mbd; - av1_init_rd_stats(rd_stats); + best_rd_stats->skip = best_eob == 0; + if (best_eob == 0) best_tx_type = DCT_DCT; + if (plane == 0) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + best_tx_type); + } + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + + pd->dqcoeff = best_dqcoeff; + + if (calc_pixel_domain_distortion_final && best_eob) { + best_rd_stats->dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + best_rd_stats->sse = block_sse; + } + + if (intra_txb_rd_info != NULL) { + intra_txb_rd_info->valid = 1; + intra_txb_rd_info->entropy_context = cur_joint_ctx; + intra_txb_rd_info->rate = best_rd_stats->rate; + intra_txb_rd_info->dist = best_rd_stats->dist; + intra_txb_rd_info->sse = best_rd_stats->sse; + intra_txb_rd_info->eob = best_eob; + intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx; + if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type; + } + +RECON_INTRA: + if (!is_inter && best_eob && + (blk_row 
+ tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // intra mode needs decoded result such that the next transform block + // can use it for prediction. + // if the last search tx_type is the best tx_type, we don't need to + // do this again + if (best_tx_type != last_tx_type) { + if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + best_tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } else { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, best_tx_type, AV1_XFORM_QUANT_FP); + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1, + &rate_cost); + } + } - assert(bs == xd->mi[0]->mbmi.sb_type); + inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->reduced_tx_set_used); - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { - choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); - } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { - choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); - } else { - choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + // This may happen because of hash collision. The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. 
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } } -} + pd->dqcoeff = orig_dqcoeff; -static int conditional_skipintra(PREDICTION_MODE mode, - PREDICTION_MODE best_intra_mode) { - if (mode == D117_PRED && best_intra_mode != V_PRED && - best_intra_mode != D135_PRED) - return 1; - if (mode == D63_PRED && best_intra_mode != V_PRED && - best_intra_mode != D45_PRED) - return 1; - if (mode == D207_PRED && best_intra_mode != H_PRED && - best_intra_mode != D45_PRED) - return 1; - if (mode == D153_PRED && best_intra_mode != H_PRED && - best_intra_mode != D135_PRED) - return 1; - return 0; + return best_rd; } -// Model based RD estimation for luma intra blocks. -static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mode_cost) { - const AV1_COMMON *cm = &cpi->common; +static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - assert(!is_inter_block(mbmi)); - RD_STATS this_rd_stats; - int row, col; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; + const AV1_COMMON *cm = &cpi->common; + int64_t rd1, rd2, rd; + RD_STATS this_rd_stats; + +#if CONFIG_DIST_8X8 + // If sub8x8 tx, 8x8 or larger partition, and luma channel, + // dist-8x8 disables early skip, because the distortion metrics for + // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition + // (new distortion metric) are different. + // Exception is: dist-8x8 is enabled but still MSE is used, + // i.e. "--tune=" encoder option is not used. 
+ int bw = block_size_wide[plane_bsize]; + int bh = block_size_high[plane_bsize]; + int disable_early_skip = + x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && + x->tune_metric != AOM_TUNE_PSNR; +#endif // CONFIG_DIST_8X8 + + av1_init_rd_stats(&this_rd_stats); + + if (args->exit_early) return; + + if (!is_inter_block(mbmi)) { + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + } + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing, + args->best_rd - args->this_rd, &this_rd_stats); + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8); + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } + +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, + this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (plane == 0) { + x->blk_skip[blk_row * + (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + + blk_col] = (x->plane[plane].eobs[block] == 0); + } + + rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); + + // TODO(jingning): temporarily enabled only for luma component + rd = AOMMIN(rd1, rd2); + + this_rd_stats.skip &= !x->plane[plane].eobs[block]; + + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->this_rd += rd; + +#if CONFIG_DIST_8X8 + if (!disable_early_skip) +#endif + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } +} + +#if CONFIG_DIST_8X8 +static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + struct 
rdcost_block_args *args) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[0]; + const struct macroblock_plane *const p = &x->plane[0]; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const uint8_t *src = &p->src.buf[0]; + const uint8_t *dst = &pd->dst.buf[0]; + const int16_t *pred = &x->pred_luma[0]; + int bw = block_size_wide[bsize]; + int bh = block_size_high[bsize]; + int visible_w = bw; + int visible_h = bh; + + int i, j; + int64_t rd, rd1, rd2; + int64_t sse = INT64_MAX, dist = INT64_MAX; + int qindex = x->qindex; + + assert((bw & 0x07) == 0); + assert((bh & 0x07) == 0); + + get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, + &visible_h); + + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff; + sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w, + visible_h, qindex); + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + sse *= 16; + + if (!is_inter_block(mbmi)) { + dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh, + visible_w, visible_h, qindex); + dist *= 16; + } else { + // For inter mode, the decoded pixels are provided in x->pred_luma, + // while the predicted pixels are in dst. 
+ uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred16); + else + pred8 = (uint8_t *)pred16; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; + } else { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; + } + + dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh, + visible_w, visible_h, qindex); + dist *= 16; + } + +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) { + assert(args->rd_stats.sse == sse); + assert(args->rd_stats.dist == dist); + } +#endif // DEBUG_DIST_8X8 + + args->rd_stats.sse = sse; + args->rd_stats.dist = dist; + + rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); + rd = AOMMIN(rd1, rd2); + + args->rd_stats.rdcost = rd; + args->this_rd = rd; + + if (args->this_rd > args->best_rd) args->exit_early = 1; +} +#endif // CONFIG_DIST_8X8 + +static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting, + FAST_TX_SEARCH_MODE ftxs_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_casting; + args.ftxs_mode = ftxs_mode; + av1_init_rd_stats(&args.rd_stats); + + if (plane == 0) xd->mi[0]->tx_size = tx_size; + + av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left); + + av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, + &args); +#if CONFIG_DIST_8X8 + int bw = block_size_wide[bsize]; + int bh = 
block_size_high[bsize]; + + if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 && + bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) + dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); +#endif + + if (args.exit_early) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) { + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const int tx_size_ctx = get_tx_size_context(xd); + int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; + return r_tx_size; + } else { + return 0; + } +} + +static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t rd = INT64_MAX; + const int skip_ctx = av1_get_skip_context(xd); + int s0, s1; + const int is_inter = is_inter_block(mbmi); + const int tx_select = + cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type); + int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); + const int r_tx_size = is_inter ? 
x->txfm_partition_cost[ctx][0] + : tx_size_cost(cm, x, bs, tx_size); + + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); + + s0 = x->skip_cost[skip_ctx][0]; + s1 = x->skip_cost[skip_ctx][1]; + + mbmi->tx_size = tx_size; + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, tx_size, + cpi->sf.use_fast_coef_costing, ftxs_mode); + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + if (rd_stats->skip) { + if (is_inter) { + rd = RDCOST(x->rdmult, s1, rd_stats->sse); + } else { + rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse); + } + } else { + rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select, + rd_stats->dist); + } + + if (tx_select) rd_stats->rate += r_tx_size; + + if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); + + return rd; +} + +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int *r, int64_t *d, int *s, + int64_t *sse, int64_t ref_best_rd) { + RD_STATS rd_stats; + x->rd_model = LOW_TXFM_RD; + int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs], FTXS_NONE); + x->rd_model = FULL_TXFM_RD; + *r = rd_stats.rate; + *d = rd_stats.dist; + *s = rd_stats.skip; + *sse = rd_stats.sse; + return rd; +} + +static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode); + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used); + prune_tx(cpi, bs, x, xd, tx_set_type); + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, + mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + // Reset the 
pruning flags. + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; +} + +static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + mbmi->tx_size = TX_4X4; + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing, FTXS_NONE); +} + +static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]); + return num_blk; +} + +static int get_search_init_depth(int mi_width, int mi_height, int is_inter, + const SPEED_FEATURES *sf) { + if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH; + + if (sf->tx_size_search_lgr_block) { + if (mi_width > mi_size_wide[BLOCK_64X64] || + mi_height > mi_size_high[BLOCK_64X64]) + return MAX_VARTX_DEPTH; + } + + if (is_inter) { + return (mi_height != mi_width) ? sf->inter_tx_size_search_init_depth_rect + : sf->inter_tx_size_search_init_depth_sqr; + } else { + return (mi_height != mi_width) ? 
sf->intra_tx_size_search_init_depth_rect + : sf->intra_tx_size_search_init_depth_sqr; + } +} + +static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, BLOCK_SIZE bs) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t rd = INT64_MAX; + int n; + int start_tx; + int depth; + int64_t best_rd = INT64_MAX; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; + TX_SIZE best_tx_size = max_rect_tx_size; + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int n4 = bsize_to_num_blk(bs); + const int tx_select = cm->tx_mode == TX_MODE_SELECT; + + av1_invalid_rd_stats(rd_stats); + + if (tx_select) { + start_tx = max_rect_tx_size; + depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf); + } else { + const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode); + start_tx = chosen_tx_size; + depth = MAX_TX_DEPTH; + } + + prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); + + for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { + RD_STATS this_rd_stats; + if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); + x->rd_model = FULL_TXFM_RD; + + if (rd < best_rd) { + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4); + best_tx_size = n; + best_rd = rd; + *rd_stats = this_rd_stats; + } + if (n == TX_4X4) break; + } + mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + + // Reset the pruning flags. 
+ av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; +} + +static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd) { + MACROBLOCKD *xd = &x->e_mbd; + av1_init_rd_stats(rd_stats); + + assert(bs == xd->mi[0]->sb_type); + + if (xd->lossless[xd->mi[0]->segment_id]) { + choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else { + choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + } +} + +// Return the rate cost for luma prediction mode info. of intra blocks. +static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. 
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra]; + if (use_filter_intra) { + total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += x->intrabc_cost[use_intrabc]; + return total_rate; +} + +// Return the rate cost for chroma prediction mode info. of intra blocks. 
+static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE mode = mbmi->uv_mode; + // Can only activate one mode. + assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1); + + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mode == UV_DC_PRED) { + const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; + total_rate += + x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette]; + if (use_palette) { + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int plt_size = pmi->palette_size[1]; + const MACROBLOCKD *xd = &x->e_mbd; + const uint8_t *const color_map = xd->plane[1].color_index_map; + int palette_mode_cost = + x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache, + cpi->common.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_is_directional_mode(get_uv_mode(mode))) { + if (av1_use_angle_delta(bsize)) { + total_rate += + x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] + + MAX_ANGLE_DELTA]; + } + } + return total_rate; +} + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D113_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D67_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D203_PRED && best_intra_mode != H_PRED && + best_intra_mode != 
D45_PRED) + return 1; + if (mode == D157_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +// Model based RD estimation for luma intra blocks. +static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mode_cost) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + RD_STATS this_rd_stats; + int row, col; int64_t temp_sse, this_rd; - const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0); + TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; const int max_blocks_wide = max_block_wide(xd, bsize, 0); const int max_blocks_high = max_block_high(xd, bsize, 0); mbmi->tx_size = tx_size; // Prediction. - const int step = stepr * stepc; - int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { - av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size); - block += step; + av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size); } } // RD estimation. 
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, - &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse); -#if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->mode, bsize) && - av1_use_angle_delta(bsize)) { - mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) { - const aom_prob prob = cpi->common.fc->filter_intra_probs[0]; - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0]; - mode_cost += (av1_cost_bit(prob, 1) + - write_uniform_cost(FILTER_INTRA_MODES, mode)); + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, + NULL, NULL); + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { + mode_cost += + x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + if (mbmi->mode == DC_PRED && + av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) { + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const int mode = mbmi->filter_intra_mode_info.filter_intra_mode; + mode_cost += x->filter_intra_cost[mbmi->sb_type][1] + + x->filter_intra_mode_cost[mode]; } else { - mode_cost += av1_cost_bit(prob, 0); + mode_cost += x->filter_intra_cost[mbmi->sb_type][0]; } } -#endif // CONFIG_FILTER_INTRA this_rd = RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist); return this_rd; @@ -3014,42 +3422,99 @@ static void extend_palette_color_map(uint8_t *const color_map, int orig_width, } } -#if CONFIG_PALETTE_DELTA_ENCODING // Bias toward using colors in the cache. // TODO(huisu): Try other schemes to improve compression. 
static void optimize_palette_colors(uint16_t *color_cache, int n_cache, - int n_colors, int stride, - float *centroids) { + int n_colors, int stride, int *centroids) { if (n_cache <= 0) return; for (int i = 0; i < n_colors * stride; i += stride) { - float min_diff = fabsf(centroids[i] - color_cache[0]); + int min_diff = abs(centroids[i] - (int)color_cache[0]); int idx = 0; for (int j = 1; j < n_cache; ++j) { - float this_diff = fabsf(centroids[i] - color_cache[j]); + const int this_diff = abs(centroids[i] - color_cache[j]); if (this_diff < min_diff) { min_diff = this_diff; idx = j; } } - if (min_diff < 1.5) centroids[i] = color_cache[idx]; + if (min_diff <= 1) centroids[i] = color_cache[idx]; } } -#endif // CONFIG_PALETTE_DELTA_ENCODING -static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int palette_ctx, - int dc_mode_cost, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable) { +// Given the base colors as specified in centroids[], calculate the RD cost +// of palette mode. +static void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, + uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion, + int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) { + optimize_palette_colors(color_cache, n_cache, n, 1, centroids); + int k = av1_remove_duplicates(centroids, n); + if (k < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. 
+ return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.use_highbitdepth) + for (int i = 0; i < k; ++i) + pmi->palette_colors[i] = + clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); + else + for (int i = 0; i < k; ++i) + pmi->palette_colors[i] = clip_pixel(centroids[i]); + pmi->palette_size[0] = k; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, k, 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + const int palette_mode_cost = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); + int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + if (*best_model_rd != INT64_MAX && + this_model_rd > *best_model_rd + (*best_model_rd >> 1)) + return; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + RD_STATS tokenonly_rd_stats; + super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + int this_rate = tokenonly_rd_stats.rate + palette_mode_cost; + int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + tokenonly_rd_stats.rate -= + tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); + } + if (this_rd < *best_rd) { + *best_rd = this_rd; + memcpy(best_palette_color_map, color_map, + block_width * block_height * sizeof(color_map[0])); + *best_mbmi = *mbmi; + memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + *rate_overhead = this_rate - tokenonly_rd_stats.rate; + if (rate) *rate = this_rate; + if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; + if (distortion) *distortion = tokenonly_rd_stats.dist; + if (skippable) *skippable 
= tokenonly_rd_stats.skip; + } +} + +static int rd_pick_palette_intra_sby( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, + int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip) { int rate_overhead = 0; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mic->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); - assert(bsize >= BLOCK_8X8); - int this_rate, colors, n; + assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize)); + int colors, n; const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; uint8_t *const color_map = xd->plane[0].color_index_map; @@ -3057,821 +3522,134 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); - assert(cpi->common.allow_screen_content_tools); - -#if CONFIG_HIGHBITDEPTH + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. 
if (cpi->common.use_highbitdepth) colors = av1_count_colors_highbd(src, src_stride, rows, cols, - cpi->common.bit_depth); + cpi->common.bit_depth, count_buf); else -#endif // CONFIG_HIGHBITDEPTH - colors = av1_count_colors(src, src_stride, rows, cols); -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA + colors = av1_count_colors(src, src_stride, rows, cols, count_buf); + mbmi->filter_intra_mode_info.use_filter_intra = 0; if (colors > 1 && colors <= 64) { - int r, c, i, k, palette_mode_cost; + int r, c, i; const int max_itr = 50; - float *const data = x->palette_buffer->kmeans_data_buf; - float centroids[PALETTE_MAX_SIZE]; - float lb, ub, val; - RD_STATS tokenonly_rd_stats; - int64_t this_rd, this_model_rd; - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; -#if CONFIG_HIGHBITDEPTH + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[PALETTE_MAX_SIZE]; + int lb, ub, val; uint16_t *src16 = CONVERT_TO_SHORTPTR(src); if (cpi->common.use_highbitdepth) lb = ub = src16[0]; else -#endif // CONFIG_HIGHBITDEPTH lb = ub = src[0]; -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { - val = src16[r * src_stride + c]; - data[r * cols + c] = val; - if (val < lb) - lb = val; - else if (val > ub) - ub = val; - } - } - } else { -#endif // CONFIG_HIGHBITDEPTH - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) { - val = src[r * src_stride + c]; - data[r * cols + c] = val; - if (val < lb) - lb = val; - else if (val > ub) - ub = val; - } - } -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - mbmi->mode = DC_PRED; -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA - - if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0; - -#if CONFIG_PALETTE_DELTA_ENCODING - uint16_t color_cache[2 * PALETTE_MAX_SIZE]; - const int n_cache = 
av1_get_palette_cache(xd, 0, color_cache); -#endif // CONFIG_PALETTE_DELTA_ENCODING - - for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; - --n) { - if (colors == PALETTE_MIN_SIZE) { - // Special case: These colors automatically become the centroids. - assert(colors == n); - assert(colors == 2); - centroids[0] = lb; - centroids[1] = ub; - k = 2; - } else { - for (i = 0; i < n; ++i) { - centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; - } - av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); -#if CONFIG_PALETTE_DELTA_ENCODING - optimize_palette_colors(color_cache, n_cache, n, 1, centroids); -#endif // CONFIG_PALETTE_DELTA_ENCODING - k = av1_remove_duplicates(centroids, n); - if (k < PALETTE_MIN_SIZE) { - // Too few unique colors to create a palette. And DC_PRED will work - // well for that case anyway. So skip. - continue; - } - } - -#if CONFIG_HIGHBITDEPTH - if (cpi->common.use_highbitdepth) - for (i = 0; i < k; ++i) - pmi->palette_colors[i] = - clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - for (i = 0; i < k; ++i) - pmi->palette_colors[i] = clip_pixel((int)centroids[i]); - pmi->palette_size[0] = k; - - av1_calc_indices(data, centroids, color_map, rows * cols, k, 1); - extend_palette_color_map(color_map, cols, rows, block_width, - block_height); - palette_mode_cost = - dc_mode_cost + - x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] + - write_uniform_cost(k, color_map[0]) + - av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], - 1); - palette_mode_cost += av1_palette_color_cost_y(pmi, -#if CONFIG_PALETTE_DELTA_ENCODING - color_cache, n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING - cpi->common.bit_depth); - palette_mode_cost += - av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP); - this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); - if (*best_model_rd != INT64_MAX && - this_model_rd > 
*best_model_rd + (*best_model_rd >> 1)) - continue; - if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; - super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); - if (tokenonly_rd_stats.rate == INT_MAX) continue; - this_rate = tokenonly_rd_stats.rate + palette_mode_cost; - this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); - if (!xd->lossless[mbmi->segment_id] && - block_signals_txsize(mbmi->sb_type)) { - tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - if (this_rd < *best_rd) { - *best_rd = this_rd; - memcpy(best_palette_color_map, color_map, - block_width * block_height * sizeof(color_map[0])); - *best_mbmi = *mbmi; - rate_overhead = this_rate - tokenonly_rd_stats.rate; - if (rate) *rate = this_rate; - if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; - if (distortion) *distortion = tokenonly_rd_stats.dist; - if (skippable) *skippable = tokenonly_rd_stats.skip; - } - } - } - - if (best_mbmi->palette_mode_info.palette_size[0] > 0) { - memcpy(color_map, best_palette_color_map, - block_width * block_height * sizeof(best_palette_color_map[0])); - } - *mbmi = *best_mbmi; - return rate_overhead; -} - -static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( - const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col, - PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a, - ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, - BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) { - const AV1_COMMON *const cm = &cpi->common; - PREDICTION_MODE mode; - MACROBLOCKD *const xd = &x->e_mbd; - assert(!is_inter_block(&xd->mi[0]->mbmi)); - int64_t best_rd = rd_thresh; - struct macroblock_plane *p = &x->plane[0]; - struct macroblockd_plane *pd = &xd->plane[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4]; - uint8_t *dst_init = 
&pd->dst.buf[row * 4 * dst_stride + col * 4]; -#if CONFIG_CHROMA_2X2 - // TODO(jingning): This is a temporal change. The whole function should be - // out when cb4x4 is enabled. - ENTROPY_CONTEXT ta[4], tempa[4]; - ENTROPY_CONTEXT tl[4], templ[4]; -#else - ENTROPY_CONTEXT ta[2], tempa[2]; - ENTROPY_CONTEXT tl[2], templ[2]; -#endif // CONFIG_CHROMA_2X2 - - const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; - const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; - const int tx_width_unit = tx_size_wide_unit[tx_size]; - const int tx_height_unit = tx_size_high_unit[tx_size]; - const int pred_block_width = block_size_wide[bsize]; - const int pred_block_height = block_size_high[bsize]; - const int tx_width = tx_size_wide[tx_size]; - const int tx_height = tx_size_high[tx_size]; - const int pred_width_in_transform_blocks = pred_block_width / tx_width; - const int pred_height_in_transform_blocks = pred_block_height / tx_height; - int idx, idy; - int best_can_skip = 0; - uint8_t best_dst[8 * 8]; -#if CONFIG_HIGHBITDEPTH - uint16_t best_dst16[8 * 8]; -#endif // CONFIG_HIGHBITDEPTH - const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX - const int sub_bsize = bsize; -#else - const int sub_bsize = BLOCK_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - assert(bsize < BLOCK_8X8); - assert(tx_width < 8 || tx_height < 8); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (is_lossless) - assert(tx_width == 4 && tx_height == 4); - else - assert(tx_width == pred_block_width && tx_height == pred_block_height); -#else - assert(tx_width == 4 && tx_height == 4); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - - memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0])); - memcpy(tl, l, pred_height_in_transform_blocks * 
sizeof(l[0])); - - xd->mi[0]->mbmi.tx_size = tx_size; - - xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - int64_t this_rd; - int ratey = 0; - int64_t distortion = 0; - int rate = bmode_costs[mode]; - int can_skip = 1; - - if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & - (1 << mode))) - continue; - - // Only do the oblique modes if the best so far is - // one of the neighboring directional modes - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mode, *best_mode)) continue; - } - - memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); - memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); - - for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) { - for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) { - const int block_raster_idx = (row + idy) * 2 + (col + idx); - const int block = - av1_raster_order_to_block_index(tx_size, block_raster_idx); - const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; - uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; -#if !CONFIG_PVQ - int16_t *const src_diff = av1_raster_block_offset_int16( - BLOCK_8X8, block_raster_idx, p->src_diff); -#endif - int skip; - assert(block < 4); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - idx == 0 && idy == 0)); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - block == 0 || block == 2)); - xd->mi[0]->bmi[block_raster_idx].as_mode = mode; - av1_predict_intra_block( - cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, - dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0); -#if !CONFIG_PVQ - aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src, - src_stride, dst, dst_stride, xd->bd); -#endif - if (is_lossless) { - 
TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = - combine_entropy_contexts(tempa[idx], templ[idy]); -#if !CONFIG_PVQ - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, - scan_order, tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_B); - - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) - goto next_highbd; -#if CONFIG_PVQ - if (!skip) -#endif - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - DCT_DCT, tx_size, dst, dst_stride, - p->eobs[block]); - } else { - int64_t dist; - unsigned int tmp; - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = - combine_entropy_contexts(tempa[idx], templ[idy]); -#if !CONFIG_PVQ -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_B); -#else - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - 
av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, - tempa + idx, templ + idy, 1); -#endif // DISABLE_TRELLISQ_SEARCH - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, - scan_order, tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif -#if CONFIG_PVQ - if (!skip) -#endif - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, - p->eobs[block]); - cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); - dist = (int64_t)tmp << 4; - distortion += dist; - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) - goto next_highbd; - } - } - } - - rate += ratey; - this_rd = RDCOST(x->rdmult, rate, distortion); - - if (this_rd < best_rd) { - *bestrate = rate; - *bestratey = ratey; - *bestdistortion = distortion; - best_rd = this_rd; - best_can_skip = can_skip; - *best_mode = mode; - memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); - memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { - memcpy(best_dst16 + idy * 8, - CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), - pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); - } - } - 
next_highbd : {} -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif - } - - if (best_rd >= rd_thresh) return best_rd; - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif - - if (y_skip) *y_skip &= best_can_skip; - - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { - memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), - best_dst16 + idy * 8, - pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); - } - - return best_rd; - } -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - int64_t this_rd; - int ratey = 0; - int64_t distortion = 0; - int rate = bmode_costs[mode]; - int can_skip = 1; - - if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & - (1 << mode))) { - continue; - } - - // Only do the oblique modes if the best so far is - // one of the neighboring directional modes - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mode, *best_mode)) continue; - } - - memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); - memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); - - for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) { - for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) { - const int block_raster_idx = (row + idy) * 2 + (col + idx); - int block = av1_raster_order_to_block_index(tx_size, block_raster_idx); - const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; - uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; -#if !CONFIG_PVQ - int16_t *const src_diff = av1_raster_block_offset_int16( - BLOCK_8X8, block_raster_idx, p->src_diff); -#endif // !CONFIG_PVQ - int skip; - assert(block < 4); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - idx == 0 && idy == 0)); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - block 
== 0 || block == 2)); - xd->mi[0]->bmi[block_raster_idx].as_mode = mode; - av1_predict_intra_block(cm, xd, pd->width, pd->height, - txsize_to_bsize[tx_size], mode, dst, dst_stride, - dst, dst_stride, -#if CONFIG_CB4X4 - 2 * (col + idx), 2 * (row + idy), -#else - col + idx, row + idy, -#endif // CONFIG_CB4X4 - 0); -#if !CONFIG_PVQ - aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, - dst, dst_stride); -#endif // !CONFIG_PVQ - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); -#if CONFIG_CB4X4 - block = 4 * block; -#endif // CONFIG_CB4X4 -#if !CONFIG_PVQ -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B); -#else - const AV1_XFORM_QUANT xform_quant = - is_lossless ? 
AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, xform_quant); - - av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx, - templ + idy, 1); -#endif // DISABLE_TRELLISQ_SEARCH - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif // !CONFIG_PVQ - - if (!is_lossless) { // To use the pixel domain distortion, we need to - // calculate inverse txfm *before* calculating RD - // cost. Compared to calculating the distortion in - // the frequency domain, the overhead of encoding - // effort is low. 
-#if CONFIG_PVQ - if (!skip) -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, - p->eobs[block]); - unsigned int tmp; - cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); - const int64_t dist = (int64_t)tmp << 4; - distortion += dist; + val = src16[r * src_stride + c]; + data[r * cols + c] = val; + if (val < lb) + lb = val; + else if (val > ub) + ub = val; } - - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next; - - if (is_lossless) { // Calculate inverse txfm *after* RD cost. -#if CONFIG_PVQ - if (!skip) -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - DCT_DCT, tx_size, dst, dst_stride, - p->eobs[block]); + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + val = src[r * src_stride + c]; + data[r * cols + c] = val; + if (val < lb) + lb = val; + else if (val > ub) + ub = val; } } } - rate += ratey; - this_rd = RDCOST(x->rdmult, rate, distortion); - - if (this_rd < best_rd) { - *bestrate = rate; - *bestratey = ratey; - *bestdistortion = distortion; - best_rd = this_rd; - best_can_skip = can_skip; - *best_mode = mode; - memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); - memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) - memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, - pred_width_in_transform_blocks * 4); - } - next : {} -#if CONFIG_PVQ - 
od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - } // mode decision loop - - if (best_rd >= rd_thresh) return best_rd; - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - if (y_skip) *y_skip &= best_can_skip; - - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) - memcpy(dst_init + idy * dst_stride, best_dst + idy * 8, - pred_width_in_transform_blocks * 4); - - return best_rd; -} - -static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, - MACROBLOCK *mb, int *rate, - int *rate_y, int64_t *distortion, - int *y_skip, int64_t best_rd) { - const MACROBLOCKD *const xd = &mb->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; - MB_MODE_INFO *const mbmi = &mic->mbmi; - assert(!is_inter_block(mbmi)); - const BLOCK_SIZE bsize = mbmi->sb_type; - const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; - const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - int cost = 0; - int64_t total_distortion = 0; - int tot_rate_y = 0; - int64_t total_rd = 0; - const int *bmode_costs = mb->mbmode_cost[0]; - const int is_lossless = xd->lossless[mbmi->segment_id]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX - const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize]; -#else - const TX_SIZE tx_size = TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA - - // TODO(any): Add search of the tx_type to improve rd performance at the - // expense of speed. 
- mbmi->tx_type = DCT_DCT; - mbmi->tx_size = tx_size; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif + mbmi->mode = DC_PRED; + mbmi->filter_intra_mode_info.use_filter_intra = 0; - if (y_skip) *y_skip = 1; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); - // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this - // 8x8 coding block. - for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) { - for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) { - PREDICTION_MODE best_mode = DC_PRED; - int r = INT_MAX, ry = INT_MAX; - int64_t d = INT64_MAX, this_rd = INT64_MAX; - int j; - const int pred_block_idx = idy * 2 + idx; - if (cpi->common.frame_type == KEY_FRAME) { - const PREDICTION_MODE A = - av1_above_block_mode(mic, above_mi, pred_block_idx); - const PREDICTION_MODE L = - av1_left_block_mode(mic, left_mi, pred_block_idx); - -#if CONFIG_KF_CTX - const int above_ctx = intra_mode_context[A]; - const int left_ctx = intra_mode_context[L]; - bmode_costs = mb->y_mode_costs[above_ctx][left_ctx]; -#else - bmode_costs = mb->y_mode_costs[A][L]; -#endif + // Find the dominant colors, stored in top_colors[]. 
+ int top_colors[PALETTE_MAX_SIZE] = { 0 }; + for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) { + int max_count = 0; + for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) { + if (count_buf[j] > max_count) { + max_count = count_buf[j]; + top_colors[i] = j; + } } - this_rd = rd_pick_intra_sub_8x8_y_subblock_mode( - cpi, mb, idy, idx, &best_mode, bmode_costs, - xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, - &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); -#if CONFIG_DIST_8X8 - if (!cpi->oxcf.using_dist_8x8) -#endif - if (this_rd >= best_rd - total_rd) return INT64_MAX; - - total_rd += this_rd; - cost += r; - total_distortion += d; - tot_rate_y += ry; - - mic->bmi[pred_block_idx].as_mode = best_mode; - for (j = 1; j < pred_height_in_4x4_blocks; ++j) - mic->bmi[pred_block_idx + j * 2].as_mode = best_mode; - for (j = 1; j < pred_width_in_4x4_blocks; ++j) - mic->bmi[pred_block_idx + j].as_mode = best_mode; + assert(max_count > 0); + count_buf[top_colors[i]] = 0; + } - if (total_rd >= best_rd) return INT64_MAX; + // Try the dominant colors directly. + // TODO(huisu@google.com): Try to avoid duplicate computation in cases + // where the dominant colors and the k-means results are similar. 
+ for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { + for (i = 0; i < n; ++i) centroids[i] = top_colors[i]; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, + distortion, skippable, ctx, best_blk_skip); } - } - mbmi->mode = mic->bmi[3].as_mode; -#if CONFIG_DIST_8X8 - if (cpi->oxcf.using_dist_8x8) { - const struct macroblock_plane *p = &mb->plane[0]; - const struct macroblockd_plane *pd = &xd->plane[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - uint8_t *src = p->src.buf; - uint8_t *dst = pd->dst.buf; - - // Daala-defined distortion computed for the block of 8x8 pixels - total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride, - BLOCK_8X8, 8, 8, 8, 8, mb->qindex) - << 4; - } -#endif // CONFIG_DIST_8X8 - // Add in the cost of the transform type - if (!is_lossless) { - int rate_tx_type = 0; -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) > - 1) { - const int eset = - get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used); -#if CONFIG_LGT_FROM_PRED - if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size)) - rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode] - [mbmi->use_lgt]; - if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt) -#endif // CONFIG_LGT_FROM_PRED - rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]] - [mbmi->mode][mbmi->tx_type]; + // K-means clustering. + for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. 
+ assert(colors == n); + assert(colors == 2); + centroids[0] = lb; + centroids[1] = ub; + } else { + for (i = 0; i < n; ++i) { + centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); + } + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, + distortion, skippable, ctx, best_blk_skip); } -#else - rate_tx_type = - mb->intra_tx_type_costs[txsize_sqr_map[tx_size]] - [intra_mode_to_tx_type_context[mbmi->mode]] - [mbmi->tx_type]; -#endif // CONFIG_EXT_TX - assert(mbmi->tx_size == tx_size); - cost += rate_tx_type; - tot_rate_y += rate_tx_type; } - *rate = cost; - *rate_y = tot_rate_y; - *distortion = total_distortion; - - return RDCOST(mb->rdmult, cost, total_distortion); + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + block_width * block_height * sizeof(best_palette_color_map[0])); + } + *mbmi = *best_mbmi; + return rate_overhead; } -#if CONFIG_FILTER_INTRA // Return 1 if an filter intra mode is selected; return 0 otherwise. 
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int mode_cost, int64_t *best_rd, int64_t *best_model_rd, - uint16_t skip_mask) { + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *mbmi = &mic->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; int filter_intra_selected_flag = 0; FILTER_INTRA_MODE mode; - TX_SIZE best_tx_size = TX_4X4; + TX_SIZE best_tx_size = TX_8X8; FILTER_INTRA_MODE_INFO filter_intra_mode_info; - TX_TYPE best_tx_type; -#if CONFIG_LGT_FROM_PRED - int use_lgt_when_selected; -#endif - + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + (void)ctx; av1_zero(filter_intra_mode_info); - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1; + mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->mode = DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { - int this_rate; int64_t this_rd, this_model_rd; RD_STATS tokenonly_rd_stats; - if (skip_mask & (1 << mode)) continue; - mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode; + mbmi->filter_intra_mode_info.filter_intra_mode = mode; this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) @@ -3879,19 +3657,19 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; - this_rate = tokenonly_rd_stats.rate + - av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) + - write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost; + const int this_rate = + tokenonly_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); this_rd = RDCOST(x->rdmult, this_rate, 
tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; - best_tx_size = mic->mbmi.tx_size; + best_tx_size = mbmi->tx_size; filter_intra_mode_info = mbmi->filter_intra_mode_info; - best_tx_type = mic->mbmi.tx_type; -#if CONFIG_LGT_FROM_PRED - use_lgt_when_selected = mic->mbmi.use_lgt; -#endif + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); *rate = this_rate; *rate_tokenonly = tokenonly_rd_stats.rate; *distortion = tokenonly_rd_stats.dist; @@ -3903,43 +3681,31 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (filter_intra_selected_flag) { mbmi->mode = DC_PRED; mbmi->tx_size = best_tx_size; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = use_lgt_when_selected; -#endif - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = - filter_intra_mode_info.use_filter_intra_mode[0]; - mbmi->filter_intra_mode_info.filter_intra_mode[0] = - filter_intra_mode_info.filter_intra_mode[0]; - mbmi->tx_type = best_tx_type; + mbmi->filter_intra_mode_info = filter_intra_mode_info; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); return 1; } else { return 0; } } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA // Run RD calculation with given luma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. 
static int64_t calc_rd_given_intra_angle( const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - TX_TYPE *best_tx_type, -#if CONFIG_LGT_FROM_PRED - int *use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - INTRA_FILTER *best_filter, -#endif // CONFIG_INTRA_INTERP - int64_t *best_rd, int64_t *best_model_rd) { + int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type, + uint8_t *best_blk_skip) { int this_rate; RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; - MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; + const int n4 = bsize_to_num_blk(bsize); assert(!is_inter_block(mbmi)); - mbmi->angle_delta[0] = angle_delta; + mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) @@ -3948,22 +3714,19 @@ static int64_t calc_rd_given_intra_angle( super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - this_rate = tokenonly_rd_stats.rate + mode_cost + - write_uniform_cost(2 * max_angle_delta + 1, - mbmi->angle_delta[0] + max_angle_delta); + this_rate = + tokenonly_rd_stats.rate + mode_cost + + x->angle_delta_cost[mbmi->mode - V_PRED] + [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]]; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4); *best_rd = this_rd; - *best_angle_delta = mbmi->angle_delta[0]; + *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y]; *best_tx_size = mbmi->tx_size; -#if CONFIG_INTRA_INTERP - *best_filter = mbmi->intra_filter; -#endif // 
CONFIG_INTRA_INTERP - *best_tx_type = mbmi->tx_type; -#if CONFIG_LGT_FROM_PRED - *use_lgt_when_selected = mbmi->use_lgt; -#endif *rate = this_rate; rd_stats->rate = tokenonly_rd_stats.rate; rd_stats->dist = tokenonly_rd_stats.dist; @@ -3980,131 +3743,60 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_rd, int64_t *best_model_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *mbmi = &mic->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int first_try = 1; -#if CONFIG_INTRA_INTERP - int p_angle; - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; - TX_SIZE best_tx_size = mic->mbmi.tx_size; - TX_TYPE best_tx_type = mbmi->tx_type; -#if CONFIG_LGT_FROM_PRED - int use_lgt_when_selected = mbmi->use_lgt; -#endif + TX_SIZE best_tx_size = mbmi->tx_size; + const int n4 = bsize_to_num_blk(bsize); + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { -#if CONFIG_INTRA_INTERP - for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { - if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; - mic->mbmi.intra_filter = filter; -#endif // CONFIG_INTRA_INTERP - for (i = 0; i < 2; ++i) { - best_rd_in = (best_rd == INT64_MAX) - ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 
3 : 5))); - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, -#if CONFIG_INTRA_INTERP - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], -#else - mode_cost, -#endif // CONFIG_INTRA_INTERP - best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, - rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - &best_filter, -#endif // CONFIG_INTRA_INTERP - &best_rd, best_model_rd); - rd_cost[2 * angle_delta + i] = this_rd; - if (first_try && this_rd == INT64_MAX) return best_rd; - first_try = 0; - if (angle_delta == 0) { - rd_cost[1] = this_rd; - break; - } + for (i = 0; i < 2; ++i) { + best_rd_in = (best_rd == INT64_MAX) + ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 3 : 5))); + this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_txk_type, best_blk_skip); + rd_cost[2 * angle_delta + i] = this_rd; + if (first_try && this_rd == INT64_MAX) return best_rd; + first_try = 0; + if (angle_delta == 0) { + rd_cost[1] = this_rd; + break; } -#if CONFIG_INTRA_INTERP } -#endif // CONFIG_INTRA_INTERP } assert(best_rd != INT64_MAX); for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { int64_t rd_thresh; -#if CONFIG_INTRA_INTERP - for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { - if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; - mic->mbmi.intra_filter = filter; -#endif // CONFIG_INTRA_INTERP - for (i = 0; i < 2; ++i) { - int skip_search = 0; - rd_thresh = best_rd + (best_rd >> 5); - if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && - rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) - skip_search = 1; - if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, -#if CONFIG_INTRA_INTERP - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], 
-#else - mode_cost, -#endif // CONFIG_INTRA_INTERP - best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, - rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - &best_filter, -#endif // CONFIG_INTRA_INTERP - &best_rd, best_model_rd); - } - } -#if CONFIG_INTRA_INTERP - } -#endif // CONFIG_INTRA_INTERP - } - -#if CONFIG_INTRA_INTERP - if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) { - p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) { - mic->mbmi.intra_filter = filter; - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd, - best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, - &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif - &best_filter, &best_rd, best_model_rd); + for (i = 0; i < 2; ++i) { + int skip_search = 0; + rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_txk_type, best_blk_skip); } } } -#endif // CONFIG_INTRA_INTERP mbmi->tx_size = best_tx_size; - mbmi->angle_delta[0] = best_angle_delta; -#if CONFIG_INTRA_INTERP - mic->mbmi.intra_filter = best_filter; -#endif // CONFIG_INTRA_INTERP - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = use_lgt_when_selected; -#endif + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, 
best_blk_skip, sizeof(best_blk_skip[0]) * n4); return best_rd; } @@ -4173,7 +3865,7 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (av1_is_directional_mode(i, bsize)) { + if (av1_is_directional_mode(i)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -4191,7 +3883,6 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, } } -#if CONFIG_HIGHBITDEPTH static void highbd_angle_estimation(const uint8_t *src8, int src_stride, int rows, int cols, BLOCK_SIZE bsize, uint8_t *directional_mode_skip_mask) { @@ -4229,7 +3920,7 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (av1_is_directional_mode(i, bsize)) { + if (av1_is_directional_mode(i)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -4246,119 +3937,102 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, } } } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_EXT_INTRA + +// Given selected prediction mode, search for the best tx type and size. 
+static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS rd_stats; + super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd); + if (rd_stats.rate == INT_MAX) return; + int this_rate_tokenonly = rd_stats.rate; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); + } + const int this_rate = + rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); + if (this_rd < *best_rd) { + *best_mbmi = *mbmi; + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = rd_stats.dist; + *skippable = rd_stats.skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } +} // This function is used only for intra_only frames static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t best_rd) { + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mic->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); - MB_MODE_INFO best_mbmi = *mbmi; int64_t best_model_rd = INT64_MAX; -#if CONFIG_EXT_INTRA const int rows = block_size_high[bsize]; const int cols = 
block_size_wide[bsize]; -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); -#endif // CONFIG_INTRA_INTERP int is_directional_mode; uint8_t directional_mode_skip_mask[INTRA_MODES]; const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA int beat_best_rd = 0; - uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1; -#endif // CONFIG_FILTER_INTRA const int *bmode_costs; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - int palette_y_mode_ctx = 0; const int try_palette = av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); uint8_t *best_palette_color_map = try_palette ? x->palette_buffer->best_palette_color_map : NULL; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; - const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0); - const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0); - const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1; -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - -#if CONFIG_KF_CTX + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[A]; const int left_ctx = intra_mode_context[L]; bmode_costs = x->y_mode_costs[above_ctx][left_ctx]; -#else - bmode_costs = x->y_mode_costs[A][L]; -#endif -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; -#if CONFIG_HIGHBITDEPTH + mbmi->angle_delta[PLANE_TYPE_Y] = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) highbd_angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); else -#endif // CONFIG_HIGHBITDEPTH angle_estimation(src, 
src_stride, rows, cols, bsize, directional_mode_skip_mask); -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra = 0; pmi->palette_size[0] = 0; - if (try_palette) { - if (above_mi) { - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - if (left_mi) { - palette_y_mode_ctx += - (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - } if (cpi->sf.tx_type_search.fast_intra_tx_type_search) x->use_default_intra_tx_type = 1; else x->use_default_intra_tx_type = 0; + MB_MODE_INFO best_mbmi = *mbmi; /* Y Search for intra prediction mode */ - for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { + for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; - if (mode_idx == FINAL_MODE_SEARCH) { - if (x->use_default_intra_tx_type == 0) break; - mbmi->mode = best_mbmi.mode; - x->use_default_intra_tx_type = 0; - } else { - assert(mode_idx < INTRA_MODES); - mbmi->mode = intra_rd_search_mode_order[mode_idx]; - } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; -#endif // CONFIG_EXT_INTRA + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); if (best_model_rd != INT64_MAX && this_model_rd > best_model_rd + (best_model_rd >> 1)) continue; if (this_model_rd < best_model_rd) best_model_rd = this_model_rd; -#if CONFIG_EXT_INTRA - is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); + is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && 
av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; @@ -4367,97 +4041,61 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } else { super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); } -#else - super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); -#endif // CONFIG_EXT_INTRA this_rate_tokenonly = this_rd_stats.rate; this_distortion = this_rd_stats.dist; s = this_rd_stats.skip; if (this_rate_tokenonly == INT_MAX) continue; - this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode]; - if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. - this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - if (try_palette && mbmi->mode == DC_PRED) { - this_rate += - av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8] - [palette_y_mode_ctx], - 0); - } -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) - this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0); -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA - if (is_directional_mode) { -#if CONFIG_INTRA_INTERP - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; -#endif // CONFIG_INTRA_INTERP - if (av1_use_angle_delta(bsize)) { - this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_INTRABC - if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools) - this_rate += x->intrabc_cost[0]; -#endif // CONFIG_INTRABC - this_rd = RDCOST(x->rdmult, this_rate, this_distortion); -#if CONFIG_FILTER_INTRA - if (best_rd == INT64_MAX || this_rd - 
best_rd < (best_rd >> 4)) { - filter_intra_mode_skip_mask ^= (1 << mbmi->mode); + this_rate_tokenonly -= + tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); } -#endif // CONFIG_FILTER_INTRA - + this_rate = + this_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + this_rd = RDCOST(x->rdmult, this_rate, this_distortion); if (this_rd < best_rd) { best_mbmi = *mbmi; best_rd = this_rd; -#if CONFIG_FILTER_INTRA beat_best_rd = 1; -#endif // CONFIG_FILTER_INTRA *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - if (try_palette) { - rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, - bmode_costs[DC_PRED], &best_mbmi, + rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, &best_rd, &best_model_rd, - rate, rate_tokenonly, distortion, skippable); + rate, rate_tokenonly, distortion, skippable, ctx, + ctx->blk_skip); } -#if CONFIG_FILTER_INTRA - if (beat_best_rd) { + if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, - filter_intra_mode_skip_mask)) { + &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } -#endif // CONFIG_FILTER_INTRA + + // If previous searches use only the default tx type, do an extra search for + // the best tx type. 
+ if (x->use_default_intra_tx_type) { + *mbmi = best_mbmi; + x->use_default_intra_tx_type = 0; + intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly, + distortion, skippable, &best_mbmi, ctx); + } *mbmi = best_mbmi; return best_rd; @@ -4469,33 +4107,29 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); int plane; int is_cost_valid = 1; av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) is_cost_valid = 0; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) return is_cost_valid; - bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); -#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); -#if !CONFIG_PVQ if (is_inter_block(mbmi) && is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) av1_subtract_plane(x, bsize, plane); } -#endif // !CONFIG_PVQ if (is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { RD_STATS pn_rd_stats; txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize, - uv_tx_size, cpi->sf.use_fast_coef_costing); + uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); if (pn_rd_stats.rate == INT_MAX) { is_cost_valid = 0; break; @@ -4517,283 +4151,222 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, return is_cost_valid; } -#if CONFIG_VAR_TX -void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, - int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) { - const 
AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; +static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, + int blk_row, int blk_col, int plane, int block, + int plane_bsize, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, + TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - -#if CONFIG_TXK_SEL - av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, a, l, 0, rd_stats); - return; -#endif - - int64_t tmp; - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size]; - int bh = block_size_high[txm_bsize]; - int bw = block_size_wide[txm_bsize]; - int src_stride = p->src.stride; - uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - uint8_t *dst = - &pd->dst - .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]); - uint8_t *rec_buffer; -#else - DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *diff = - &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - int txb_coeff_cost; - - assert(tx_size < TX_SIZES_ALL); - - int coeff_ctx = get_entropy_context(tx_size, a, l); - - tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, - 
plane_bsize, txm_bsize); - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH - rd_stats->sse += tmp << 4; - - if (rd_stats->invalid_rate) { - rd_stats->dist += tmp << 4; - rd_stats->rate += rd_stats->zero_rate; - rd_stats->skip = 1; - return; - } - -// TODO(any): Use av1_dist_block to compute distortion -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16); - aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, - 0, NULL, 0, bw, bh, xd->bd); - } else { - rec_buffer = (uint8_t *)rec_buffer16; - aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, - NULL, 0, bw, bh); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + const uint16_t cur_joint_ctx = + (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx; + + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + // Look up RD and terminate early in case when we've already processed exactly + // the same residual with exactly the same entropy context. 
+ if (rd_info_array != NULL && rd_info_array->valid && + rd_info_array->entropy_context == cur_joint_ctx) { + if (plane == 0) + x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, + tx_size, cpi->common.reduced_tx_set_used); + if (ref_tx_type == rd_info_array->tx_type) { + rd_stats->rate += rd_info_array->rate; + rd_stats->dist += rd_info_array->dist; + rd_stats->sse += rd_info_array->sse; + rd_stats->skip &= rd_info_array->eob == 0; + p->eobs[block] = rd_info_array->eob; + p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx; + return; + } } -#else - aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, - 0, bw, bh); -#endif // CONFIG_HIGHBITDEPTH - -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_B); - -#else - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - - const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - const int buffer_length = tx_size_2d[tx_size]; - int64_t tmp_dist, tmp_sse; -#if CONFIG_DIST_8X8 - int disable_early_skip = - x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; -#endif // CONFIG_DIST_8X8 - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd); - else -#endif - tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse); - - tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift); -#if CONFIG_MRC_TX - if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) { - av1_invalid_rd_stats(rd_stats); - return; - } -#endif // CONFIG_MRC_TX - if ( -#if 
CONFIG_DIST_8X8 - disable_early_skip || -#endif - RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - } else { - rd_stats->rate += rd_stats->zero_rate; - rd_stats->dist += tmp << 4; - rd_stats->skip = 1; - rd_stats->invalid_rate = 1; - return; + RD_STATS this_rd_stats; + search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + + av1_merge_rd_stats(rd_stats, &this_rd_stats); + + // Save RD results for possible reuse in future. + if (rd_info_array != NULL) { + rd_info_array->valid = 1; + rd_info_array->entropy_context = cur_joint_ctx; + rd_info_array->rate = this_rd_stats.rate; + rd_info_array->dist = this_rd_stats.dist; + rd_info_array->sse = this_rd_stats.sse; + rd_info_array->eob = p->eobs[block]; + rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block]; + if (plane == 0) { + rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx]; + } + } +} + +static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh, + float *mean, float *dev) { + int x_sum = 0; + uint64_t x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const int val = data[j]; + x_sum += val; + x2_sum += val * val; + } + data += stride; + } + + const int num = bw * bh; + const float e_x = (float)x_sum / num; + const float e_x2 = (float)((double)x2_sum / num); + const float diff = e_x2 - e_x * e_x; + *dev = (diff > 0) ? 
sqrtf(diff) : 0; + *mean = e_x; +} + +static void get_mean_and_dev_float(const float *data, int stride, int bw, + int bh, float *mean, float *dev) { + float x_sum = 0; + float x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const float val = data[j]; + x_sum += val; + x2_sum += val * val; + } + data += stride; + } + + const int num = bw * bh; + const float e_x = x_sum / num; + const float e_x2 = x2_sum / num; + const float diff = e_x2 - e_x * e_x; + *dev = (diff > 0) ? sqrtf(diff) : 0; + *mean = e_x; +} + +// Feature used by the model to predict tx split: the mean and standard +// deviation values of the block and sub-blocks. +static void get_mean_dev_features(const int16_t *data, int stride, int bw, + int bh, int levels, float *feature) { + int feature_idx = 0; + int width = bw; + int height = bh; + const int16_t *const data_ptr = &data[0]; + for (int lv = 0; lv < levels; ++lv) { + if (width < 2 || height < 2) break; + float mean_buf[16]; + float dev_buf[16]; + int blk_idx = 0; + for (int row = 0; row < bh; row += height) { + for (int col = 0; col < bw; col += width) { + float mean, dev; + get_mean_and_dev(data_ptr + row * stride + col, stride, width, height, + &mean, &dev); + feature[feature_idx++] = mean; + feature[feature_idx++] = dev; + mean_buf[blk_idx] = mean; + dev_buf[blk_idx++] = dev; + } + } + if (blk_idx > 1) { + float mean, dev; + // Deviation of means. + get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev); + feature[feature_idx++] = dev; + // Mean of deviations. + get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev); + feature[feature_idx++] = mean; + } + // Reduce the block size when proceeding to the next level. 
+ if (height == width) { + height = height >> 1; + width = width >> 1; + } else if (height > width) { + height = height >> 1; + } else { + width = width >> 1; + } } -#endif // DISABLE_TRELLISQ_SEARCH +} - const int eob = p->eobs[block]; +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob); - if (eob > 0) { -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) { - // Save sub8x8 luma decoded pixels - // since 8x8 luma decoded pixels are not available for daala-dist - // after recursive split of BLOCK_8x8 is done. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *decoded = &pd->pred[pred_idx]; - int i, j; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + aom_clear_system_state(); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - decoded[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i]; - } else { -#endif - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_DIST_8X8 - tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, txm_bsize); - } - rd_stats->dist += tmp * 
16; - txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, - tx_size, scan_order, a, l, 0); - rd_stats->rate += txb_coeff_cost; - rd_stats->skip &= (eob == 0); + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, 2, features); -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, - txb_coeff_cost); -#endif // CONFIG_RD_DEBUG + float score = 0.0f; + av1_nn_predict(features, nn_config, &score); + if (score > 8.0f) return 100; + if (score < -8.0f) return 0; + score = 1.0f / (1.0f + (float)exp(-score)); + return (int)(score * 100); } +// Search for the best tx partition/type for a given luma block. static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, - int blk_col, int plane, int block, TX_SIZE tx_size, - int depth, BLOCK_SIZE plane_bsize, - ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, - TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, - RD_STATS *rd_stats, int64_t ref_best_rd, - int *is_cost_valid) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE(*const inter_tx_size) - [MAX_MIB_SIZE] = - (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col]; - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - int64_t this_rd = INT64_MAX; - ENTROPY_CONTEXT *pta = ta + blk_col; - ENTROPY_CONTEXT *ptl = tl + blk_row; - int i; - int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, - mbmi->sb_type, tx_size); - int64_t sum_rd = INT64_MAX; - int tmp_eob = 0; - int zero_blk_rate; - RD_STATS sum_rd_stats; -#if CONFIG_TXK_SEL - 
TX_TYPE best_tx_type = TX_TYPES; - int txk_idx = (blk_row << 4) + blk_col; -#endif -#if CONFIG_RECT_TX_EXT - TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type]; - int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && - tx_size == max_txsize_rect_lookup[mbmi->sb_type] && - quarter_txsize != tx_size; - int is_qttx_picked = 0; - int eobs_qttx[2] = { 0, 0 }; - int skip_qttx[2] = { 0, 0 }; - int block_offset_qttx = check_qttx - ? tx_size_wide_unit[quarter_txsize] * - tx_size_high_unit[quarter_txsize] - : 0; - int blk_row_offset, blk_col_offset; - int is_wide_qttx = - tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize]; - blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0; - blk_col_offset = is_wide_qttx ? 0 : tx_size_wide_unit[quarter_txsize]; -#endif - - av1_init_rd_stats(&sum_rd_stats); - + int blk_col, int block, TX_SIZE tx_size, int depth, + BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, + TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, + int64_t ref_best_rd, int *is_cost_valid, + FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node) { assert(tx_size < TX_SIZES_ALL); - + av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) { *is_cost_valid = 0; return; } - av1_init_rd_stats(rd_stats); - + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; -#if CONFIG_LV_MAP - TX_SIZE txs_ctx = get_txsize_context(tx_size); - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx); - -#if LV_MAP_PROB - zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)] - .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; -#else - zero_blk_rate = - av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1); -#endif // LV_MAP_PROB -#else - TX_SIZE tx_size_ctx = 
txsize_sqr_map[tx_size]; - int coeff_ctx = get_entropy_context(tx_size, pta, ptl); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; -#endif - - rd_stats->ref_rdcost = ref_best_rd; - rd_stats->zero_rate = zero_blk_rate; - if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { - inter_tx_size[0][0] = tx_size; - av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, - plane_bsize, pta, ptl, rd_stats); - if (rd_stats->rate == INT_MAX) return; + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + const int try_no_split = 1; + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + + int64_t no_split_rd = INT64_MAX; + int no_split_txb_entropy_ctx = 0; + TX_TYPE no_split_tx_type = TX_TYPES; + // TX no split + if (try_no_split) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + + rd_stats->ref_rdcost = ref_best_rd; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, + ptl, rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? 
rd_info_node->rd_info_array : NULL); + assert(rd_stats->rate < INT_MAX); if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4806,187 +4379,111 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[plane][blk_row * bw + blk_col] = 1; + x->blk_skip[blk_row * bw + blk_col] = 1; p->eobs[block] = 0; -#if CONFIG_TXK_SEL - mbmi->txk_type[txk_idx] = DCT_DCT; -#endif + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); } else { - x->blk_skip[plane][blk_row * bw + blk_col] = 0; + x->blk_skip[blk_row * bw + blk_col] = 0; rd_stats->skip = 0; } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) - rd_stats->rate += - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); -#if CONFIG_RECT_TX_EXT - if (check_qttx) { - assert(blk_row == 0 && blk_col == 0); - rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0); - } -#endif - this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); -#if CONFIG_LV_MAP - tmp_eob = p->txb_entropy_ctx[block]; -#else - tmp_eob = p->eobs[block]; -#endif - -#if CONFIG_TXK_SEL - best_tx_type = mbmi->txk_type[txk_idx]; -#endif - -#if CONFIG_RECT_TX_EXT - if (check_qttx) { - assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0); - - RD_STATS rd_stats_tmp, rd_stats_qttx; - int64_t rd_qttx; - - av1_init_rd_stats(&rd_stats_qttx); - av1_init_rd_stats(&rd_stats_tmp); - - av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize, - pta, ptl, &rd_stats_qttx); - if (rd_stats->rate == INT_MAX) return; + rd_stats->rate += x->txfm_partition_cost[ctx][0]; + no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (cpi->sf.adaptive_txb_search_level && + (no_split_rd - + (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) > + ref_best_rd) { + *is_cost_valid = 0; + return; + } - tx_size_ctx = 
txsize_sqr_map[quarter_txsize]; - coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; - if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >= - RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) || - rd_stats_qttx.skip == 1) && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0, - zero_blk_rate - rd_stats_qttx.rate); -#endif // CONFIG_RD_DEBUG - rd_stats_qttx.rate = zero_blk_rate; - rd_stats_qttx.dist = rd_stats_qttx.sse; - rd_stats_qttx.skip = 1; - x->blk_skip[plane][blk_row * bw + blk_col] = 1; - skip_qttx[0] = 1; - p->eobs[block] = 0; - } else { - x->blk_skip[plane][blk_row * bw + blk_col] = 0; - skip_qttx[0] = 0; - rd_stats->skip = 0; - } - - // Second tx block - av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset, - plane, block_offset_qttx, plane_bsize, pta, ptl, - &rd_stats_tmp); - - if (rd_stats->rate == INT_MAX) return; - -#if !CONFIG_PVQ - av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl); -#endif // !CONFIG_PVQ - coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset, - ptl + blk_row_offset); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; - if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >= - RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) || - rd_stats_tmp.skip == 1) && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0, - zero_blk_rate - rd_stats_tmp.rate); -#endif // CONFIG_RD_DEBUG - rd_stats_tmp.rate = zero_blk_rate; - rd_stats_tmp.dist = rd_stats_tmp.sse; - rd_stats_tmp.skip = 1; - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1; - skip_qttx[1] = 1; - p->eobs[block_offset_qttx] = 0; - } else { - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0; - skip_qttx[1] = 
0; - rd_stats_tmp.skip = 0; - } + no_split_txb_entropy_ctx = p->txb_entropy_ctx[block]; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + no_split_tx_type = mbmi->txk_type[txk_type_idx]; - av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp); + if (cpi->sf.txb_split_cap) + if (p->eobs[block] == 0) try_split = 0; + } - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { - rd_stats_qttx.rate += - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); - } - rd_stats_qttx.rate += - av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1); - rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist); -#if CONFIG_LV_MAP - eobs_qttx[0] = p->txb_entropy_ctx[0]; - eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx]; -#else - eobs_qttx[0] = p->eobs[0]; - eobs_qttx[1] = p->eobs[block_offset_qttx]; -#endif - if (rd_qttx < this_rd) { - is_qttx_picked = 1; - this_rd = rd_qttx; - rd_stats->rate = rd_stats_qttx.rate; - rd_stats->dist = rd_stats_qttx.dist; - rd_stats->sse = rd_stats_qttx.sse; - rd_stats->skip = rd_stats_qttx.skip; - rd_stats->rdcost = rd_stats_qttx.rdcost; - } - av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl); + if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) { + const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score >= 0 && split_score < threshold) try_split = 0; } -#endif } - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH -#if CONFIG_MRC_TX - // If the tx type we are trying is MRC_DCT, we cannot partition the - // transform into anything smaller than TX_32X32 - && mbmi->tx_type != MRC_DCT -#endif // CONFIG_MRC_TX - ) { +#if COLLECT_TX_SIZE_DATA + // Do not skip tx_split when collecting tx size data. 
+ try_split = 1; +#endif + + // TX split + int64_t split_rd = INT64_MAX; + RD_STATS split_rd_stats; + av1_init_rd_stats(&split_rd_stats); + if (try_split) { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; #if CONFIG_DIST_8X8 - int sub8x8_eob[4]; + int sub8x8_eob[4] = { 0, 0, 0, 0 }; + struct macroblockd_plane *const pd = &xd->plane[0]; #endif - sum_rd_stats.rate = - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); + split_rd_stats.rate = x->txfm_partition_cost[ctx][1]; assert(tx_size < TX_SIZES_ALL); - ref_best_rd = AOMMIN(this_rd, ref_best_rd); - - for (i = 0; i < 4 && this_cost_valid; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; + ref_best_rd = AOMMIN(no_split_rd, ref_best_rd); + + int blk_idx = 0; + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + assert(blk_idx < 4); + select_tx_block( + cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, + ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, + &this_cost_valid, ftxs_mode, + (rd_info_node != NULL) ? 
rd_info_node->children[blk_idx] : NULL); - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs, - depth + 1, plane_bsize, ta, tl, tx_above, tx_left, - &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) { - sub8x8_eob[i] = p->eobs[block]; - } + if (!x->using_dist_8x8) +#endif + if (!this_cost_valid) goto LOOP_EXIT; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && tx_size == TX_8X8) { + sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; + } #endif // CONFIG_DIST_8X8 - av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); + av1_merge_rd_stats(&split_rd_stats, &this_rd_stats); - tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); + tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); #if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) + if (!x->using_dist_8x8) #endif - if (this_rd < tmp_rd) break; - block += sub_step; + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } + block += sub_step; + } } + + LOOP_EXIT : {} + #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && plane == 0 && - tx_size == TX_8X8) { + if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; @@ -4997,34 +4494,33 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; int64_t dist_8x8; - int qindex = x->qindex; + const int qindex = x->qindex; const int pred_stride = block_size_wide[plane_bsize]; const int pred_idx = (blk_row * pred_stride + blk_col) << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int j; + const int16_t *pred = &x->pred_luma[pred_idx]; + int i, j; int row, col; -#if CONFIG_HIGHBITDEPTH uint8_t *pred8; DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); -#else - 
DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); -#endif // CONFIG_HIGHBITDEPTH dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; - sum_rd_stats.sse = dist_8x8; -#if CONFIG_HIGHBITDEPTH +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.sse == dist_8x8); +#endif // DEBUG_DIST_8X8 + + split_rd_stats.sse = dist_8x8; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) pred8 = CONVERT_TO_BYTEPTR(pred8_16); else pred8 = (uint8_t *)pred8_16; -#endif -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (row = 0; row < 2; ++row) { for (col = 0; col < 2; ++col) { @@ -5047,7 +4543,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } } else { -#endif for (row = 0; row < 2; ++row) { for (col = 0; col < 2; ++col) { int idx = row * 2 + col; @@ -5066,87 +4561,99 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; - sum_rd_stats.dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); - } -#endif // CONFIG_DIST_8X8 - if (this_cost_valid) sum_rd = tmp_rd; - } - if (this_rd < sum_rd) { - int idx, idy; -#if CONFIG_RECT_TX_EXT - TX_SIZE tx_size_selected = is_qttx_picked ? 
quarter_txsize : tx_size; -#else - TX_SIZE tx_size_selected = tx_size; -#endif +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) { - assert(blk_row == 0 && blk_col == 0 && plane == 0); -#if CONFIG_LV_MAP - p->txb_entropy_ctx[0] = eobs_qttx[0]; - p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1]; -#else - p->eobs[0] = eobs_qttx[0]; - p->eobs[block_offset_qttx] = eobs_qttx[1]; -#endif - } else { -#endif -#if CONFIG_LV_MAP - p->txb_entropy_ctx[block] = tmp_eob; -#else - p->eobs[block] = tmp_eob; -#endif -#if CONFIG_RECT_TX_EXT + split_rd_stats.dist = dist_8x8; + tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); } -#endif +#endif // CONFIG_DIST_8X8 + if (this_cost_valid) split_rd = tmp_rd; + } -#if !CONFIG_PVQ - av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl); -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) - av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected, - pta + blk_col_offset, ptl + blk_row_offset); -#endif // CONFIG_RECT_TX_EXT -#endif // !CONFIG_PVQ +#if COLLECT_TX_SIZE_DATA + do { + if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break; +#if 0 + // Randomly select blocks to collect data to reduce output file size. + const int rnd_val = rand() % 2; + if (rnd_val) break; +#endif + + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + if (!within_border) break; + + FILE *fp = fopen(av1_tx_size_data_output_file, "a"); + if (!fp) break; + + // Split decision, RD cost, block type(inter/intra), q-index, rdmult, + // and block size. 
+ const int split_selected = sum_rd < this_rd; + const int is_inter = 1; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected, + (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex, + x->rdmult, is_inter, txb_w, txb_h); + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + fprintf(fp, "\n"); + + fclose(fp); + } while (0); +#endif // COLLECT_TX_SIZE_DATA + + if (no_split_rd < split_rd) { + const TX_SIZE tx_size_selected = tx_size; + p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl); txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); - inter_tx_size[0][0] = tx_size_selected; - for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = tx_size_selected; - mbmi->tx_size = tx_size_selected; -#if CONFIG_TXK_SEL - mbmi->txk_type[txk_idx] = best_tx_type; -#endif - if (this_rd == INT64_MAX) *is_cost_valid = 0; -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) { - x->blk_skip[plane][0] = skip_qttx[0]; - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1]; - } else { -#endif - x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; -#if CONFIG_RECT_TX_EXT + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size_selected; + } } -#endif + mbmi->tx_size = tx_size_selected; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, 
blk_col, tx_size, + no_split_tx_type); + x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; } else { - *rd_stats = sum_rd_stats; - if (sum_rd == INT64_MAX) *is_cost_valid = 0; + *rd_stats = split_rd_stats; + if (split_rd == INT64_MAX) *is_cost_valid = 0; } } -static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd) { +static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_tree) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; int64_t this_rd = 0; @@ -5157,48 +4664,57 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, if (is_cost_valid) { const struct macroblockd_plane *const pd = &xd->plane[0]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; int block = 0; - int init_depth = - (mi_height != mi_width) ? 
RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2]; - TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; RD_STATS pn_rd_stats; + const int init_depth = + get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); av1_init_rd_stats(&pn_rd_stats); - av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl); + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { - select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth, + select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid); - if (pn_rd_stats.rate == INT_MAX) { + &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, + ftxs_mode, rd_info_tree); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, 0, pn_rd_stats.sse)); + this_rd += + AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); block += step; + if (rd_info_tree != NULL) rd_info_tree += 1; } } } - - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, 0, rd_stats->sse)); + int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + 
if (zero_rd < this_rd) { + this_rd = zero_rd; + rd_stats->rate = rd_stats->zero_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } if (this_rd > ref_best_rd) is_cost_valid = 0; if (!is_cost_valid) { @@ -5209,541 +4725,711 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, TX_TYPE tx_type) { - const AV1_COMMON *const cm = &cpi->common; + int64_t ref_best_rd, + TXB_RD_INFO_NODE *rd_info_tree) { + const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); - aom_prob skip_prob = av1_get_skip_prob(cm, xd); - int s0 = av1_cost_bit(skip_prob, 0); - int s1 = av1_cost_bit(skip_prob, 1); + const int skip_ctx = av1_get_skip_context(xd); + int s0 = x->skip_cost[skip_ctx][0]; + int s1 = x->skip_cost[skip_ctx][1]; int64_t rd; - int row, col; - const int max_blocks_high = max_block_high(xd, bsize, 0); - const int max_blocks_wide = max_block_wide(xd, bsize, 0); - mbmi->tx_type = tx_type; - inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd); - mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]); + // TODO(debargha): enable this as a speed feature where the + // select_inter_block_yrd() function above will use a simplified search + // such as not using full optimize, but the inter_block_yrd() function + // will use more complex search given that the transform partitions have + // already been decided. + int64_t rd_thresh = ref_best_rd; + if (fast_tx_search && rd_thresh < INT64_MAX) { + if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); + } + assert(rd_thresh > 0); + + FAST_TX_SEARCH_MODE ftxs_mode = + fast_tx_search ? 
FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; + select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode, + rd_info_tree); if (rd_stats->rate == INT_MAX) return INT64_MAX; - for (row = 0; row < max_blocks_high / 2; ++row) - for (col = 0; col < max_blocks_wide / 2; ++col) - mbmi->min_tx_size = AOMMIN( - mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col])); - -#if !CONFIG_TXK_SEL -#if CONFIG_EXT_TX - if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter, - cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter, - cm->reduced_tx_set_used); -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) { - if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 && - ALLOW_INTRA_EXT_TX) - rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->mode][mbmi->use_lgt]; - if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0) - rd_stats->rate += - x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt]; - } - if (!mbmi->use_lgt) { -#endif // CONFIG_LGT_FROM_PRED - if (is_inter) { - if (ext_tx_set > 0) - rd_stats->rate += - x->inter_tx_type_costs[ext_tx_set] - [txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->tx_type]; - } else { - if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) - rd_stats->rate += - x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode] - [mbmi->tx_type]; - } - } -#if CONFIG_LGT_FROM_PRED + // If fast_tx_search is true, only DCT and 1D DCT were tested in + // select_inter_block_yrd() above. Do a better search for tx type with + // tx sizes already decided. 
+ if (fast_tx_search) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; } -#endif -#else - if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; -#endif // CONFIG_EXT_TX -#endif // CONFIG_TXK_SEL if (rd_stats->skip) rd = RDCOST(x->rdmult, s1, rd_stats->sse); else rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - !(rd_stats->skip)) + if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } -static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { - const int rows = block_size_high[bsize]; - const int cols = block_size_wide[bsize]; - const int diff_stride = cols; - const struct macroblock_plane *const p = &x->plane[0]; - const int16_t *diff = &p->src_diff[0]; - uint8_t hash_data[MAX_SB_SQUARE]; - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { - hash_data[cols * r + c] = clip_pixel(diff[c] + 128); +// Finds rd cost for a y block, given the transform size partitions +static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int block, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int64_t ref_best_rd, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + assert(tx_size < TX_SIZES_ALL); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + + 
int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + rd_stats->ref_rdcost = ref_best_rd; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta, + tl, rd_stats, ftxs_mode, ref_best_rd, NULL); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + x->blk_skip[blk_row * mi_width + blk_col] = 1; + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } else { + rd_stats->skip = 0; + x->blk_skip[blk_row * mi_width + blk_col] = 0; + } + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[ctx][0]; + av1_set_txb_context(x, 0, block, tx_size, ta, tl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + RD_STATS pn_rd_stats; + int64_t this_rd = 0; + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int 
offsetc = blk_col + col; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, + depth + 1, above_ctx, left_ctx, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist); + block += step; + } + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[ctx][1]; + } +} + +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + int is_cost_valid = 1; + int64_t this_rd = 0; + + if (ref_best_rd < 0) is_cost_valid = 0; + + av1_init_rd_stats(rd_stats); + + if (is_cost_valid) { + const struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int init_depth = + get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); + int idx, idy; + int block = 0; + int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + RD_STATS pn_rd_stats; + + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, 
xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize, + init_depth, ctxa, ctxl, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += + AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + block += step; + } } - diff += diff_stride; } - return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data, - rows * cols) - << 7) + - bsize; + int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (zero_rd < this_rd) { + this_rd = zero_rd; + rd_stats->rate = rd_stats->zero_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } + if (this_rd > ref_best_rd) is_cost_valid = 0; + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + return is_cost_valid; +} + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; } static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, const RD_STATS *const rd_stats, - TX_RD_INFO *const tx_rd_info) { + MB_RD_RECORD *tx_rd_record) { + int index; + if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; + 
++tx_rd_record->num; + } else { + index = tx_rd_record->index_start; + tx_rd_record->index_start = + (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index]; const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; tx_rd_info->hash_value = hash; - tx_rd_info->tx_type = mbmi->tx_type; tx_rd_info->tx_size = mbmi->tx_size; -#if CONFIG_VAR_TX - tx_rd_info->min_tx_size = mbmi->min_tx_size; - memcpy(tx_rd_info->blk_skip, x->blk_skip[0], + memcpy(tx_rd_info->blk_skip, x->blk_skip, sizeof(tx_rd_info->blk_skip[0]) * n4); - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - tx_rd_info->inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL + av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size); av1_copy(tx_rd_info->txk_type, mbmi->txk_type); -#endif // CONFIG_TXK_SEL tx_rd_info->rd_stats = *rd_stats; } -static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info, +static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, RD_STATS *const rd_stats, MACROBLOCK *const x) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - mbmi->tx_type = tx_rd_info->tx_type; + MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = tx_rd_info->tx_size; -#if CONFIG_VAR_TX - mbmi->min_tx_size = tx_rd_info->min_tx_size; - memcpy(x->blk_skip[0], tx_rd_info->blk_skip, + memcpy(x->blk_skip, tx_rd_info->blk_skip, sizeof(tx_rd_info->blk_skip[0]) * n4); - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = tx_rd_info->inter_tx_size[idy][idx]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL + av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); av1_copy(mbmi->txk_type, tx_rd_info->txk_type); -#endif // CONFIG_TXK_SEL *rd_stats = 
tx_rd_info->rd_stats; } +static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, + const uint32_t hash) { + // Linear search through the circular buffer to find matching hash. + int index; + for (int i = cur_record->num - 1; i >= 0; i--) { + index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN; + if (cur_record->hash_vals[index] == hash) return index; + } + + // If not found - add new RD info into the buffer and return its index + if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { + index = (cur_record->index_start + cur_record->num) % + TX_SIZE_RD_RECORD_BUFFER_LEN; + cur_record->num++; + } else { + index = cur_record->index_start; + cur_record->index_start = + (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN; + } + + cur_record->hash_vals[index] = hash; + av1_zero(cur_record->tx_rd_info[index]); + return index; +} + +// Go through all TX blocks that could be used in TX size search, compute +// residual hash values for them and find matching RD info that stores previous +// RD search results for these TX blocks. The idea is to prevent repeated +// rate/distortion computations that happen because of the combination of +// partition and TX size search. The resulting RD info records are returned in +// the form of a quadtree for easier access in actual TX size search. 
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, TXB_RD_INFO_NODE *dst_rd_info) { + TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8, + x->txb_rd_record_16X16, + x->txb_rd_record_32X32, + x->txb_rd_record_64X64 }; + const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + + // Hashing is performed only for square TX sizes larger than TX_4X4 + if (max_square_tx_size < TX_8X8) return 0; + + const int bw_mi = mi_size_wide[bsize]; + const int diff_stride = bw; + const struct macroblock_plane *const p = &x->plane[0]; + const int16_t *diff = &p->src_diff[0]; + + // Coordinates of the top-left corner of current block within the superblock + // measured in pixels: + const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; + const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; + int cur_rd_info_idx = 0; + int cur_tx_depth = 0; + uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; + while (cur_tx_depth <= MAX_VARTX_DEPTH) { + const int cur_tx_bw = tx_size_wide[cur_tx_size]; + const int cur_tx_bh = tx_size_high[cur_tx_size]; + if (cur_tx_bw < 8 || cur_tx_bh < 8) break; + const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + for (int row = 0; row < bh; row += cur_tx_bh) { + for (int col = 0; col < bw; col += cur_tx_bw) { + if (cur_tx_bw != cur_tx_bh) { + // Use dummy nodes for all rectangular transforms within the + // TX size search tree. + dst_rd_info[cur_rd_info_idx].rd_info_array = NULL; + } else { + // Get spatial location of this TX block within the superblock + // (measured in cur_tx_bsize units). 
+ const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh; + const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw; + + int16_t hash_data[MAX_SB_SQUARE]; + int16_t *cur_hash_row = hash_data; + const int16_t *cur_diff_row = diff + row * diff_stride + col; + for (int i = 0; i < cur_tx_bh; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw); + cur_hash_row += cur_tx_bw; + cur_diff_row += diff_stride; + } + const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)hash_data, + 2 * cur_tx_bw * cur_tx_bh); + + // Find corresponding RD info based on the hash value. + const int rd_record_idx = + row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) + + col_in_sb; + + int idx = find_tx_size_rd_info( + &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash); + dst_rd_info[cur_rd_info_idx].rd_info_array = + &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx] + .tx_rd_info[idx]; + } + + // Update the output quadtree RD info structure. + av1_zero(dst_rd_info[cur_rd_info_idx].children); + const int this_mi_row = row / MI_SIZE; + const int this_mi_col = col / MI_SIZE; + if (cur_tx_depth > 0) { // Set up child pointers. + const int mi_index = this_mi_row * bw_mi + this_mi_col; + const int child_idx = child_idx_buf[mi_index]; + assert(child_idx < 4); + dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] = + &dst_rd_info[cur_rd_info_idx]; + } + if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx. 
+ const int tx_bh_mi = cur_tx_bh / MI_SIZE; + const int tx_bw_mi = cur_tx_bw / MI_SIZE; + for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) { + memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx, + tx_bw_mi); + } + int child_idx = 0; + const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size]; + const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size]; + for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; + i += next_tx_bh_mi) { + for (int j = this_mi_col; j < this_mi_col + tx_bw_mi; + j += next_tx_bw_mi) { + assert(child_idx < 4); + child_idx_buf[i * bw_mi + j] = child_idx++; + } + } + } + ++cur_rd_info_idx; + } + } + cur_tx_size = next_tx_size; + ++cur_tx_depth; + } + return 1; +} + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_flag +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + // Uses simple features on top of DCT coefficients to quickly predict // whether optimal RD decision is to skip encoding the residual. 
-static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) { - if (bsize > BLOCK_16X16) return 0; - // Tuned for target false-positive rate of 5% for all block sizes: - const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 }; - const struct macroblock_plane *const p = &x->plane[0]; +// The sse value is stored in dist. +static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - tran_low_t DCT_coefs[32 * 32]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. + const int16_t normalized_dc_q = dc_q >> 3; + const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; + // Predict not to skip when mse is larger than threshold. 
+ if (mse > mse_thresh) return 0; + + const int max_tx_size = max_predict_sf_tx_size[bsize]; + const int tx_h = tx_size_high[max_tx_size]; + const int tx_w = tx_size_wide[max_tx_size]; + DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); TxfmParam param; param.tx_type = DCT_DCT; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - param.tx_size = max_txsize_rect_lookup[bsize]; -#else - param.tx_size = max_txsize_lookup[bsize]; -#endif - param.bd = 8; + param.tx_size = max_tx_size; + param.bd = xd->bd; + param.is_hbd = get_bitdepth_data_path_index(xd); param.lossless = 0; - av1_fwd_txfm(p->src_diff, DCT_coefs, bw, ¶m); - - uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8); - uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8); - uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc; - for (int i = 1; i < bw * bh; i++) { - uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac; - if (cur_quantized_coef > max_quantized_coef) - max_quantized_coef = cur_quantized_coef; + param.tx_set_type = av1_get_ext_tx_set_type( + param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); + const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 
1 : 2); + const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; + const int16_t *src_diff = x->plane[0].src_diff; + const int n_coeff = tx_w * tx_h; + const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const uint32_t dc_thresh = max_qcoef_thresh * dc_q; + const uint32_t ac_thresh = max_qcoef_thresh * ac_q; + for (int row = 0; row < bh; row += tx_h) { + for (int col = 0; col < bw; col += tx_w) { + av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); + // Operating on TX domain, not pixels; we want the QTX quantizers + const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); + if (dc_coef >= dc_thresh) return 0; + for (int i = 1; i < n_coeff; ++i) { + const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); + if (ac_coef >= ac_thresh) return 0; + } + } + src_diff += tx_h * bw; } - - return max_quantized_coef < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)]; + return 1; } // Used to set proper context for early termination with skip = 1. -static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, int bsize) { +static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, + int64_t dist) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int n4 = bsize_to_num_blk(bsize); -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; -#else - const TX_SIZE tx_size = max_txsize_lookup[bsize]; -#endif - mbmi->tx_type = DCT_DCT; - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = tx_size; + memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); + memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; - mbmi->min_tx_size = get_min_tx_size(tx_size); - memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4); + memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); 
rd_stats->skip = 1; // Rate. - const int tx_size_ctx = txsize_sqr_map[tx_size]; - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl); - int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl); - int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0]; + const int tx_size_ctx = get_txsize_entropy_ctx(tx_size); + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); + TXB_CTX txb_ctx; + // Because plane is 0, plane_bsize equal to bsize + get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx); + int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; if (tx_size > TX_4X4) { int ctx = txfm_partition_context( xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); - rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); - } -#if !CONFIG_TXK_SEL -#if CONFIG_EXT_TX - const AV1_COMMON *cm = &cpi->common; - const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1, - cm->reduced_tx_set_used); - if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) > - 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - if (ext_tx_set > 0) - rate += - x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->tx_type]; + rate += x->txfm_partition_cost[ctx][0]; } -#else - if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; -#endif // CONFIG_EXT_TX -#endif // CONFIG_TXK_SEL rd_stats->rate = rate; - - // Distortion. 
- int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff, - block_size_wide[bsize], 0, 0, bsize, bsize); -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH - rd_stats->dist = rd_stats->sse = (tmp << 4); + dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); + rd_stats->dist = rd_stats->sse = (dist << 4); } static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd) { + RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t rd = INT64_MAX; int64_t best_rd = INT64_MAX; - TX_TYPE tx_type, best_tx_type = DCT_DCT; const int is_inter = is_inter_block(mbmi); - TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE]; - TX_SIZE best_tx = max_txsize_lookup[bsize]; - TX_SIZE best_min_tx_size = TX_SIZES_ALL; - uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - TX_TYPE txk_start = DCT_DCT; -#if CONFIG_TXK_SEL - TX_TYPE txk_end = DCT_DCT + 1; -#else - TX_TYPE txk_end = TX_TYPES; -#endif const int n4 = bsize_to_num_blk(bsize); - int idx, idy; - int prune = 0; -#if CONFIG_EXT_TX - const TxSetType tx_set_type = get_ext_tx_set_type( - max_tx_size, bsize, is_inter, cm->reduced_tx_set_used); - const int ext_tx_set = - get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX + // Get the tx_size 1 level down + const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]]; + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= 
xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); av1_invalid_rd_stats(rd_stats); -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; - int search_lgt = is_inter - ? LGT_FROM_PRED_INTER && - (!cpi->sf.tx_type_search.prune_mode > NO_PRUNE) - : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX; -#endif // CONFIG_LGT_FROM_PRED + if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) { + int model_rate; + int64_t model_dist; + int model_skip; + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, + &model_skip, NULL, NULL, NULL, NULL); + const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); + // If the modeled rd is a lot worse than the best so far, breakout. + // TODO(debargha, urvang): Improve the model and make the check below + // tighter. + assert(cpi->sf.model_based_prune_tx_search_level >= 0 && + cpi->sf.model_based_prune_tx_search_level <= 2); + if (!model_skip && + model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) > + ref_best_rd) + return; + } const uint32_t hash = get_block_residue_hash(x, bsize); - TX_RD_RECORD *tx_rd_record = &x->tx_rd_record; + MB_RD_RECORD *mb_rd_record = &x->mb_rd_record; - if (ref_best_rd != INT64_MAX) { - for (int i = 0; i < tx_rd_record->num; ++i) { - const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; // If there is a match in the tx_rd_record, fetch the RD decision and // terminate early. 
- if (tx_rd_record->tx_rd_info[index].hash_value == hash) { - TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index]; + if (mb_rd_record->tx_rd_info[index].hash_value == hash) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index]; fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); return; } } } -// If we predict that skip is the optimal RD decision - set the respective -// context and terminate early. -#if CONFIG_HIGHBITDEPTH - if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)) -#endif // CONFIG_HIGHBITDEPTH - { - if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction && - predict_skip_flag_8bit(x, bsize)) { - set_skip_flag(cpi, x, rd_stats, bsize); - return; - } + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction && + predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) { + set_skip_flag(x, rd_stats, bsize, dist); + // Save the RD search results into tx_rd_record. 
+ if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + return; } - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) -#if CONFIG_EXT_TX - prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set); -#else - prune = prune_tx_types(cpi, bsize, x, xd, 0); -#endif // CONFIG_EXT_TX - - int found = 0; - - for (tx_type = txk_start; tx_type < txk_end; ++tx_type) { - RD_STATS this_rd_stats; - av1_init_rd_stats(&this_rd_stats); -#if CONFIG_MRC_TX - // MRC_DCT only implemented for TX_32X32 so only include this tx in - // the search for TX_32X32 - if (tx_type == MRC_DCT && - (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) || - (!is_inter && !USE_MRC_INTRA))) - continue; -#endif // CONFIG_MRC_TX -#if CONFIG_EXT_TX - if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; - if (is_inter) { - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { - if (!do_tx_type_search(tx_type, prune)) continue; - } - } else { - if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) { - if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; - } - } -#else // CONFIG_EXT_TX - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && - !do_tx_type_search(tx_type, prune)) - continue; -#endif // CONFIG_EXT_TX - if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, max_tx_size)) - continue; + // Precompute residual hashes and find existing or add new RD records to + // store and reuse rate and distortion values to speed up TX size search. 
+ TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256]; + int found_rd_info = 0; + if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) { + found_rd_info = + find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info); + } - if (xd->lossless[mbmi->segment_id]) - if (tx_type != DCT_DCT) continue; + prune_tx(cpi, bsize, x, xd, tx_set_type); - rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, - tx_type); - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { - best_rd = rd; - *rd_stats = this_rd_stats; - best_tx_type = mbmi->tx_type; - best_tx = mbmi->tx_size; - best_min_tx_size = mbmi->min_tx_size; - memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4); - found = 1; - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; - } + int found = 0; + + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, + found_rd_info ? matched_rd_info : NULL); + + ref_best_rd = AOMMIN(rd, ref_best_rd); + if (rd < best_rd) { + *rd_stats = this_rd_stats; + found = 1; } + // Reset the pruning flags. 
+ av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; + // We should always find at least one candidate unless ref_best_rd is less // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type // might have failed to find something better) assert(IMPLIES(!found, ref_best_rd != INT64_MAX)); if (!found) return; -#if CONFIG_LGT_FROM_PRED - if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) && - !cm->reduced_tx_set_used) { - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0); - if (rd < best_rd) { - best_rd = rd; - *rd_stats = this_rd_stats; - best_tx = mbmi->tx_size; - best_min_tx_size = mbmi->min_tx_size; - memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; - } else { - mbmi->use_lgt = 0; - } - } -#endif // CONFIG_LGT_FROM_PRED - // We found a candidate transform to use. Copy our results from the "best" - // array into mbmi. - mbmi->tx_type = best_tx_type; - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx]; - mbmi->tx_size = best_tx; - mbmi->min_tx_size = best_min_tx_size; - memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4); - // Save the RD search results into tx_rd_record. 
- int index; - if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { - index = - (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; - ++tx_rd_record->num; - } else { - index = tx_rd_record->index_start; - tx_rd_record->index_start = - (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; - } - save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]); + if (within_border && cpi->sf.use_mb_rd_hash) + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); } -static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, - int blk_col, int plane, int block, TX_SIZE tx_size, - BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx, - ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) { +static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int plane, int block, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx, + ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode) { + assert(plane > 0); + assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblockd_plane *const pd = &xd->plane[plane]; - BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - - assert(tx_size < TX_SIZES_ALL); - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? 
uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; - - if (tx_size == plane_tx_size) { - ENTROPY_CONTEXT *ta = above_ctx + blk_col; - ENTROPY_CONTEXT *tl = left_ctx + blk_row; - av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, - plane_bsize, ta, tl, rd_stats); -#if !CONFIG_PVQ - av1_set_txb_context(x, plane, block, tx_size, ta, tl); -#endif // !CONFIG_PVQ - } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - int i; - - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize, - above_ctx, left_ctx, rd_stats); - block += step; - } - } + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, + ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL); + av1_set_txb_context(x, plane, block, tx_size, ta, tl); } // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. 
static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd) { + int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int plane; int is_cost_valid = 1; - int64_t this_rd; + int64_t this_rd = 0; if (ref_best_rd < 0) is_cost_valid = 0; av1_init_rd_stats(rd_stats); -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) return is_cost_valid; - bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); -#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 - -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (is_rect_tx(mbmi->tx_size)) { - return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd); - } -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + const BLOCK_SIZE bsizec = scale_chroma_bsize( + bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); if (is_inter_block(mbmi) && is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) - av1_subtract_plane(x, bsize, plane); + av1_subtract_plane(x, bsizec, plane); } - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; - const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - int idx, idy; - int block = 0; - const int step = bh * bw; - ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE]; - RD_STATS pn_rd_stats; - av1_init_rd_stats(&pn_rd_stats); - - av1_get_entropy_contexts(bsize, 0, pd, ta, tl); - - for (idy = 0; idy < mi_height; idy += bh) { - for (idx = 0; idx < mi_width; idx += 
bw) { - tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize, - ta, tl, &pn_rd_stats); - block += step; + if (is_cost_valid) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = + block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + int idx, idy; + int block = 0; + const int step = bh * bw; + ENTROPY_CONTEXT ta[MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MIB_SIZE]; + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); + av1_get_entropy_contexts(bsizec, pd, ta, tl); + + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size, + plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode); + block += step; + } } - } - if (pn_rd_stats.rate == INT_MAX) { - is_cost_valid = 0; - break; - } + if (pn_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } - av1_merge_rd_stats(rd_stats, &pn_rd_stats); + av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, 0, rd_stats->sse)); + this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse)); - if (this_rd > ref_best_rd) { - is_cost_valid = 0; - break; + if (this_rd > ref_best_rd) { + is_cost_valid = 0; + break; + } } } @@ -5754,7 +5440,6 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, return is_cost_valid; } -#endif // CONFIG_VAR_TX static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost, 
@@ -5764,11 +5449,12 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate_tokenonly, int64_t *distortion, int *skippable) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); + assert( + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; - assert(bsize >= BLOCK_8X8); int this_rate; int64_t this_rd; int colors_u, colors_v, colors; @@ -5780,42 +5466,32 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int plane_block_width, plane_block_height, rows, cols; av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, &plane_block_height, &rows, &cols); - if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return; mbmi->uv_mode = UV_DC_PRED; -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_HIGHBITDEPTH + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. 
if (cpi->common.use_highbitdepth) { colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols, - cpi->common.bit_depth); + cpi->common.bit_depth, count_buf); colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols, - cpi->common.bit_depth); + cpi->common.bit_depth, count_buf); } else { -#endif // CONFIG_HIGHBITDEPTH - colors_u = av1_count_colors(src_u, src_stride, rows, cols); - colors_v = av1_count_colors(src_v, src_stride, rows, cols); -#if CONFIG_HIGHBITDEPTH + colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf); + colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf); } -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_PALETTE_DELTA_ENCODING uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); -#endif // CONFIG_PALETTE_DELTA_ENCODING colors = colors_u > colors_v ? colors_u : colors_v; if (colors > 1 && colors <= 64) { int r, c, n, i, j; const int max_itr = 50; - float lb_u, ub_u, val_u; - float lb_v, ub_v, val_v; - float *const data = x->palette_buffer->kmeans_data_buf; - float centroids[2 * PALETTE_MAX_SIZE]; + int lb_u, ub_u, val_u; + int lb_v, ub_v, val_v; + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[2 * PALETTE_MAX_SIZE]; -#if CONFIG_HIGHBITDEPTH uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); if (cpi->common.use_highbitdepth) { @@ -5824,32 +5500,25 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, lb_v = src_v16[0]; ub_v = src_v16[0]; } else { -#endif // CONFIG_HIGHBITDEPTH lb_u = src_u[0]; ub_u = src_u[0]; lb_v = src_v[0]; ub_v = src_v[0]; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { val_u = src_u16[r * src_stride + c]; val_v = src_v16[r * src_stride + c]; data[(r * cols + c) * 2] = val_u; data[(r * cols + c) * 2 + 1] = 
val_v; } else { -#endif // CONFIG_HIGHBITDEPTH val_u = src_u[r * src_stride + c]; val_v = src_v[r * src_stride + c]; data[(r * cols + c) * 2] = val_u; data[(r * cols + c) * 2 + 1] = val_v; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH if (val_u < lb_u) lb_u = val_u; else if (val_u > ub_u) @@ -5868,34 +5537,30 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; } av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); -#if CONFIG_PALETTE_DELTA_ENCODING optimize_palette_colors(color_cache, n_cache, n, 2, centroids); // Sort the U channel colors in ascending order. for (i = 0; i < 2 * (n - 1); i += 2) { int min_idx = i; - float min_val = centroids[i]; + int min_val = centroids[i]; for (j = i + 2; j < 2 * n; j += 2) if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; if (min_idx != i) { - float temp_u = centroids[i], temp_v = centroids[i + 1]; + int temp_u = centroids[i], temp_v = centroids[i + 1]; centroids[i] = centroids[min_idx]; centroids[i + 1] = centroids[min_idx + 1]; centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; } } av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); -#endif // CONFIG_PALETTE_DELTA_ENCODING extend_palette_color_map(color_map, cols, rows, plane_block_width, plane_block_height); pmi->palette_size[1] = n; for (i = 1; i < 3; ++i) { for (j = 0; j < n; ++j) { -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( (int)centroids[j * 2 + i - 1], cpi->common.bit_depth); else -#endif // CONFIG_HIGHBITDEPTH pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel((int)centroids[j * 2 + i - 1]); } @@ -5903,19 +5568,8 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; - 
this_rate = - tokenonly_rd_stats.rate + dc_mode_cost + - x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] + - write_uniform_cost(n, color_map[0]) + - av1_cost_bit( - av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1); - this_rate += av1_palette_color_cost_uv(pmi, -#if CONFIG_PALETTE_DELTA_ENCODING - color_cache, n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING - cpi->common.bit_depth); - this_rate += - av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP); + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -5937,68 +5591,13 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } } -#if CONFIG_FILTER_INTRA -// Return 1 if an filter intra mode is selected; return 0 otherwise. -static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t *best_rd) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int filter_intra_selected_flag = 0; - int this_rate; - int64_t this_rd; - FILTER_INTRA_MODE mode; - FILTER_INTRA_MODE_INFO filter_intra_mode_info; - RD_STATS tokenonly_rd_stats; - - av1_zero(filter_intra_mode_info); - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1; - mbmi->uv_mode = UV_DC_PRED; - mbmi->palette_mode_info.palette_size[1] = 0; - - for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode; - if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd)) - continue; - - this_rate = tokenonly_rd_stats.rate + - av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) + - x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + - write_uniform_cost(FILTER_INTRA_MODES, mode); - this_rd = RDCOST(x->rdmult, this_rate, 
tokenonly_rd_stats.dist); - if (this_rd < *best_rd) { - *best_rd = this_rd; - *rate = this_rate; - *rate_tokenonly = tokenonly_rd_stats.rate; - *distortion = tokenonly_rd_stats.dist; - *skippable = tokenonly_rd_stats.skip; - filter_intra_mode_info = mbmi->filter_intra_mode_info; - filter_intra_selected_flag = 1; - } - } - - if (filter_intra_selected_flag) { - mbmi->uv_mode = UV_DC_PRED; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info.use_filter_intra_mode[1]; - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info.filter_intra_mode[1]; - return 1; - } else { - return 0; - } -} -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_EXT_INTRA // Run RD calculation with given chroma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. static int64_t pick_intra_angle_routine_sbuv( const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats, int *best_angle_delta, int64_t *best_rd) { - MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; assert(!is_inter_block(mbmi)); int this_rate; int64_t this_rd; @@ -6006,11 +5605,12 @@ static int64_t pick_intra_angle_routine_sbuv( if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) return INT64_MAX; - this_rate = tokenonly_rd_stats.rate + rate_overhead; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; - *best_angle_delta = mbmi->angle_delta[1]; + *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; *rate = this_rate; rd_stats->rate = tokenonly_rd_stats.rate; rd_stats->dist = tokenonly_rd_stats.dist; @@ -6026,7 +5626,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_rd, int *rate, RD_STATS *rd_stats) { 
MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; @@ -6041,7 +5641,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, best_rd_in = (best_rd == INT64_MAX) ? INT64_MAX : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5))); - mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta; + mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd_in, rate, rd_stats, &best_angle_delta, &best_rd); @@ -6064,7 +5664,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) skip_search = 1; if (!skip_search) { - mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta; + mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd, rate, rd_stats, &best_angle_delta, &best_rd); @@ -6072,202 +5672,137 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } } - mbmi->angle_delta[1] = best_angle_delta; + mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta; return rd_stats->rate != INT_MAX; } -#endif // CONFIG_EXT_INTRA - -#if CONFIG_CFL -static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3, - const uint8_t *src, int src_stride, int width, - int height, int dc_pred, int alpha_q3, - int64_t *dist_neg_out) { - int64_t dist = 0; - int diff; - - if (alpha_q3 == 0) { - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - diff = src[i] - dc_pred; - dist += diff * diff; - } - src += src_stride; - } - - if (dist_neg_out) *dist_neg_out = dist; - - return dist; - } - - int64_t dist_neg = 0; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - const int uv = src[i]; - const int 
scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]); - diff = uv - clip_pixel(scaled_luma + dc_pred); - dist += diff * diff; +#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \ + (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; - diff = uv - clip_pixel(-scaled_luma + dc_pred); - dist_neg += diff * diff; - } - pred_buf_q3 += MAX_SB_SIZE; - src += src_stride; + const BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; + const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); } +#endif // CONFIG_DEBUG - if (dist_neg_out) *dist_neg_out = dist_neg; - - return dist; -} -#if CONFIG_HIGHBITDEPTH -static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3, - const uint16_t *src, int src_stride, - int width, int height, int dc_pred, - int alpha_q3, int bit_depth, - int64_t *dist_neg_out) { - const int shift = 2 * (bit_depth - 8); - const int rounding = shift > 0 ? 
(1 << shift) >> 1 : 0; - int64_t dist = 0; - int diff; - - if (alpha_q3 == 0) { - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - diff = src[i] - dc_pred; - dist += diff * diff; + xd->cfl.use_dc_pred_cache = 1; + const int64_t mode_rd = + RDCOST(x->rdmult, + x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0); + int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; + int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#if CONFIG_DEBUG + int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#endif // CONFIG_DEBUG + + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + RD_STATS rd_stats; + av1_init_rd_stats(&rd_stats); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + best_rd_uv[joint_sign][plane] = INT64_MAX; + best_c[joint_sign][plane] = 0; + } + // Collect RD stats for an alpha value of zero in this plane. + // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. + for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) { + const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i); + if (i == CFL_SIGN_NEG) { + mbmi->cfl_alpha_idx = 0; + mbmi->cfl_alpha_signs = joint_sign; + txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, tx_size, + cpi->sf.use_fast_coef_costing, FTXS_NONE); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][0]; + best_rd_uv[joint_sign][plane] = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + } + } + + int best_joint_sign = -1; + + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) { + int progress = 0; + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + int flag = 0; + RD_STATS rd_stats; + if (c > 2 && progress < c) break; + av1_init_rd_stats(&rd_stats); + for (int i = 0; i < CFL_SIGNS; i++) { + const int joint_sign = 
PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i); + if (i == 0) { + mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; + mbmi->cfl_alpha_signs = joint_sign; + txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, + tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][c]; + int64_t this_rd = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); + if (this_rd >= best_rd_uv[joint_sign][plane]) continue; + best_rd_uv[joint_sign][plane] = this_rd; + best_c[joint_sign][plane] = c; +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + flag = 2; + if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; + this_rd += mode_rd + best_rd_uv[joint_sign][!plane]; + if (this_rd >= best_rd) continue; + best_rd = this_rd; + best_joint_sign = joint_sign; + } + progress += flag; } - src += src_stride; } - dist = (dist + rounding) >> shift; - - if (dist_neg_out) *dist_neg_out = dist; - - return dist; - } - - int64_t dist_neg = 0; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - const int uv = src[i]; - const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]); - - diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth); - dist += diff * diff; - - diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth); - dist_neg += diff * diff; - } - pred_buf_q3 += MAX_SB_SIZE; - src += src_stride; - } - - if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift; - - return (dist + rounding) >> shift; -} -#endif // CONFIG_HIGHBITDEPTH -static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src, - int src_stride, int width, int height, - int dc_pred, int alpha_q3, int use_hbd, - int bit_depth, int64_t *dist_neg_out) { -#if CONFIG_HIGHBITDEPTH - if (use_hbd) { - const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src); - return cfl_alpha_dist_hbd(pred_buf_q3, src_16, 
src_stride, width, height, - dc_pred, alpha_q3, bit_depth, dist_neg_out); - } -#endif // CONFIG_HIGHBITDEPTH - (void)use_hbd; - (void)bit_depth; - return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height, - dc_pred, alpha_q3, dist_neg_out); -} - -static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { - const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U]; - const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V]; - const uint8_t *const src_u = p_u->src.buf; - const uint8_t *const src_v = p_v->src.buf; - const int src_stride_u = p_u->src.stride; - const int src_stride_v = p_v->src.stride; - - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - - CFL_CTX *const cfl = xd->cfl; - cfl_compute_parameters(xd, tx_size); - const int width = cfl->uv_width; - const int height = cfl->uv_height; - const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; - const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; - const int16_t *pred_buf_q3 = cfl->pred_buf_q3; - const int use_hbd = get_bitdepth_data_path_index(xd); - - int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; - sse[CFL_PRED_U][0] = - cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u, - 0, use_hbd, xd->bd, NULL); - sse[CFL_PRED_V][0] = - cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v, - 0, use_hbd, xd->bd, NULL); - - for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { - const int m = c * 2 + 1; - const int abs_alpha_q3 = c + 1; - sse[CFL_PRED_U][m] = cfl_alpha_dist( - pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u, - abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]); - sse[CFL_PRED_V][m] = cfl_alpha_dist( - pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v, - abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]); } - int64_t dist; - int64_t cost; - int64_t best_cost = INT64_MAX; - int best_rate = 0; - - // Compute least squares parameter of the entire block + int best_rate_overhead = INT_MAX; int 
ind = 0; - int signs = 0; - - for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { - const int sign_u = CFL_SIGN_U(joint_sign); - const int sign_v = CFL_SIGN_V(joint_sign); - const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE; - const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE; - for (int u = 0; u < size_u; u++) { - const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1; - for (int v = 0; v < size_v; v++) { - const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1; - dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + - sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; - dist *= 16; - const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] + - x->cfl_cost[joint_sign][CFL_PRED_V][v]; - cost = RDCOST(x->rdmult, rate, dist); - if (cost < best_cost) { - best_cost = cost; - best_rate = rate; - ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; - signs = joint_sign; - } - } - } + if (best_joint_sign >= 0) { + const int u = best_c[best_joint_sign][CFL_PRED_U]; + const int v = best_c[best_joint_sign][CFL_PRED_V]; + ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; + best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] + + x->cfl_cost[best_joint_sign][CFL_PRED_V][v]; +#if CONFIG_DEBUG + xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] + + best_rate_overhead + + best_rate_uv[best_joint_sign][CFL_PRED_U] + + best_rate_uv[best_joint_sign][CFL_PRED_V]; +#endif // CONFIG_DEBUG + } else { + best_joint_sign = 0; } mbmi->cfl_alpha_idx = ind; - mbmi->cfl_alpha_signs = signs; - return best_rate; + mbmi->cfl_alpha_signs = best_joint_sign; + xd->cfl.use_dc_pred_cache = 0; + xd->cfl.dc_pred_is_cached[0] = 0; + xd->cfl.dc_pred_is_cached[1] = 0; + return best_rate_overhead; } -#endif // CONFIG_CFL static void init_sbuv_mode(MB_MODE_INFO *const mbmi) { mbmi->uv_mode = UV_DC_PRED; mbmi->palette_mode_info.palette_size[1] = 0; -#if CONFIG_FILTER_INTRA - 
mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA } static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -6275,83 +5810,53 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; int64_t best_rd = INT64_MAX, this_rd; -#if CONFIG_PVQ - od_rollback_buffer buf; - od_encode_checkpoint(&x->daala_enc, &buf); -#endif // CONFIG_PVQ - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - const int try_palette = - av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { int this_rate; RD_STATS tokenonly_rd_stats; UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; -#if CONFIG_EXT_INTRA - const int is_directional_mode = - av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type); -#endif // CONFIG_EXT_INTRA + const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode)); if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & (1 << mode))) continue; mbmi->uv_mode = mode; -#if CONFIG_CFL int cfl_alpha_rate = 0; if (mode == UV_CFL_PRED) { + if (!is_cfl_allowed(xd)) continue; assert(!is_directional_mode); - const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); - cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size); + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); + if (cfl_alpha_rate == INT_MAX) continue; } -#endif -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { - const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] 
+ - write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0); + const int rate_overhead = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, &this_rate, &tokenonly_rd_stats)) continue; } else { -#endif // CONFIG_EXT_INTRA if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &buf); -#endif // CONFIG_PVQ continue; } -#if CONFIG_EXT_INTRA } -#endif // CONFIG_EXT_INTRA - this_rate = - tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode]; - -#if CONFIG_CFL + const int mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] + + cfl_alpha_rate; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); if (mode == UV_CFL_PRED) { - this_rate += cfl_alpha_rate; + assert(is_cfl_allowed(xd)); +#if CONFIG_DEBUG + if (!xd->lossless[mbmi->segment_id]) + assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); +#endif // CONFIG_DEBUG } -#endif -#if CONFIG_EXT_INTRA - if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { - this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED) - this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0); -#endif // CONFIG_FILTER_INTRA - if (try_palette && mode == UV_DC_PRED) - this_rate += av1_cost_bit( - av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &buf); -#endif // CONFIG_PVQ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { @@ -6364,21 +5869,16 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); if (try_palette) 
{ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; - rd_pick_palette_intra_sbuv(cpi, x, - x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED], - best_palette_color_map, &best_mbmi, &best_rd, - rate, rate_tokenonly, distortion, skippable); - } - -#if CONFIG_FILTER_INTRA - if (mbmi->sb_type >= BLOCK_8X8) { - if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion, - skippable, bsize, &best_rd)) - best_mbmi = *mbmi; + rd_pick_palette_intra_sbuv( + cpi, x, + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, + distortion, skippable); } -#endif // CONFIG_FILTER_INTRA *mbmi = best_mbmi; // Make sure we actually chose a mode @@ -6391,13 +5891,14 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, UV_PREDICTION_MODE *mode_uv) { + const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. init_sbuv_mode(mbmi); -#if CONFIG_CB4X4 -#if !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) { *rate_uv = 0; *rate_uv_tokenonly = 0; @@ -6406,31 +5907,20 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, *mode_uv = UV_DC_PRED; return; } + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); -#endif // !CONFIG_CHROMA_2X2 -#if CONFIG_CFL // Only store reconstructed luma when there's chroma RDO. 
When there's no // chroma RDO, the reconstructed luma will be stored in encode_superblock(). - xd->cfl->store_y = !x->skip_chroma_rd; -#endif // CONFIG_CFL -#else - bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize; -#if CONFIG_CFL - xd->cfl->store_y = 1; -#endif // CONFIG_CFL -#endif // CONFIG_CB4X4 -#if CONFIG_CFL - if (xd->cfl->store_y) { - // Perform one extra call to txfm_rd_in_plane(), with the values chosen - // during luma RDO, so we can store reconstructed luma values - RD_STATS this_rd_stats; - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, - mbmi->sb_type, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - xd->cfl->store_y = 0; + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. + av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y, + cpi->optimize_seg_arr[mbmi->segment_id], + mi_row, mi_col); + xd->cfl.store_y = 0; } -#endif // CONFIG_CFL rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, bsize, max_tx_size); *mode_uv = mbmi->uv_mode; @@ -6441,16 +5931,10 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, if (is_inter_compound_mode(mode)) { return x ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mode)) { - return x->inter_singleref_comp_mode_cost[mode_context] - [INTER_SINGLEREF_COMP_OFFSET(mode)]; -#endif // CONFIG_COMPOUND_SINGLEREF } int mode_cost = 0; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; - int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET); assert(is_inter_mode(mode)); @@ -6459,43 +5943,34 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, return mode_cost; } else { mode_cost = x->newmv_mode_cost[mode_ctx][1]; - mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; - if (is_all_zero_mv) 
return mode_cost; - - if (mode == ZEROMV) { + if (mode == GLOBALMV) { mode_cost += x->zeromv_mode_cost[mode_ctx][0]; return mode_cost; } else { mode_cost += x->zeromv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; - - if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; - if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; - if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; - mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; return mode_cost; } } } -#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) -static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, - COMPOUND_TYPE comp_type) { - (void)bsize; - switch (comp_type) { +static int get_interinter_compound_mask_rate(const MACROBLOCK *const x, + const MB_MODE_INFO *const mbmi) { + switch (mbmi->interinter_comp.type) { case COMPOUND_AVERAGE: return 0; -#if CONFIG_WEDGE - case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize); -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: return 1; -#endif // CONFIG_COMPOUND_SEGMENT + case COMPOUND_WEDGE: + return get_interinter_wedge_bits(mbmi->sb_type) > 0 + ? 
av1_cost_literal(1) + + x->wedge_idx_cost[mbmi->sb_type] + [mbmi->interinter_comp.wedge_index] + : 0; + case COMPOUND_DIFFWTD: return av1_cost_literal(1); default: assert(0); return 0; } } -#endif // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) typedef struct { int eobs; @@ -6508,13 +5983,8 @@ typedef struct { int_mv pred_mv[2]; int_mv ref_mv[2]; -#if CONFIG_CHROMA_2X2 - ENTROPY_CONTEXT ta[4]; - ENTROPY_CONTEXT tl[4]; -#else ENTROPY_CONTEXT ta[2]; ENTROPY_CONTEXT tl[2]; -#endif // CONFIG_CHROMA_2X2 } SEG_RDSTAT; typedef struct { @@ -6527,12 +5997,7 @@ typedef struct { int64_t sse; int segment_yrate; PREDICTION_MODE modes[4]; -#if CONFIG_COMPOUND_SINGLEREF - SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES + - INTER_COMPOUND_MODES]; -#else // !CONFIG_COMPOUND_SINGLEREF SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF int mvthresh; } BEST_SEG_INFO; @@ -6543,234 +6008,120 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { (mv->col >> 3) > mv_limits->col_max; } -// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion. 
-// TODO(aconverse): Find out if this is still productive then clean up or remove -static int check_best_zero_mv( - const AV1_COMP *const cpi, const MACROBLOCK *const x, - const int16_t mode_context[TOTAL_REFS_PER_FRAME], - const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME], - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, - const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, - int mi_row, int mi_col) { - int_mv zeromv[2] = { {.as_int = 0 } }; -#if CONFIG_GLOBAL_MOTION - int comp_pred_mode = ref_frames[1] > INTRA_FRAME; -#endif - (void)mi_row; - (void)mi_col; - (void)cpi; -#if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) { - for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { - zeromv[cur_frm].as_int = - gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]], - cpi->common.allow_high_precision_mv, bsize, - mi_col, mi_row, block -#if CONFIG_AMVR - , - cpi->common.cur_frame_mv_precision_level -#endif - ) - .as_int; - } +static INLINE int get_single_mode(int this_mode, int ref_idx, + int is_comp_pred) { + int single_mode; + if (is_comp_pred) { + single_mode = + ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); + } else { + single_mode = this_mode; } -#endif // CONFIG_GLOBAL_MOTION - - if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && - frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && - (ref_frames[1] <= INTRA_FRAME || - frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) { - int16_t rfc = - av1_mode_context_analyzer(mode_context, ref_frames, bsize, block); - int c1 = cost_mv_ref(x, NEARMV, rfc); - int c2 = cost_mv_ref(x, NEARESTMV, rfc); - int c3 = cost_mv_ref(x, ZEROMV, rfc); + return single_mode; +} +/* If the current mode shares the same mv with other modes with higher prority, + * skip this mode. This priority order is nearest > global > near. 
*/ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, int this_mode, + const MV_REFERENCE_FRAME ref_frames[2]) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + if (!is_comp_pred) { if (this_mode == NEARMV) { - if (c1 > c3) return 0; - } else if (this_mode == NEARESTMV) { - if (c2 > c3) return 0; - } else { - assert(this_mode == ZEROMV); - if (ref_frames[1] <= INTRA_FRAME) { - if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || - (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) - return 0; - } else { - if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 && - frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) || - (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 && - frame_mv[NEARMV][ref_frames[1]].as_int == 0)) - return 0; + if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { + // NEARMV has the same motion vector as NEARESTMV + return 1; + } + if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + return 1; } } - } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || - this_mode == ZERO_ZEROMV) && - frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && - frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) { - int16_t rfc = compound_mode_context[ref_frames[0]]; - int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc); - int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc); - int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc); - - if (this_mode == NEAREST_NEARESTMV) { - if (c2 > c3) return 0; - } else if (this_mode == NEAR_NEARMV) { - if (c5 > c3) return 0; - } else { - assert(this_mode == ZERO_ZEROMV); - if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 && - frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) || - 
(c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 && - frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0)) - return 0; + if (this_mode == GLOBALMV) { + if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + return 1; + } + } + } else { + for (int i = 0; i < 2; ++i) { + const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + if (single_mode == NEARMV) { + if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { + // NEARMV has the same motion vector as NEARESTMV in compound mode + return 1; + } + } + } + if (this_mode == NEAR_NEARMV) { + if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && + cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { + // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV + return 1; + } + } + if (this_mode == GLOBAL_GLOBALMV) { + if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && + cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { + // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV + return 1; + } } } - return 1; + return 0; } static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int_mv *frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - int_mv *frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - int mi_row, int mi_col, - int_mv *ref_mv_sub8x8[2], const uint8_t *mask, - int mask_stride, int *rate_mv, - const int block) { + BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row, + int mi_col, int_mv *ref_mv_sub8x8[2], + const uint8_t *mask, int mask_stride, + int *rate_mv, const int block) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; -// This function 
should only ever be called for compound modes -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) { - assert(is_inter_singleref_comp_mode(mbmi->mode)); - assert(frame_comp_mv); - } - assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); - const int refs[2] = { mbmi->ref_frame[0], - has_second_ref(mbmi) ? mbmi->ref_frame[1] - : mbmi->ref_frame[0] }; -#else + MB_MODE_INFO *mbmi = xd->mi[0]; + // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; -#endif // CONFIG_COMPOUND_SINGLEREF int_mv ref_mv[2]; int ite, ref; - struct scale_factors sf; -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // ic and ir are the 4x4 coordinates of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; struct macroblockd_plane *const pd = &xd->plane[0]; - const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; - const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; -#if CONFIG_GLOBAL_MOTION - int is_global[2]; -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) -#else - for (ref = 0; ref < 2; ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { - WarpedMotionParams *const wm = - &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]]; - is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype); - } -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) is_global[1] = is_global[0]; -#endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_GLOBAL_MOTION -#else // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - (void)block; -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - - // Do joint motion search in compound mode to get more accurate mv. 
- struct buf_2d backup_yv12[2][MAX_MB_PLANE]; - int last_besterr[2] = { INT_MAX, INT_MAX }; - const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { - av1_get_scaled_ref_frame(cpi, refs[0]), - av1_get_scaled_ref_frame(cpi, refs[1]) - }; - -// Prediction buffer from second frame. -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); - uint8_t *second_pred; -#else - DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_CB4X4 - (void)ref_mv_sub8x8; -#endif // CONFIG_CB4X4 - -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) -#else - for (ref = 0; ref < 2; ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { -#if !CONFIG_CB4X4 - if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL) - ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int; - else -#endif // !CONFIG_CB4X4 - ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; - - if (scaled_ref_frame[ref]) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[ref][i] = xd->plane[i].pre[ref]; - av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, - NULL); - } - } - -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) { - assert(is_inter_singleref_comp_mode(mbmi->mode)); - // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes - // all from the 1st reference frame, i.e. refs[0]. - ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0]; - if (scaled_ref_frame[0]) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. 
- for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[1][i] = xd->plane[i].pre[1]; - av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL); - } + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; + int is_global[2]; + for (ref = 0; ref < 2; ++ref) { + const WarpedMotionParams *const wm = + &xd->global_motion[xd->mi[0]->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype); } -#endif // CONFIG_COMPOUND_SINGLEREF -// Since we have scaled the reference frames to match the size of the current -// frame we must use a unit scaling factor during mode selection. -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, - cm->height, cm->use_highbitdepth); -#else - av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, - cm->height); -#endif // CONFIG_HIGHBITDEPTH - -// Allow joint search multiple times iteratively for each reference frame -// and break out of the search loop if it couldn't find a better mv. -#if CONFIG_COMPOUND_SINGLEREF - const int num_ites = - (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1; - const int start_ite = has_second_ref(mbmi) ? 0 : 1; - for (ite = start_ite; ite < (start_ite + num_ites); ite++) -#else - for (ite = 0; ite < 4; ite++) -#endif // CONFIG_COMPOUND_SINGLEREF - { + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + + // Prediction buffer from second frame. 
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + (void)ref_mv_sub8x8; + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < 4; ite++) { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; @@ -6782,84 +6133,78 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. const int plane = 0; - ConvolveParams conv_params = get_conv_params(!id, 0, plane); -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd); + conv_params.use_jnt_comp_avg = 0; WarpTypesAllowed warp_types; -#if CONFIG_GLOBAL_MOTION warp_types.global_warp_allowed = is_global[!id]; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - // Initialized here because of compiler problem in Visual Studio. + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = av1_get_ref_mv(x, ref); + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ if (scaled_ref_frame[ref]) { + int i; + for (i = 0; i < num_planes; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL, num_planes); + } + } + + assert(IMPLIES(scaled_ref_frame[0] != NULL, + cm->width == scaled_ref_frame[0]->y_crop_width && + cm->height == scaled_ref_frame[0]->y_crop_height)); + assert(IMPLIES(scaled_ref_frame[1] != NULL, + cm->width == scaled_ref_frame[1]->y_crop_width && + cm->height == scaled_ref_frame[1]->y_crop_height)); + + // Initialize based on (possibly scaled) prediction buffers. ref_yv12[0] = xd->plane[plane].pre[0]; ref_yv12[1] = xd->plane[plane].pre[1]; -// Get the prediction block from the 'other' reference frame. -#if CONFIG_COMPOUND_SINGLEREF - MV *const the_other_mv = (has_second_ref(mbmi) || id) - ? &frame_mv[refs[!id]].as_mv - : &frame_comp_mv[refs[0]].as_mv; -#endif // CONFIG_COMPOUND_SINGLEREF + // Get the prediction block from the 'other' reference frame. + InterpFilters interp_filters = EIGHTTAP_REGULAR; -#if CONFIG_HIGHBITDEPTH + // Since we have scaled the reference frames to match the size of the + // current frame we must use a unit scaling factor during mode selection. 
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); av1_highbd_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, -#if CONFIG_COMPOUND_SINGLEREF - the_other_mv, -#else // !(CONFIG_COMPOUND_SINGLEREF) - &frame_mv[refs[!id]].as_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - &sf, pw, ph, 0, mbmi->interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, p_col, p_row, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); + &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters, + &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); } else { second_pred = (uint8_t *)second_pred_alloc_16; -#endif // CONFIG_HIGHBITDEPTH - av1_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, -#if CONFIG_COMPOUND_SINGLEREF - the_other_mv, -#else // !(CONFIG_COMPOUND_SINGLEREF) - &frame_mv[refs[!id]].as_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - &sf, pw, ph, &conv_params, mbmi->interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, p_col, p_row, plane, !id, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - // Do compound motion search on the current reference frame. 
+ av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &cur_mv[!id].as_mv, + &cm->sf_identity, pw, ph, &conv_params, + interp_filters, &warp_types, p_col, p_row, + plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); + } + + const int order_idx = id != 0; + av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, + &xd->jcp_param.use_jnt_comp_avg, 1); + + // Do full-pixel compound motion search on the current reference frame. if (id) xd->plane[plane].pre[0] = ref_yv12[id]; av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); -// Use the mv result from the single mode as mv predictor. -// Use the mv result from the single mode as mv predictor. -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi) && id) - *best_mv = frame_comp_mv[refs[0]].as_mv; - else -#endif // CONFIG_COMPOUND_SINGLEREF - *best_mv = frame_mv[refs[id]].as_mv; + // Use the mv result from the single mode as mv predictor. + // Use the mv result from the single mode as mv predictor. + *best_mv = cur_mv[id].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - else -#endif // CONFIG_COMPOUND_SINGLEREF - av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); + av1_set_mvcost( + x, id, + mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6877,42 +6222,44 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, x->mv_limits = tmp_mv_limits; -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level) { + // Restore the pointer to the first (possibly scaled) prediction buffer. 
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + // Re-initialize based on unscaled prediction buffers. + ref_yv12[ref] = xd->plane[plane].pre[ref]; + } + } + + // Do sub-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + if (cpi->common.cur_frame_force_integer_mv) { x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; } - if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) -#else - if (bestsme < INT_MAX) -#endif - { + if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; bestsme = cpi->find_fractional_mv_step( - x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, mask, mask_stride, id, pw, ph, - cpi->sf.use_upsampled_references); + x, cm, mi_row, mi_col, &ref_mv[id].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, + x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, + mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search); } - // Restore the pointer to the first (possibly scaled) prediction buffer. + // Restore the pointer to the first prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; if (bestsme < last_besterr[id]) { -#if CONFIG_COMPOUND_SINGLEREF - // NOTE: For single ref comp mode, frame_mv stores the first mv and - // frame_comp_mv stores the second mv. 
- if (!has_second_ref(mbmi) && id) - frame_comp_mv[refs[0]].as_mv = *best_mv; - else -#endif // CONFIG_COMPOUND_SINGLEREF - frame_mv[refs[id]].as_mv = *best_mv; + cur_mv[id].as_mv = *best_mv; last_besterr[id] = bestsme; -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id]; -#endif // CONFIG_COMPOUND_SINGLEREF } else { break; } @@ -6920,216 +6267,124 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; -#if CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) -#else - for (ref = 0; ref < 2; ++ref) -#endif // CONFIG_COMPOUND_SINGLEREF - { - if (scaled_ref_frame[ref]) { - // Restore the prediction frame pointers to their unscaled versions. - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[ref] = backup_yv12[ref][i]; - } - -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - else -#endif // CONFIG_COMPOUND_SINGLEREF - av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); - -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) { - // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the - // first mv is stored in frame_mv[] and the second mv is stored in - // frame_comp_mv[]. 
- if (compound_ref0_mode(mbmi->mode) == NEWMV) // SR_NEW_NEWMV - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - assert(compound_ref1_mode(mbmi->mode) == NEWMV); - *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, - &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - } else { -#endif // CONFIG_COMPOUND_SINGLEREF -#if !CONFIG_CB4X4 - if (bsize >= BLOCK_8X8) -#endif // !CONFIG_CB4X4 - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); -#if !CONFIG_CB4X4 - else - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); -#endif // !CONFIG_CB4X4 -#if CONFIG_COMPOUND_SINGLEREF - } -#endif // CONFIG_COMPOUND_SINGLEREF - } + for (ref = 0; ref < 2; ++ref) { + av1_set_mvcost( + x, ref, + mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) { - if (scaled_ref_frame[0]) { - // Restore the prediction frame pointers to their unscaled versions. 
- int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = backup_yv12[1][i]; - } + const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); + *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } -#endif // CONFIG_COMPOUND_SINGLEREF } static void estimate_ref_frame_costs( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, - unsigned int *ref_costs_single, -#if CONFIG_EXT_COMP_REFS - unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME], -#else - unsigned int *ref_costs_comp, -#endif // CONFIG_EXT_COMP_REFS - aom_prob *comp_mode_p) { + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x, + int segment_id, unsigned int *ref_costs_single, + unsigned int (*ref_costs_comp)[REF_FRAMES]) { int seg_ref_active = segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { - memset(ref_costs_single, 0, - TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single)); -#if CONFIG_EXT_COMP_REFS + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); int ref_frame; - for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) memset(ref_costs_comp[ref_frame], 0, - TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0])); -#else - memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp)); -#endif // CONFIG_EXT_COMP_REFS - - *comp_mode_p = 128; + REF_FRAMES * sizeof((*ref_costs_comp)[0])); } else { - aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd); - aom_prob comp_inter_p = 128; - - if (cm->reference_mode == REFERENCE_MODE_SELECT) { - comp_inter_p = av1_get_reference_mode_prob(cm, xd); - *comp_mode_p = comp_inter_p; - } else { - *comp_mode_p = 128; - } - - ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0); - - if (cm->reference_mode != COMPOUND_REFERENCE) { - aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd); - aom_prob ref_single_p2 = 
av1_get_pred_prob_single_ref_p2(cm, xd); -#if CONFIG_EXT_REFS - aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd); - aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd); - aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd); - aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd); -#endif // CONFIG_EXT_REFS - - unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); - - ref_costs_single[LAST_FRAME] = -#if CONFIG_EXT_REFS - ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] = - ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] = -#endif // CONFIG_EXT_REFS - ref_costs_single[GOLDEN_FRAME] = - ref_costs_single[ALTREF_FRAME] = base_cost; - -#if CONFIG_EXT_REFS - ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); - ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0); - ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0); - ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0); - ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1); - ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1); - ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); - - ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0); - ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0); - ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1); - ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1); - - ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0); - ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0); - ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1); - - ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0); - ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1); - - ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0); - ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1); - - 
ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0); - ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p6, 1); -#else // !CONFIG_EXT_REFS - ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); - ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1); - ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); - - ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0); - ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1); -#endif // CONFIG_EXT_REFS - } else { - ref_costs_single[LAST_FRAME] = 512; -#if CONFIG_EXT_REFS - ref_costs_single[LAST2_FRAME] = 512; - ref_costs_single[LAST3_FRAME] = 512; - ref_costs_single[BWDREF_FRAME] = 512; - ref_costs_single[ALTREF2_FRAME] = 512; -#endif // CONFIG_EXT_REFS - ref_costs_single[GOLDEN_FRAME] = 512; - ref_costs_single[ALTREF_FRAME] = 512; - } + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] 
+= x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1]; if (cm->reference_mode != SINGLE_REFERENCE) { - aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd); -#if CONFIG_EXT_REFS - aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd); - aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd); - aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd); - aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd); -#endif // CONFIG_EXT_REFS - - unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); + // Similar to single ref, determine cost of compound ref frames. 
+ // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); -#if CONFIG_EXT_COMP_REFS - aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd); - unsigned int ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 }; + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = -#if USE_UNI_COMP_REFS - base_cost + av1_cost_bit(comp_ref_type_p, 1); -#else - base_cost; -#endif // USE_UNI_COMP_REFS + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1]; ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; ref_bicomp_costs[ALTREF_FRAME] = 0; - ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); - ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0); - ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1); - ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; - ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1); - ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0); + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1]; - 
ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0); - ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1]; - ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); - ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0); - ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; - ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0); - ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1); + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + // cost: if one ref frame is forward ref, the other ref is backward ref int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { @@ -7138,66 +6393,28 @@ static void estimate_ref_frame_costs( } } - aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd); - aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd); - aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd); - + // cost: if both ref frames are the same side. 
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); ref_costs_comp[LAST_FRAME][LAST2_FRAME] = - base_cost + av1_cost_bit(comp_ref_type_p, 0) + - av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0); + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; ref_costs_comp[LAST_FRAME][LAST3_FRAME] = - base_cost + av1_cost_bit(comp_ref_type_p, 0) + - av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + - av1_cost_bit(uni_comp_ref_p2, 0); + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = - base_cost + av1_cost_bit(comp_ref_type_p, 0) + - av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + - av1_cost_bit(uni_comp_ref_p2, 1); - + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = - base_cost + av1_cost_bit(comp_ref_type_p, 0) + - av1_cost_bit(uni_comp_ref_p, 1); - -#else // !CONFIG_EXT_COMP_REFS - - ref_costs_comp[LAST_FRAME] = -#if CONFIG_EXT_REFS - ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] = -#endif // CONFIG_EXT_REFS - ref_costs_comp[GOLDEN_FRAME] = base_cost; - -#if CONFIG_EXT_REFS - ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] = - ref_costs_comp[ALTREF_FRAME] = 0; -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS - ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); - ref_costs_comp[LAST2_FRAME] += 
av1_cost_bit(ref_comp_p, 0); - ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1); - ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); - - ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1); - ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0); - - ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0); - ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); - - // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1 - // more bit. - ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); - ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0); - ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); - - ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0); - ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1); -#else // !CONFIG_EXT_REFS - ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); - ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); -#endif // CONFIG_EXT_REFS -#endif // CONFIG_EXT_COMP_REFS + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; } else { -#if CONFIG_EXT_COMP_REFS int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) @@ -7207,17 +6424,6 @@ static void estimate_ref_frame_costs( ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; -#else // !CONFIG_EXT_COMP_REFS - ref_costs_comp[LAST_FRAME] = 512; -#if CONFIG_EXT_REFS - ref_costs_comp[LAST2_FRAME] = 512; - ref_costs_comp[LAST3_FRAME] = 512; - ref_costs_comp[BWDREF_FRAME] = 512; - ref_costs_comp[ALTREF2_FRAME] = 512; - ref_costs_comp[ALTREF_FRAME] = 512; -#endif // CONFIG_EXT_REFS - ref_costs_comp[GOLDEN_FRAME] = 512; -#endif // CONFIG_EXT_COMP_REFS } } } @@ -7240,17 +6446,15 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT 
*ctx, ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; } -static void setup_buffer_inter( +static void setup_buffer_ref_mvs_inter( const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, int mi_row, int mi_col, - int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME], - int_mv frame_near_mv[TOTAL_REFS_PER_FRAME], - struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) { + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mi = xd->mi[0]; - int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; @@ -7258,35 +6462,20 @@ static void setup_buffer_inter( // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
- av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf, + num_planes); // Gets an initial list of candidate vectors from neighbours and orders them - av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], - mbmi_ext->ref_mv_stack[ref_frame], - mbmi_ext->compound_mode_context, candidates, mi_row, mi_col, - NULL, NULL, mbmi_ext->mode_context); - -// Candidate refinement carried out at encoder and decoder -#if CONFIG_AMVR - av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, - &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame], - cm->cur_frame_mv_precision_level); -#else - av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, - &frame_nearest_mv[ref_frame], - &frame_near_mv[ref_frame]); -#endif -// Further refinement that is encode side only to test the top few candidates -// in full and choose the best as the centre point for subsequent searches. -// The current implementation doesn't support scaling. -#if CONFIG_CB4X4 + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the centre point for subsequent searches. + // The current implementation doesn't support scaling. 
+ (void)block_size; av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, block_size); -#else - if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8) - av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, - block_size); -#endif // CONFIG_CB4X4 } static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -7294,19 +6483,15 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int ref_idx, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; int bestsme = INT_MAX; int step_param; int sadpb = x->sadperbit16; MV mvp_full; -#if CONFIG_COMPOUND_SINGLEREF - int ref = - has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; -#else // !CONFIG_COMPOUND_SINGLEREF int ref = mbmi->ref_frame[ref_idx]; -#endif // CONFIG_COMPOUND_SINGLEREF - MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; MvLimits tmp_mv_limits = x->mv_limits; int cost_list[5]; @@ -7314,25 +6499,21 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); - MV pred_mv[3]; - pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; - pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; - pred_mv[2] = x->pred_mv[ref]; - if (scaled_ref_frame) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) + // full-pixel motion search code to be used without additional + // modifications. 
+ for (int i = 0; i < num_planes; i++) { backup_yv12[i] = xd->plane[i].pre[ref_idx]; - - av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); + } + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); } - av1_set_mv_search_range(&x->mv_limits, &ref_mv); - - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); + av1_set_mvcost( + x, ref_idx, + mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. @@ -7347,16 +6528,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, step_param = cpi->mv_step_param; } - if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) { + if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) { int boffset = - 2 * (b_width_log2_lookup[cm->sb_size] - - AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); + 2 * (mi_size_wide_log2[cm->seq_params.sb_size] - + AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize])); step_param = AOMMAX(step_param, boffset); } if (cpi->sf.adaptive_motion_search) { - int bwl = b_width_log2_lookup[bsize]; - int bhl = b_height_log2_lookup[bsize]; + int bwl = mi_size_wide_log2[bsize]; + int bhl = mi_size_high_log2[bsize]; int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); if (tlevel < 5) { @@ -7374,8 +6555,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->best_mv.as_int = INVALID_MV; if (scaled_ref_frame) { - int j; - for (j = 0; j < MAX_MB_PLANE; ++j) + // Swap back the original buffers before returning. + for (int j = 0; j < num_planes; ++j) xd->plane[j].pre[ref_idx] = backup_yv12[j]; } return; @@ -7384,35 +6565,26 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, } } + // Note: MV limits are modified here. Always restore the original values + // after full-pixel motion search. 
av1_set_mv_search_range(&x->mv_limits, &ref_mv); -#if CONFIG_MOTION_VAR if (mbmi->motion_mode != SIMPLE_TRANSLATION) mvp_full = mbmi->mv[0].as_mv; else -#endif // CONFIG_MOTION_VAR - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; + mvp_full = ref_mv; mvp_full.col >>= 3; mvp_full.row >>= 3; x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV; -#if CONFIG_MOTION_VAR switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: -#endif // CONFIG_MOTION_VAR -#if CONFIG_HASH_ME bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0); -#else - bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, - INT_MAX, 1); -#endif -#if CONFIG_MOTION_VAR break; case OBMC_CAUSAL: bestsme = av1_obmc_full_pixel_diamond( @@ -7422,25 +6594,27 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, break; default: assert(0 && "Invalid motion mode!\n"); } -#endif // CONFIG_MOTION_VAR + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } x->mv_limits = tmp_mv_limits; -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level) { + if (cpi->common.cur_frame_force_integer_mv) { x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; } - if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) { -#else - if (bestsme < INT_MAX) { -#endif + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { int dis; /* TODO: use dis in distortion calculation later. 
*/ -#if CONFIG_MOTION_VAR switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: -#endif // CONFIG_MOTION_VAR - if (cpi->sf.use_upsampled_references) { + if (cpi->sf.use_accurate_subpel_search) { int best_mv_var; const int try_second = x->second_best_mv.as_int != INVALID_MV && x->second_best_mv.as_int != x->best_mv.as_int; @@ -7448,8 +6622,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const int ph = block_size_high[bsize]; best_mv_var = cpi->find_fractional_mv_step( - x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1); @@ -7472,8 +6646,9 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->best_mv.as_mv.col * 8 <= maxc && x->best_mv.as_mv.col * 8 >= minc) { this_var = cpi->find_fractional_mv_step( - x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1); @@ -7483,45 +6658,35 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, } } else { cpi->find_fractional_mv_step( - x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &dis, 
&x->pred_sse[ref], NULL, NULL, 0, 0, 0, 0, 0); } -#if CONFIG_MOTION_VAR break; case OBMC_CAUSAL: av1_find_best_obmc_sub_pixel_tree_up( - x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, - &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references); + x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv, + cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0, + cpi->sf.use_accurate_subpel_search); break; default: assert(0 && "Invalid motion mode!\n"); } -#endif // CONFIG_MOTION_VAR } *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); -#if CONFIG_MOTION_VAR if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION) -#else - if (cpi->sf.adaptive_motion_search) -#endif // CONFIG_MOTION_VAR x->pred_mv[ref] = x->best_mv.as_mv; - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[ref_idx] = backup_yv12[i]; - } } -static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) { +static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst, + const int num_planes) { int i; - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < num_planes; i++) { xd->plane[i].dst.buf = dst.plane[i]; xd->plane[i].dst.stride = dst.stride[i]; } @@ -7535,106 +6700,50 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; -#if CONFIG_COMPOUND_SINGLEREF - const int other_ref = - has_second_ref(mbmi) ? 
mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0]; -#else // !CONFIG_COMPOUND_SINGLEREF + MB_MODE_INFO *mbmi = xd->mi[0]; const int other_ref = mbmi->ref_frame[!ref_idx]; -#endif // CONFIG_COMPOUND_SINGLEREF - struct scale_factors sf; -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION struct macroblockd_plane *const pd = &xd->plane[0]; // ic and ir are the 4x4 coordinates of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; -#if CONFIG_GLOBAL_MOTION - WarpedMotionParams *const wm = &xd->global_motion[other_ref]; - int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype); -#endif // CONFIG_GLOBAL_MOTION -#else - (void)block; -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + const WarpedMotionParams *const wm = &xd->global_motion[other_ref]; + int is_global = is_global_mv_block(xd->mi[0], wm->wmtype); -// This function should only ever be called for compound modes -#if CONFIG_COMPOUND_SINGLEREF - assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); -#else // !CONFIG_COMPOUND_SINGLEREF + // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); -#endif // CONFIG_COMPOUND_SINGLEREF - - struct buf_2d backup_yv12[MAX_MB_PLANE]; - const YV12_BUFFER_CONFIG *const scaled_ref_frame = - av1_get_scaled_ref_frame(cpi, other_ref); - - if (scaled_ref_frame) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. 
- for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[!ref_idx]; - av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); - } -// Since we have scaled the reference frames to match the size of the current -// frame we must use a unit scaling factor during mode selection. -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, - cm->height, cm->use_highbitdepth); -#else - av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, - cm->height); -#endif // CONFIG_HIGHBITDEPTH + const int plane = 0; + struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; - struct buf_2d ref_yv12; + struct scale_factors sf; + av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, + cm->width, cm->height); - const int plane = 0; - ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane); -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd); WarpTypesAllowed warp_types; -#if CONFIG_GLOBAL_MOTION warp_types.global_warp_allowed = is_global; -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - - // Initialized here because of compiler problem in Visual Studio. - ref_yv12 = xd->plane[plane].pre[!ref_idx]; -// Get the prediction block from the 'other' reference frame. -#if CONFIG_HIGHBITDEPTH + // Get the prediction block from the 'other' reference frame. 
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, mbmi->interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, p_col, p_row, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); + 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane, + MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, + cm->allow_warped_motion); } else { -#endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, mbmi->interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, p_col, p_row, plane, !ref_idx, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); -#if CONFIG_HIGHBITDEPTH + &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane, + !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, + cm->allow_warped_motion); } -#endif // CONFIG_HIGHBITDEPTH - if (scaled_ref_frame) { - // Restore the prediction frame pointers to their unscaled versions. 
- int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[!ref_idx] = backup_yv12[i]; - } + av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, + &xd->jcp_param.use_jnt_comp_avg, 1); } // Search for the best mv for one component of a compound, @@ -7645,45 +6754,41 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; -#if CONFIG_COMPOUND_SINGLEREF - const int ref = - has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; -#else + MB_MODE_INFO *mbmi = xd->mi[0]; const int ref = mbmi->ref_frame[ref_idx]; -#endif // CONFIG_COMPOUND_SINGLEREF - int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0]; + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); struct macroblockd_plane *const pd = &xd->plane[0]; struct buf_2d backup_yv12[MAX_MB_PLANE]; const YV12_BUFFER_CONFIG *const scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); -// Check that this is either an interinter or an interintra block -#if CONFIG_COMPOUND_SINGLEREF - assert(has_second_ref(mbmi) || - // or a single ref comp pred mode - is_inter_singleref_comp_mode(mbmi->mode) || - (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); -#else - assert(has_second_ref(mbmi) || - (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); -#endif // CONFIG_COMPOUND_SINGLEREF + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); + + // Store the first prediction buffer. 
+ struct buf_2d orig_yv12; + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } if (scaled_ref_frame) { int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[ref_idx]; - av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); + // full-pixel motion search code to be used without additional + // modifications. + for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); } - struct buf_2d orig_yv12; int bestsme = INT_MAX; int sadpb = x->sadperbit16; MV *const best_mv = &x->best_mv.as_mv; @@ -7691,12 +6796,6 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, MvLimits tmp_mv_limits = x->mv_limits; - // Initialized here because of compiler problem in Visual Studio. - if (ref_idx) { - orig_yv12 = pd->pre[0]; - pd->pre[0] = pd->pre[ref_idx]; - } - // Do compound motion search on the current reference frame. av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv); @@ -7706,12 +6805,9 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, best_mv->col >>= 3; best_mv->row >>= 3; -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); - else -#endif // CONFIG_COMPOUND_SINGLEREF - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); + av1_set_mvcost( + x, ref_idx, + mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); // Small-range full-pixel motion search. 
bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -7729,44 +6825,40 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, x->mv_limits = tmp_mv_limits; -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level) { + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } + + if (cpi->common.cur_frame_force_integer_mv) { x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; } - if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) { -#else - if (bestsme < INT_MAX) { -#endif + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; bestsme = cpi->find_fractional_mv_step( - x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + x, cm, mi_row, mi_col, &ref_mv.as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride, - ref_idx, pw, ph, cpi->sf.use_upsampled_references); + ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search); } - // Restore the pointer to the first (possibly scaled) prediction buffer. + // Restore the pointer to the first unscaled prediction buffer. if (ref_idx) pd->pre[0] = orig_yv12; if (bestsme < INT_MAX) *this_mv = *best_mv; *rate_mv = 0; - if (scaled_ref_frame) { - // Restore the prediction frame pointers to their unscaled versions. 
- int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[ref_idx] = backup_yv12[i]; - } - -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); - else -#endif // CONFIG_COMPOUND_SINGLEREF - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); + av1_set_mvcost( + x, ref_idx, + mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -7774,51 +6866,23 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Wrapper for compound_single_motion_search, for the common case // where the second prediction is also an inter mode. static void compound_single_motion_search_interinter( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - int_mv *frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv, const int block, int ref_idx) { MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - -// This function should only ever be called for compound modes -#if CONFIG_COMPOUND_SINGLEREF - int is_singleref_comp_mode = - !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode); - assert(has_second_ref(mbmi) || is_singleref_comp_mode); - if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv); -#else // !CONFIG_COMPOUND_SINGLEREF - assert(has_second_ref(mbmi)); -#endif // CONFIG_COMPOUND_SINGLEREF + // This function should only ever be called for compound modes + assert(has_second_ref(xd->mi[0])); -// Prediction buffer from second frame. -#if CONFIG_HIGHBITDEPTH + // Prediction buffer from second frame. 
DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); uint8_t *second_pred; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); else second_pred = (uint8_t *)second_pred_alloc_16; -#else - DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_COMPOUND_SINGLEREF - MV *this_mv = has_second_ref(mbmi) - ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv - : (ref_idx ? &frame_comp_mv[mbmi->ref_frame[0]].as_mv - : &frame_mv[mbmi->ref_frame[0]].as_mv); - const MV *other_mv = - has_second_ref(mbmi) - ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv - : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv - : &frame_comp_mv[mbmi->ref_frame[0]].as_mv); -#else // !CONFIG_COMPOUND_SINGLEREF - MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv; - const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv; -#endif // CONFIG_COMPOUND_SINGLEREF + + MV *this_mv = &cur_mv[ref_idx].as_mv; + const MV *other_mv = &cur_mv[!ref_idx].as_mv; build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block, ref_idx, second_pred); @@ -7828,58 +6892,33 @@ static void compound_single_motion_search_interinter( ref_idx); } -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE static void do_masked_motion_search_indexed( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) { // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; BLOCK_SIZE sb_type = mbmi->sb_type; const uint8_t *mask; const int mask_stride = block_size_wide[bsize]; mask = av1_get_compound_type_mask(comp_data, sb_type); - int_mv frame_mv[TOTAL_REFS_PER_FRAME]; -#if CONFIG_COMPOUND_SINGLEREF - int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_COMPOUND_SINGLEREF - 
MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; - assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4); - - frame_mv[rf[0]].as_int = cur_mv[0].as_int; -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int; - else -#endif // CONFIG_COMPOUND_SINGLEREF - frame_mv[rf[1]].as_int = cur_mv[1].as_int; + tmp_mv[0].as_int = cur_mv[0].as_int; + tmp_mv[1].as_int = cur_mv[1].as_int; if (which == 0 || which == 1) { - compound_single_motion_search_interinter( - cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - has_second_ref(mbmi) ? NULL : frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, mask, mask_stride, rate_mv, 0, which); + compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row, + mi_col, mask, mask_stride, rate_mv, + 0, which); } else if (which == 2) { - joint_motion_search(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - has_second_ref(mbmi) ? NULL : frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0); - } - tmp_mv[0].as_int = frame_mv[rf[0]].as_int; -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int; - else // comp ref -#endif // CONFIG_COMPOUND_SINGLEREF - tmp_mv[1].as_int = frame_mv[rf[1]].as_int; -} -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask, + mask_stride, rate_mv, 0); + } +} +#define USE_DISCOUNT_NEWMV_TEST 0 +#if USE_DISCOUNT_NEWMV_TEST // In some situations we want to discount the apparent cost of a new motion // vector. 
Where there is a subtle motion field and especially where there is // low spatial complexity then it can be hard to cover the cost of a new motion @@ -7887,17 +6926,42 @@ static void do_masked_motion_search_indexed( // However, once established that vector may be usable through the nearest and // near mv modes to reduce distortion in subsequent blocks and also improve // visual quality. -static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode, - int_mv this_mv, - int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], - int ref_frame) { - return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && - (this_mv.as_int != 0) && - ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || - (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && - ((mode_mv[NEARMV][ref_frame].as_int == 0) || - (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +#define NEW_MV_DISCOUNT_FACTOR 8 +static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, + int ref_mv_idx, + const MV_REFERENCE_FRAME *ref_frame, + const MB_MODE_INFO_EXT *mbmi_ext); +static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x, + int this_mode, int_mv this_mv) { + if (this_mode == NEWMV && this_mv.as_int != 0 && + !cpi->rc.is_src_frame_alt_ref) { + // Only discount new_mv when nearst_mv and all near_mv are zero, and the + // new_mv is not equal to global_mv + const AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0], + NONE_FRAME }; + const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames); + int_mv nearest_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + int ret = nearest_mv.as_int == 0; + for (int ref_mv_idx = 0; + ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type]; ++ref_mv_idx) { + int_mv near_mv; + get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext); + ret &= 
near_mv.as_int == 0; + } + if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) { + int_mv global_mv; + get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + ret &= global_mv.as_int != this_mv.as_int; + } + return ret; + } + return 0; } +#endif #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) @@ -7910,25 +6974,42 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -#if CONFIG_WEDGE static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *pred0, int stride0, const uint8_t *pred1, int stride1) { + static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = { + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, + // 8X16, 16X8, 16X16 + BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, + // 16X32, 32X16, 32X32 + BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, + // 32X64, 64X32, 64X64 + BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, + // 64x128, 128x64, 128x128 + BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, + // 4X16, 16X4, 8X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + // 32X8, 16X64, 64X16 + BLOCK_16X4, BLOCK_8X32, BLOCK_32X8 + }; const struct macroblock_plane *const p = &x->plane[0]; const uint8_t *src = p->src.buf; int src_stride = p->src.stride; - const int f_index = bsize - BLOCK_8X8; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; uint32_t esq[2][4]; int64_t tl, br; -#if CONFIG_HIGHBITDEPTH + const BLOCK_SIZE f_index = split_qtr[bsize]; + assert(f_index != BLOCK_INVALID); + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { pred0 = CONVERT_TO_BYTEPTR(pred0); pred1 = CONVERT_TO_BYTEPTR(pred1); } -#endif // CONFIG_HIGHBITDEPTH cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0, @@ -7947,100 +7028,14 @@ 
static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, pred1 + bh / 2 * stride1 + bw / 2, stride0, &esq[1][3]); - tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) - - (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]); - br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) - - (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]); + tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) - + ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]); + br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) - + ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]); return (tl + br > 0); } -#endif // CONFIG_WEDGE - -#if !CONFIG_DUAL_FILTER -static InterpFilter predict_interp_filter( - const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize, - const int mi_row, const int mi_col, - InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) { - InterpFilter best_filter = SWITCHABLE; - const AV1_COMMON *cm = &cpi->common; - const MACROBLOCKD *xd = &x->e_mbd; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int is_comp_pred = has_second_ref(mbmi); - const int this_mode = mbmi->mode; - int refs[2] = { mbmi->ref_frame[0], - (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; - if (pred_filter_search) { - InterpFilter af = SWITCHABLE, lf = SWITCHABLE; - if (xd->up_available) - af = av1_extract_interp_filter( - xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0); - if (xd->left_available) - lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0); - - if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf)) - best_filter = af; - } - if (is_comp_pred) { - if (cpi->sf.adaptive_mode_search) { - switch (this_mode) { - case NEAREST_NEARESTMV: - if (single_filter[NEARESTMV][refs[0]] == - single_filter[NEARESTMV][refs[1]]) - best_filter = single_filter[NEARESTMV][refs[0]]; - break; - case NEAR_NEARMV: - if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]]) - best_filter = single_filter[NEARMV][refs[0]]; - break; - case ZERO_ZEROMV: - if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]]) - best_filter = single_filter[ZEROMV][refs[0]]; - break; - case NEW_NEWMV: - if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]]) - best_filter = single_filter[NEWMV][refs[0]]; - break; - case NEAREST_NEWMV: - if (single_filter[NEARESTMV][refs[0]] == - single_filter[NEWMV][refs[1]]) - best_filter = single_filter[NEARESTMV][refs[0]]; - break; - case NEAR_NEWMV: - if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]]) - best_filter = single_filter[NEARMV][refs[0]]; - break; - case NEW_NEARESTMV: - if (single_filter[NEWMV][refs[0]] == - single_filter[NEARESTMV][refs[1]]) - best_filter = single_filter[NEWMV][refs[0]]; - break; - case NEW_NEARMV: - if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]]) - best_filter = single_filter[NEWMV][refs[0]]; - break; - default: - if (single_filter[this_mode][refs[0]] == - single_filter[this_mode][refs[1]]) - best_filter = single_filter[this_mode][refs[0]]; - break; - } - } - } - if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { - best_filter = EIGHTTAP_REGULAR; - } - return best_filter; -} 
-#endif // !CONFIG_DUAL_FILTER // Choose the best wedge index and sign -#if CONFIG_WEDGE static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, int *const best_wedge_sign, @@ -8058,12 +7053,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, int wedge_types = (1 << get_wedge_bits_lookup(bsize)); const uint8_t *mask; uint64_t sse; -#if CONFIG_HIGHBITDEPTH const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; -#else - const int bd_round = 0; -#endif // CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); @@ -8072,7 +7063,6 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, int64_t sign_limit; -#if CONFIG_HIGHBITDEPTH if (hbd) { aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); @@ -8080,9 +7070,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, CONVERT_TO_BYTEPTR(p1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else // NOLINT -#endif // CONFIG_HIGHBITDEPTH - { + } else { aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); @@ -8114,6 +7102,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { @@ -8123,7 +7112,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, } } - return best_rd; + return best_rd - + RDCOST(x->rdmult, 
x->wedge_idx_cost[bsize][*best_wedge_index], 0); } // Choose the best wedge index the specified sign @@ -8143,25 +7133,18 @@ static int64_t pick_wedge_fixed_sign( int wedge_types = (1 << get_wedge_bits_lookup(bsize)); const uint8_t *mask; uint64_t sse; -#if CONFIG_HIGHBITDEPTH const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; -#else - const int bd_round = 0; -#endif // CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); -#if CONFIG_HIGHBITDEPTH if (hbd) { aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else // NOLINT -#endif // CONFIG_HIGHBITDEPTH - { + } else { aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); } @@ -8175,6 +7158,7 @@ static int64_t pick_wedge_fixed_sign( sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { @@ -8183,7 +7167,8 @@ static int64_t pick_wedge_fixed_sign( } } - return best_rd; + return best_rd - + RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0); } static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, @@ -8192,7 +7177,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, const uint8_t *const p0, const uint8_t *const p1) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; int64_t rd; @@ -8200,7 +7185,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, int wedge_sign = 0; assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); - 
assert(cpi->common.allow_masked_compound); + assert(cpi->common.seq_params.enable_masked_compound); if (cpi->sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); @@ -8209,19 +7194,17 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index); } - mbmi->wedge_sign = wedge_sign; - mbmi->wedge_index = wedge_index; + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; return rd; } -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; @@ -8230,20 +7213,15 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, uint64_t sse; int64_t dist; int64_t rd0; - SEG_MASK_TYPE cur_mask_type; + DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; - SEG_MASK_TYPE best_mask_type = 0; -#if CONFIG_HIGHBITDEPTH + DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; -#else - const int bd_round = 0; -#endif // CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); -#if CONFIG_HIGHBITDEPTH if (hbd) { aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); @@ -8251,26 +7229,22 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, CONVERT_TO_BYTEPTR(p1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else // NOLINT -#endif // CONFIG_HIGHBITDEPTH - { + } else { aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); } // try each mask type and its inverse - for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) { -// build mask and inverse -#if CONFIG_HIGHBITDEPTH + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse if (hbd) - build_compound_seg_mask_highbd( + av1_build_compound_diffwtd_mask_highbd( xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, - CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH - build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw, - bsize, bh, bw); + av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, + bw, bh, bw); // compute rd for mask sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N); @@ -8286,35 +7260,31 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, } // make final mask - mbmi->mask_type = best_mask_type; -#if CONFIG_HIGHBITDEPTH + mbmi->interinter_comp.mask_type = best_mask_type; if (hbd) - build_compound_seg_mask_highbd( - xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw, 
- CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + av1_build_compound_diffwtd_mask_highbd( + xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0), + bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH - build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw, - bsize, bh, bw); + av1_build_compound_diffwtd_mask( + xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw); return best_rd; } -#endif // CONFIG_COMPOUND_SEGMENT -#if CONFIG_WEDGE && CONFIG_INTERINTRA static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { const MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t rd; int wedge_index = -1; assert(is_interintra_wedge_used(bsize)); - assert(cpi->common.allow_interintra_compound); + assert(cpi->common.seq_params.enable_interintra_compound); rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index); @@ -8322,22 +7292,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, mbmi->interintra_wedge_index = wedge_index; return rd; } -#endif // CONFIG_WEDGE && CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { - const COMPOUND_TYPE compound_type = - x->e_mbd.mi[0]->mbmi.interinter_compound_type; + const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type; switch (compound_type) { -#if CONFIG_WEDGE case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1); -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1); -#endif // CONFIG_COMPOUND_SEGMENT + case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, 
p1); default: assert(0); return 0; } } @@ -8346,46 +7309,23 @@ static int interinter_compound_motion_search( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; int tmp_rate_mv = 0; - const INTERINTER_COMPOUND_DATA compound_data = { -#if CONFIG_WEDGE - mbmi->wedge_index, - mbmi->wedge_sign, -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - mbmi->mask_type, - xd->seg_mask, -#endif // CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type - }; -#if CONFIG_COMPOUND_SINGLEREF - // NOTE: Mode is needed to identify the compound mode prediction, regardless - // of comp refs or single ref. - mbmi->mode = this_mode; -#endif // CONFIG_COMPOUND_SINGLEREF - - if (this_mode == NEW_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - || this_mode == SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; + + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; mbmi->mv[1].as_int = tmp_mv[1].as_int; } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { - do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv[0].as_int; - } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - // || this_mode == SR_NEAREST_NEWMV - || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - 
do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1); mbmi->mv[1].as_int = tmp_mv[1].as_int; } @@ -8394,22 +7334,23 @@ static int interinter_compound_motion_search( static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv, + const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, int *strides, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int rate_sum; int64_t dist_sum; int64_t best_rd_cur = INT64_MAX; int64_t rd = INT64_MAX; int tmp_skip_txfm_sb; int64_t tmp_skip_sse_sb; - const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); - best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0); + *rs2 += get_interinter_compound_mask_rate(x, mbmi); + best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); if (have_newmv_in_inter_mode(this_mode) && use_masked_motion_search(compound_type)) { @@ -8417,80 +7358,74 @@ static int64_t build_and_cost_compound_type( this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = 
cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; *out_rate_mv = rate_mv; - av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - preds0, strides, preds1, - strides); + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, + preds1, strides); } av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } else { - av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - preds0, strides, preds1, strides); + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, + preds1, strides); av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, *rs2 + rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } return best_rd_cur; } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE typedef struct { -#if CONFIG_MOTION_VAR - // Inter prediction buffers and respective strides + // OBMC secondary prediction buffers and respective strides uint8_t *above_pred_buf[MAX_MB_PLANE]; int above_pred_stride[MAX_MB_PLANE]; uint8_t *left_pred_buf[MAX_MB_PLANE]; int left_pred_stride[MAX_MB_PLANE]; -#endif // CONFIG_MOTION_VAR int_mv *single_newmv; // Pointer to array of motion vectors to use for each ref and their rates // Should point to first of 2 arrays in 2D array int *single_newmv_rate; + int *single_newmv_valid; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array - int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME]; - InterpFilter 
single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; + int64_t (*modelled_rd)[REF_FRAMES]; + InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES]; + int ref_frame_cost; + int single_comp_cost; } HandleInterModeArgs; +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->allow_high_precision_mv, + cm->cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return !mv_check_bounds(&x->mv_limits, &out_mv->as_mv); +} + static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, - const BLOCK_SIZE bsize, - int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_COMPOUND_SINGLEREF - int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_COMPOUND_SINGLEREF + const BLOCK_SIZE bsize, int_mv *cur_mv, const int mi_row, const int mi_col, - int *const rate_mv, int_mv *const single_newmv, + int *const rate_mv, HandleInterModeArgs *const args) { const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); - int_mv *const frame_mv = mode_mv[this_mode]; -#if CONFIG_COMPOUND_SINGLEREF - int_mv *const frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_COMPOUND_SINGLEREF const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; int i; @@ -8498,392 +7433,338 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, (void)args; if (is_comp_pred) { - for (i = 0; i < 2; ++i) { - single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int; - } - if (this_mode == NEW_NEWMV) { - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; + cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, // int_mv *frame_comp_mv -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, NULL, 0, rate_mv, 0); + joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL, + 0, rate_mv, 0); } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { - av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); - *rate_mv += av1_mv_bit_cost( - &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + const int_mv ref_mv = av1_get_ref_mv(x, i); + av1_set_mvcost(x, i, mbmi->ref_mv_idx); + *rate_mv += + av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); } } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - frame_mv[refs[0]].as_int = - mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int; - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, 0, - rate_mv, 0, 1); + compound_single_motion_search_interinter( + cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); } else { - av1_set_mvcost(x, refs[1], 1, 
mbmi->ref_mv_idx); - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, - &mbmi_ext->ref_mvs[refs[1]][0].as_mv, + av1_set_mvcost(x, 1, + mbmi->ref_mv_idx + (this_mode == NEAR_NEWMV ? 1 : 0)); + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } else { assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - frame_mv[refs[1]].as_int = - mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int; - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, 0, - rate_mv, 0, 0); + compound_single_motion_search_interinter( + cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); } else { - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + const int_mv ref_mv = av1_get_ref_mv(x, 0); + av1_set_mvcost(x, 0, + mbmi->ref_mv_idx + (this_mode == NEW_NEARMV ? 1 : 0)); + *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(this_mode)) { - // Single ref comp mode - const int mode0 = compound_ref0_mode(this_mode); - - single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int; - frame_mv[refs[0]].as_int = (mode0 == NEWMV) - ? 
single_newmv[refs[0]].as_int - : mode_mv[mode0][refs[0]].as_int; - assert(compound_ref1_mode(this_mode) == NEWMV); - frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - if (this_mode == SR_NEW_NEWMV) { - joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row, - mi_col, NULL, NULL, 0, rate_mv, 0); - } else { - assert( // this_mode == SR_NEAREST_NEWMV || - this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV); - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, - frame_comp_mv, mi_row, mi_col, - NULL, 0, rate_mv, 0, 1); - } - } else { - *rate_mv = 0; - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - if (mode0 == NEWMV) - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - } -#endif // CONFIG_COMPOUND_SINGLEREF } else { - if (is_comp_interintra_pred) { - x->best_mv = args->single_newmv[refs[0]]; - *rate_mv = args->single_newmv_rate[refs[0]]; - } else { - single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); - args->single_newmv[refs[0]] = x->best_mv; - args->single_newmv_rate[refs[0]] = *rate_mv; - } - + single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); if (x->best_mv.as_int == INVALID_MV) return INT64_MAX; - frame_mv[refs[0]] = x->best_mv; - xd->mi[0]->bmi[0].as_mv[0] = x->best_mv; + args->single_newmv[refs[0]] = x->best_mv; + args->single_newmv_rate[refs[0]] = *rate_mv; + args->single_newmv_valid[refs[0]] = 1; + cur_mv[0].as_int = x->best_mv.as_int; + +#if USE_DISCOUNT_NEWMV_TEST // Estimate the rate implications of a new mv but discount this // under certain circumstances where we want to help initiate a weak // motion field, where the distortion gain for a single block may not // be enough to 
overcome the cost of a new mv. - if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) { + if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) { *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1); } +#endif } return 0; } -int64_t interpolation_filter_search( +static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +// calculate the rdcost of given interpolation_filter +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int tmp_rate, tmp_skip_sb = 0; + int64_t tmp_dist, tmp_skip_sse = INT64_MAX; + + const InterpFilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = av1_get_switchable_rate(cm, x, xd); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, + &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + *skip_txfm_sb = tmp_skip_sb; + *skip_sse_sb = tmp_skip_sse; + swap_dst_buf(xd, dst_bufs, num_planes); + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +// check if there is saved result match with this search +static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, + MB_MODE_INFO *const mi) { + for (int 
i = 0; i < 2; ++i) { + if ((st->ref_frames[i] != mi->ref_frame[i]) || + (st->mv[i].as_int != mi->mv[i].as_int)) { + return 0; + } + } + return 1; +} + +static INLINE int find_interp_filter_in_stats(MACROBLOCK *x, + MB_MODE_INFO *const mbmi) { + const int comp_idx = mbmi->compound_idx; + const int offset = x->interp_filter_stats_idx[comp_idx]; + for (int j = 0; j < offset; ++j) { + const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j]; + if (is_interp_filter_match(st, mbmi)) { + mbmi->interp_filters = st->filters; + return j; + } + } + return -1; // no match result found +} + +static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, + MB_MODE_INFO *const mbmi) { + const int comp_idx = mbmi->compound_idx; + const int offset = x->interp_filter_stats_idx[comp_idx]; + if (offset < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { + mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], mbmi->ref_frame[1] }, + }; + x->interp_filter_stats[comp_idx][offset] = stat; + x->interp_filter_stats_idx[comp_idx]++; + } +} + +static int64_t interpolation_filter_search( MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, - BUFFER_SET *const orig_dst, - InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME], + BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, int64_t *const skip_sse_sb) { const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int i; - int tmp_rate; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int i, tmp_rate; int64_t tmp_dist; (void)single_filter; - - InterpFilter assign_filter = SWITCHABLE; - - if (cm->interp_filter == SWITCHABLE) { -#if 
!CONFIG_DUAL_FILTER - assign_filter = av1_is_interp_needed(xd) - ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col, - single_filter) - : cm->interp_filter; -#endif // !CONFIG_DUAL_FILTER - } else { - assign_filter = cm->interp_filter; + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); } - - set_default_interp_filters(mbmi, assign_filter); - *switchable_rate = av1_get_switchable_rate(cm, x, xd); av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, - skip_txfm_sb, skip_sse_sb); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, + skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); - if (assign_filter == SWITCHABLE) { - // do interp_filter search - if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) { -#if CONFIG_DUAL_FILTER - const int filter_set_size = DUAL_FILTER_SET_SIZE; -#else - const int filter_set_size = SWITCHABLE_FILTERS; -#endif // CONFIG_DUAL_FILTER - int best_in_temp = 0; - InterpFilters best_filters = mbmi->interp_filters; - restore_dst_buf(xd, *tmp_dst); - // EIGHTTAP_REGULAR mode is calculated beforehand - for (i = 1; i < filter_set_size; ++i) { - int tmp_skip_sb = 0; - int64_t tmp_skip_sse = INT64_MAX; - int tmp_rs; - int64_t tmp_rd; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]); -#else - mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i); -#endif // CONFIG_DUAL_FILTER - tmp_rs = av1_get_switchable_rate(cm, x, xd); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 
0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); - tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); - - if (tmp_rd < *rd) { - *rd = tmp_rd; - *switchable_rate = av1_get_switchable_rate(cm, x, xd); - best_filters = mbmi->interp_filters; - *skip_txfm_sb = tmp_skip_sb; - *skip_sse_sb = tmp_skip_sse; - best_in_temp = !best_in_temp; - if (best_in_temp) { - restore_dst_buf(xd, *orig_dst); - } else { - restore_dst_buf(xd, *tmp_dst); - } - } - } - if (best_in_temp) { - restore_dst_buf(xd, *tmp_dst); - } else { - restore_dst_buf(xd, *orig_dst); + if (assign_filter != SWITCHABLE || match_found != -1) { + return 0; + } + if (!need_search) { + assert(mbmi->interp_filters == + av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); + return 0; + } + // do interp_filter search + const int filter_set_size = DUAL_FILTER_SET_SIZE; + restore_dst_buf(xd, *tmp_dst, num_planes); + const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; + if (cpi->sf.use_fast_interpolation_filter_search && + cm->seq_params.enable_dual_filter) { + // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR + int best_dual_mode = 0; + // Find best of {R}x{R,Sm,Sh} + // EIGHTTAP_REGULAR mode is calculated beforehand + for (i = 1; i < SWITCHABLE_FILTERS; ++i) { + if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i)) { + best_dual_mode = i; + } + } + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes + for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; + i += SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i); + } + } else { + // EIGHTTAP_REGULAR mode is calculated beforehand + for (i = 1; i < filter_set_size; ++i) { + if (cm->seq_params.enable_dual_filter == 0) { + const int16_t filter_y = filter_sets[i] & 0xffff; + const int16_t filter_x = filter_sets[i] >> 16; + if 
(filter_x != filter_y) continue; } - mbmi->interp_filters = best_filters; - } else { - assert(mbmi->interp_filters == - av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i); } } - + swap_dst_buf(xd, dst_bufs, num_planes); + // save search results + if (cpi->sf.skip_repeat_interpolation_filter_search) { + assert(match_found == -1); + save_interp_filter_search_stat(x, mbmi); + } return 0; } -#if CONFIG_DUAL_FILTER -static InterpFilters condition_interp_filters_on_mv( - InterpFilters interp_filters, const MACROBLOCKD *xd) { - InterpFilter filters[2]; - for (int i = 0; i < 2; ++i) - filters[i] = (has_subpel_mv_component(xd->mi[0], xd, i)) - ? av1_extract_interp_filter(interp_filters, i) - : EIGHTTAP_REGULAR; - - return av1_make_interp_filters(filters[0], filters[1]); -} -#endif - // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove -static int64_t motion_mode_rd( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, - int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, - const int *refs, int rate_mv, -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - // only used when WARPED_MOTION is on? 
- int_mv *const single_newmv, int rate2_bmc_nocoeff, - MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { +static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *const args, + int64_t ref_best_rd, const int *refs, int rate_mv, + BUFFER_SET *orig_dst +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + , + int64_t *best_est_rd +#endif +) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - - (void)mode_mv; - (void)mi_row; - (void)mi_col; - (void)args; - (void)refs; - (void)rate_mv; - (void)is_comp_pred; - (void)this_mode; -#if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - (void)single_newmv; -#endif - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - MOTION_MODE motion_mode, last_motion_mode_allowed; int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; MB_MODE_INFO base_mbmi, best_mbmi; -#if CONFIG_VAR_TX - uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4]; -#endif // CONFIG_VAR_TX -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - -#if CONFIG_WARPED_MOTION -#if WARPED_MOTION_SORT_SAMPLES + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int interintra_allowed = cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi) && mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; - int pts_mv0[SAMPLES_ARRAY_SIZE]; int total_samples; -#else - int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; 
-#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + (void)rate_mv; + av1_invalid_rd_stats(&best_rd_stats); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs; -#if CONFIG_WARPED_MOTION aom_clear_system_state(); -#if WARPED_MOTION_SORT_SAMPLES - mbmi->num_proj_ref[0] = - findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0); + mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); total_samples = mbmi->num_proj_ref[0]; -#else - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rate2_nocoeff = rd_stats->rate; - last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); base_mbmi = *mbmi; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + MOTION_MODE last_motion_mode_allowed = + cm->switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t best_rd = INT64_MAX; - for (motion_mode = SIMPLE_TRANSLATION; - motion_mode <= last_motion_mode_allowed; motion_mode++) { + + for (int mode_index = (int)SIMPLE_TRANSLATION; + mode_index <= (int)last_motion_mode_allowed + interintra_allowed; + mode_index++) { int64_t tmp_rd = INT64_MAX; - int tmp_rate; - int64_t tmp_dist; - int tmp_rate2 = - motion_mode != SIMPLE_TRANSLATION ? 
rate2_bmc_nocoeff : rate2_nocoeff; - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT - // right now since it requires mvs from all neighboring blocks. We will - // check if this mode is beneficial after all the mv's in the current - // superblock are selected. - if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue; -#endif + int tmp_rate2 = rate2_nocoeff; + int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int skip_txfm_sb = 0; *mbmi = base_mbmi; - mbmi->motion_mode = motion_mode; -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { - *mbmi = *best_bmc_mbmi; + if (is_interintra_mode) { + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. + // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + } else if (mbmi->motion_mode == OBMC_CAUSAL) { mbmi->motion_mode = OBMC_CAUSAL; - if (!is_comp_pred && -#if CONFIG_COMPOUND_SINGLEREF - !is_inter_singleref_comp_mode(this_mode) && -#endif // CONFIG_COMPOUND_SINGLEREF - have_newmv_in_inter_mode(this_mode)) { + if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; - if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, - refs[0])) { +#if USE_DISCOUNT_NEWMV_TEST + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } - tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - condition_interp_filters_on_mv(mbmi->interp_filters, xd); -#endif // CONFIG_DUAL_FILTER - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, 
orig_dst, bsize); - } else { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); +#endif + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; } + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb); - } -#endif // CONFIG_MOTION_VAR - -#if CONFIG_WARPED_MOTION - if (mbmi->motion_mode == WARPED_CAUSAL) { -#if WARPED_MOTION_SORT_SAMPLES + } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#endif // WARPED_MOTION_SORT_SAMPLES - *mbmi = *best_bmc_mbmi; mbmi->motion_mode = WARPED_CAUSAL; mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; mbmi->interp_filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); -#if WARPED_MOTION_SORT_SAMPLES memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); - // Rank the samples by motion vector difference + // Select the samples according to motion vector difference if (mbmi->num_proj_ref[0] > 1) { - mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts, - pts_inref, mbmi->num_proj_ref[0]); - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; + mbmi->num_proj_ref[0] = selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize); } -#endif // WARPED_MOTION_SORT_SAMPLES if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, @@ -8892,144 +7773,299 @@ static int64_t motion_mode_rd( if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; const int_mv mv0 = mbmi->mv[0]; - WarpedMotionParams wm_params0 = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES + const 
WarpedMotionParams wm_params0 = mbmi->wm_params[0]; int num_proj_ref0 = mbmi->num_proj_ref[0]; // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, - pts_mv0, total_samples); -#else - // Refine MV in a small range. - av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES + total_samples); // Keep the refined MV and WM parameters. if (mv0.as_int != mbmi->mv[0].as_int) { const int ref = refs[0]; - const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; - + const int_mv ref_mv = av1_get_ref_mv(x, 0); tmp_rate_mv = - av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); + av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = mbmi->mv[0].as_mv; - single_newmv[ref] = mbmi->mv[0]; - - if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, - refs[0])) { +#if USE_DISCOUNT_NEWMV_TEST + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } -#if WARPED_MOTION_SORT_SAMPLES - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES - tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - condition_interp_filters_on_mv(mbmi->interp_filters, xd); -#endif // CONFIG_DUAL_FILTER +#endif + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; } else { // Restore the old MV and WM parameters. 
mbmi->mv[0] = mv0; mbmi->wm_params[0] = wm_params0; -#if WARPED_MOTION_SORT_SAMPLES mbmi->num_proj_ref[0] = num_proj_ref0; -#endif // WARPED_MOTION_SORT_SAMPLES } } av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb); } else { continue; } + } else if (is_interintra_mode) { + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t rd, best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int j; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int bw = block_size_wide[bsize]; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf, *intrapred; + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); + intrapred = CONVERT_TO_BYTEPTR(intrapred_); + } else { + tmp_buf = tmp_buf_; + intrapred = intrapred_; + } + const int_mv mv0 = mbmi->mv[0]; + + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + 
best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); + best_interintra_rd = rd; + + if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { + // restore ref_frame[1] + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + + if (is_interintra_wedge_used(bsize)) { + int64_t best_interintra_rd_nowedge = INT64_MAX; + int64_t best_interintra_rd_wedge = INT64_MAX; + int_mv tmp_mv; + InterpFilters backup_interp_filters = mbmi->interp_filters; + int rwedge = x->wedge_interintra_cost[bsize][0]; + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum); + best_interintra_rd_nowedge = rd; + + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + x->wedge_interintra_cost[bsize][1]; + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); + // Refine motion vector. 
+ if (have_newmv_in_inter_mode(mbmi->mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + tmp_mv = av1_get_ref_mv(x, 0); + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, + &tmp_rate_mv, 0); + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, + bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, + NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, + dist_sum); + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = rate_mv; + mbmi->interp_filters = backup_interp_filters; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + } else { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, + INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + mbmi->mv[0].as_int = tmp_mv.as_int; + tmp_rate2 += tmp_rate_mv - rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->mv[0].as_int = mv0.as_int; + mbmi->interp_filters = backup_interp_filters; + } + } else { + mbmi->use_wedge_interintra = 0; + } + } // if (is_interintra_wedge_used(bsize)) + restore_dst_buf(xd, *orig_dst, num_planes); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); } -#endif // CONFIG_WARPED_MOTION + + if (!cpi->common.all_lossless) + check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); + x->skip = 0; 
rd_stats->dist = 0; rd_stats->sse = 0; rd_stats->skip = 1; rd_stats->rate = tmp_rate2; - if (last_motion_mode_allowed > SIMPLE_TRANSLATION) { -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - if (last_motion_mode_allowed == WARPED_CAUSAL) -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + if (av1_is_interp_needed(xd)) + rd_stats->rate += av1_get_switchable_rate(cm, x, xd); + if (interintra_allowed) { + rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]] + [mbmi->interintra_mode]; + if (is_interintra_wedge_used(bsize)) { + rd_stats->rate += + x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra]; + if (mbmi->use_wedge_interintra) { + rd_stats->rate += + av1_cost_literal(get_interintra_wedge_bits(bsize)); + } + } + } + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode]; -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - else + } else { rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - } -#if CONFIG_WARPED_MOTION - if (mbmi->motion_mode == WARPED_CAUSAL) { - rd_stats->rate -= rs; + } } -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (!*skip_txfm_sb) { + if (!skip_txfm_sb) { +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t est_rd = 0; + int est_skip = 0; + if (cpi->sf.inter_mode_rd_model_estimation) { + InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type]; + if (md->ready) { + const int64_t curr_sse = get_sse(cpi, x); + est_rd = + get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate); + est_skip = est_rd * 0.8 > *best_est_rd; +#if INTER_MODE_RD_TEST + if (est_rd < *best_est_rd) { + *best_est_rd = est_rd; + } 
+#else // INTER_MODE_RD_TEST + if (est_skip) { + ++md->skip_count; + mbmi->ref_frame[1] = ref_frame_1; + continue; + } else { + if (est_rd < *best_est_rd) { + *best_est_rd = est_rd; + } + ++md->non_skip_count; + } +#endif // INTER_MODE_RD_TEST + } + } +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t rdcosty = INT64_MAX; int is_cost_valid_uv = 0; // cost and distortion av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); + // Motion mode + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, + ref_best_rd); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 } else { - int idx, idy; super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y->skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats_y->skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); } -#else - /* clang-format off */ - super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); -/* clang-format on */ -#endif // CONFIG_VAR_TX if (rd_stats_y->rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (mbmi->motion_mode != SIMPLE_TRANSLATION) { + if (mbmi->motion_mode != SIMPLE_TRANSLATION || + mbmi->ref_frame[1] == INTRA_FRAME) { + mbmi->ref_frame[1] = ref_frame_1; continue; } else { -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; return INT64_MAX; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } 
av1_merge_rd_stats(rd_stats, rd_stats_y); rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); -/* clang-format off */ -#if CONFIG_VAR_TX - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); -#else - is_cost_valid_uv = - super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); -#endif // CONFIG_VAR_TX - if (!is_cost_valid_uv) { -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - continue; -#else - restore_dst_buf(xd, *orig_dst); - return INT64_MAX; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (num_planes > 1) { + /* clang-format off */ + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty, + FTXS_NONE); + if (!is_cost_valid_uv) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } else { + av1_init_rd_stats(rd_stats_uv); } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); #if CONFIG_RD_DEBUG // record transform block coefficient cost // TODO(angiebird): So far rd_debug tool only detects discrepancy of @@ -9038,2250 +8074,2167 @@ static int64_t motion_mode_rd( // other place when we need to compare non-coefficient cost. 
mbmi->rd_stats = *rd_stats; #endif // CONFIG_RD_DEBUG -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + const int skip_ctx = av1_get_skip_context(xd); if (rd_stats->skip) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[skip_ctx][1]; mbmi->skip = 0; // here mbmi->skip temporarily plays a role as what this_skip2 does } else if (!xd->lossless[mbmi->segment_id] && (RDCOST(x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - rd_stats->dist) >= - RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1), - rd_stats->sse))) { + x->skip_cost[skip_ctx][0], + rd_stats->dist) >= RDCOST(x->rdmult, + x->skip_cost[skip_ctx][1], + rd_stats->sse))) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[skip_ctx][1]; rd_stats->dist = rd_stats->sse; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; mbmi->skip = 1; } else { - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + rd_stats->rate += x->skip_cost[skip_ctx][0]; mbmi->skip = 0; } *disable_skip = 0; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { +#if INTER_MODE_RD_TEST + if (md->ready) { + int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (est_skip) { + ++md->skip_count; + if (real_rd < ref_best_rd) { + ++md->fp_skip_count; + } + // int fp_skip = real_rd < ref_best_rd; + // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd + // %ld ref_best_rd %ld\n", + // est_skip, fp_skip, est_rd, *best_est_rd, real_rd, + // ref_best_rd); + } else { + ++md->non_skip_count; + } + } +#endif // INTER_MODE_RD_TEST + inter_mode_data_push(mbmi->sb_type, rd_stats->sse, 
rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip], + rd_stats->rate, ref_best_rd); + } +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + } } else { x->skip = 1; *disable_skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); -// The cost of skip bit needs to be added. -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + // The cost of skip bit needs to be added. mbmi->skip = 0; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; - rd_stats->dist = *skip_sse_sb; - rd_stats->sse = *skip_sse_sb; + rd_stats->dist = 0; + rd_stats->sse = 0; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; rd_stats->skip = 1; } -#if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) { - if (is_nontrans_global_motion(xd)) { - rd_stats->rate -= rs; + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { mbmi->interp_filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); } } -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) { + if ((mbmi->motion_mode == SIMPLE_TRANSLATION && + mbmi->ref_frame[1] != INTRA_FRAME) || + (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; - best_rd_stats_uv = *rd_stats_uv; -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memcpy(best_blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); -#endif // CONFIG_VAR_TX + if (num_planes > 1) 
best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); best_xskip = x->skip; best_disable_skip = *disable_skip; + if (best_xskip) break; } } + mbmi->ref_frame[1] = ref_frame_1; if (best_rd == INT64_MAX) { av1_invalid_rd_stats(rd_stats); - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); return INT64_MAX; } *mbmi = best_mbmi; *rd_stats = best_rd_stats; *rd_stats_y = best_rd_stats_y; - *rd_stats_uv = best_rd_stats_uv; -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip[i], best_blk_skip[i], - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); -#endif // CONFIG_VAR_TX + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); x->skip = best_xskip; *disable_skip = best_disable_skip; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, + int mi_col, BUFFER_SET *const orig_dst) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + av1_subtract_plane(x, bsize, plane); + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + sse = sse << 4; + total_sse += sse; + } + const int skip_mode_ctx = 
av1_get_skip_mode_context(xd); + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1]; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + restore_dst_buf(xd, *orig_dst, num_planes); return 0; } +#ifndef NDEBUG +static INLINE int is_single_inter_mode(int this_mode) { + return this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END; +} +#endif + +static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { + assert(is_single_inter_mode(single_mode)); + int ref_mv_offset; + if (single_mode == NEARESTMV) { + ref_mv_offset = 0; + } else if (single_mode == NEARMV) { + ref_mv_offset = ref_mv_idx + 1; + } else { + ref_mv_offset = -1; + } + return ref_mv_offset; +} + +static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, + int ref_mv_idx, + const MV_REFERENCE_FRAME *ref_frame, + const MB_MODE_INFO_EXT *mbmi_ext) { + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int is_comp_pred = ref_frame[1] > INTRA_FRAME; + const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); + assert(is_single_inter_mode(single_mode)); + if (single_mode == NEWMV) { + this_mv->as_int = INVALID_MV; + } else if (single_mode == GLOBALMV) { + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } else { + assert(single_mode == NEARMV || single_mode == NEARESTMV); + const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx); + if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) { + assert(ref_mv_offset >= 0); + if (ref_idx == 0) { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv; + } else { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv; + } + } else { + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } + } +} + +// This function update the non-new mv for the current prediction mode +static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, + const 
AV1_COMMON *cm, const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + int ret = 1; + for (int i = 0; i < is_comp_pred + 1; ++i) { + int_mv this_mv; + get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame, + x->mbmi_ext); + const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + if (single_mode == NEWMV) { + cur_mv[i] = this_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + int (*drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, - int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_COMPOUND_SINGLEREF - int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_COMPOUND_SINGLEREF - int mi_row, int mi_col, - HandleInterModeArgs *args, - const int64_t ref_best_rd) { + int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *args, int64_t ref_best_rd +#if 
CONFIG_COLLECT_INTER_MODE_RD_STATS + , + int64_t *best_est_rd +#endif +) { const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const int this_mode = mbmi->mode; -#if CONFIG_COMPOUND_SINGLEREF - const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode); -#endif // CONFIG_COMPOUND_SINGLEREF - int_mv *frame_mv = mode_mv[this_mode]; -#if CONFIG_COMPOUND_SINGLEREF - // The comp mv for the compound mode in single ref - int_mv *frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_COMPOUND_SINGLEREF int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; - int_mv cur_mv[2]; int rate_mv = 0; - int pred_exists = 1; -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA const int bw = block_size_wide[bsize]; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - int_mv single_newmv[TOTAL_REFS_PER_FRAME]; -#if CONFIG_INTERINTRA - const int *const interintra_mode_cost = - x->interintra_mode_cost[size_group_lookup[bsize]]; -#endif // CONFIG_INTERINTRA - const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); uint8_t *tmp_buf; - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int rate2_bmc_nocoeff; - MB_MODE_INFO best_bmc_mbmi; - int rate_mv_bmc; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t rd = INT64_MAX; BUFFER_SET orig_dst, tmp_dst; - int rs = 0; - int skip_txfm_sb = 0; - int64_t 
skip_sse_sb = INT64_MAX; - int16_t mode_ctx; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - // dummy fillers - mbmi->ncobmc_mode[0] = NO_OVERLAP; - mbmi->ncobmc_mode[1] = NO_OVERLAP; -#endif + int skip_txfm_sb = 0; + int64_t skip_sse_sb = INT64_MAX; + int16_t mode_ctx; + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); + else + tmp_buf = tmp_buf_; + // Make sure that we didn't leave the plane destination buffers set + // to tmp_buf at the end of the last iteration + assert(xd->plane[0].dst.buf != tmp_buf); + + mbmi->num_proj_ref[0] = 0; + mbmi->num_proj_ref[1] = 0; + + if (is_comp_pred) { + for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { + const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); + if (single_mode == NEWMV && + args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV) + return INT64_MAX; + } + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + rd_stats->rate += + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + const RD_STATS backup_rd_stats = *rd_stats; + const RD_STATS backup_rd_stats_y = *rd_stats_y; + const RD_STATS backup_rd_stats_uv = *rd_stats_uv; + const MB_MODE_INFO backup_mbmi = *mbmi; + INTERINTER_COMPOUND_DATA best_compound_data; + uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + int64_t best_ret_val = 
INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + MB_MODE_INFO best_mbmi = *mbmi; + int64_t early_terminate = 0; + int plane_rate[MAX_MB_PLANE] = { 0 }; + int64_t plane_sse[MAX_MB_PLANE] = { 0 }; + int64_t plane_dist[MAX_MB_PLANE] = { 0 }; + int64_t newmv_ret_val = INT64_MAX; + int_mv backup_mv[2] = { { 0 } }; + int backup_rate_mv = 0; + + int comp_idx; + const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & + (mbmi->mode != GLOBAL_GLOBALMV); + // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. + for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { + int rs = 0; + int compmode_interinter_cost = 0; + early_terminate = 0; + *rd_stats = backup_rd_stats; + *rd_stats_y = backup_rd_stats_y; + *rd_stats_uv = backup_rd_stats_uv; + *mbmi = backup_mbmi; + mbmi->compound_idx = comp_idx; + + if (is_comp_pred && comp_idx == 0) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 0; + + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + if (masked_compound_used) { + compmode_interinter_cost += + x->comp_group_idx_cost[comp_group_idx_ctx][0]; + } + compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0]; + } + + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, this_mode, cm, x)) { + early_terminate = INT64_MAX; + continue; + } + if (have_newmv_in_inter_mode(this_mode)) { + if (comp_idx == 0) { + cur_mv[0] = backup_mv[0]; + cur_mv[1] = backup_mv[1]; + rate_mv = backup_rate_mv; + } + + // when jnt_comp_skip_mv_search flag is on, new mv will be searched once + if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search && + comp_idx == 0)) { + newmv_ret_val = + handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args); + + // Store cur_mv and rate_mv so that they can be restored in the next + // iteration of the loop + backup_mv[0] = cur_mv[0]; + backup_mv[1] = cur_mv[1]; + backup_rate_mv = rate_mv; + } + + if (newmv_ret_val != 0) { + 
early_terminate = INT64_MAX; + continue; + } else { + rd_stats->rate += rate_mv; + } + } + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + // Initialise tmp_dst and orig_dst buffers to prevent "may be used + // uninitialized" warnings in GCC when the stream is monochrome. + memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); + memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < num_planes; i++) { + tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; + tmp_dst.stride[i] = MAX_SB_SIZE; + } + for (i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); +#if USE_DISCOUNT_NEWMV_TEST + // We don't include the cost of the second reference here, because there + // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other + // words if you present them in that order, the second one is always known + // if the first is known. + // + // Under some circumstances we discount the cost of new mv mode to encourage + // initiation of a motion field. + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { + // discount_newmv_test only applies discount on NEWMV mode. 
+ assert(this_mode == NEWMV); + rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx), + cost_mv_ref(x, NEARESTMV, mode_ctx)); + } else { + rd_stats->rate += ref_mv_cost; + } +#else + rd_stats->rate += ref_mv_cost; +#endif + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + early_terminate = INT64_MAX; + continue; + } + + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, + &rd, &rs, &skip_txfm_sb, &skip_sse_sb); + if (ret_val != 0) { + early_terminate = INT64_MAX; + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) { + early_terminate = INT64_MAX; + restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 4) > ref_best_rd) break; + continue; + } + + if (is_comp_pred && comp_idx) { + int rate_sum, rs2; + int64_t dist_sum; + int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; + int_mv best_mv[2]; + int best_tmp_rate_mv = rate_mv; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); + uint8_t *preds0[1] = { pred0 }; + uint8_t *preds1[1] = { pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + const int num_pix = 1 << num_pels_log2_lookup[bsize]; + COMPOUND_TYPE cur_type; + int best_compmode_interinter_cost = 0; + int can_use_previous = cm->allow_warped_motion; + + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + + if (masked_compound_used) { + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, + can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, + 
can_use_previous); + } + + int best_comp_group_idx = 0; + int best_compound_idx = 1; + for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { + if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; + if (!is_interinter_compound_used(cur_type, bsize)) continue; + tmp_rate_mv = rate_mv; + best_rd_cur = INT64_MAX; + mbmi->interinter_comp.type = cur_type; + int masked_type_cost = 0; + + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + if (masked_compound_used) { + if (cur_type == COMPOUND_AVERAGE) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + } else { + mbmi->comp_group_idx = 1; + mbmi->compound_idx = 1; + + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost += + x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1]; + } + } else { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + + masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + } + rs2 = masked_type_cost; + + switch (cur_type) { + case COMPOUND_AVERAGE: + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, + INT64_MAX); + if (rd != INT64_MAX) + best_rd_cur = + RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); + break; + case COMPOUND_WEDGE: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; + case COMPOUND_DIFFWTD: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < 
ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; + default: assert(0); return INT64_MAX; + } + + if (best_rd_cur < best_rd_compound) { + best_comp_group_idx = mbmi->comp_group_idx; + best_compound_idx = mbmi->compound_idx; + best_rd_compound = best_rd_cur; + best_compound_data = mbmi->interinter_comp; + memcpy(tmp_best_mask_buf, xd->seg_mask, + 2 * num_pix * sizeof(uint8_t)); + best_compmode_interinter_cost = rs2; + if (have_newmv_in_inter_mode(this_mode)) { + if (use_masked_motion_search(cur_type)) { + best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } + } + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + mbmi->comp_group_idx = best_comp_group_idx; + mbmi->compound_idx = best_compound_idx; + mbmi->interinter_comp = best_compound_data; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t)); + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + if (use_masked_motion_search(mbmi->interinter_comp.type)) { + rd_stats->rate += best_tmp_rate_mv - rate_mv; + rate_mv = best_tmp_rate_mv; + } + } + + if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; + } + compmode_interinter_cost = best_compmode_interinter_cost; + } + + if (is_comp_pred) { + int tmp_rate; + int64_t tmp_dist; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, 
num_planes - 1, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, + plane_sse, plane_dist); + rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + } + + if (search_jnt_comp) { + // if 1/2 model rd is larger than best_rd in jnt_comp mode, + // use jnt_comp mode, save additional search + if ((rd >> 1) > best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + + if (!is_comp_pred) + args->single_filter[this_mode][refs[0]] = + av1_extract_interp_filter(mbmi->interp_filters, 0); + + if (args->modelled_rd != NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], + args->modelled_rd[mode1][refs[1]]); + if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; + } + } else { + args->modelled_rd[this_mode][refs[0]] = rd; + } + } + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; + } + } + + rd_stats->rate += compmode_interinter_cost; + + if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { + // TODO(chengchen): this speed feature introduces big loss. + // Need better estimation of rate distortion. 
+ rd_stats->rate += rs; + rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; + rd_stats_y->rate = plane_rate[0]; + rd_stats_uv->rate = plane_rate[1] + plane_rate[2]; + rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2]; + rd_stats_y->sse = plane_sse[0]; + rd_stats_uv->sse = plane_sse[1] + plane_sse[2]; + rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2]; + rd_stats_y->dist = plane_dist[0]; + rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; + } else { +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mi_row, mi_col, args, ref_best_rd, + refs, rate_mv, &orig_dst, best_est_rd); +#else + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mi_row, mi_col, args, ref_best_rd, + refs, rate_mv, &orig_dst); +#endif + } + if (ret_val != INT64_MAX) { + if (search_jnt_comp) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmp_rd < best_rd) { + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_ret_val = ret_val; + best_rd = tmp_rd; + best_mbmi = *mbmi; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + } + } + } + if (!search_jnt_comp && ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + return ret_val; + } + restore_dst_buf(xd, orig_dst, num_planes); + } + + // re-instate status of the best choice + if (is_comp_pred && best_ret_val != INT64_MAX) { + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + ret_val = best_ret_val; + *mbmi = best_mbmi; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (early_terminate == INT64_MAX) return INT64_MAX; + if 
(ret_val != 0) return ret_val; + return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); +} -#if CONFIG_INTERINTRA - int compmode_interintra_cost = 0; - mbmi->use_wedge_interintra = 0; -#endif -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - int compmode_interinter_cost = 0; - mbmi->interinter_compound_type = COMPOUND_AVERAGE; -#endif -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (!av1_allow_intrabc(cm)) return INT64_MAX; + const int num_planes = av1_num_planes(cm); -#if CONFIG_INTERINTRA - if (!cm->allow_interintra_compound && is_comp_interintra_pred) - return INT64_MAX; -#endif // CONFIG_INTERINTRA - - // is_comp_interintra_pred implies !is_comp_pred - assert(!is_comp_interintra_pred || (!is_comp_pred)); - // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type) - assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi)); - -#if CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred || is_singleref_comp_mode) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred) -#endif // CONFIG_COMPOUND_SINGLEREF - mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; - else - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); + const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row >> cm->seq_params.mib_size_log2; + const int sb_col = mi_col >> cm->seq_params.mib_size_log2; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); - else -#endif // CONFIG_HIGHBITDEPTH - tmp_buf = tmp_buf_; - // Make sure 
that we didn't leave the plane destination buffers set - // to tmp_buf at the end of the last iteration - assert(xd->plane[0].dst.buf != tmp_buf); + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); -#if CONFIG_WARPED_MOTION - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; -#endif // CONFIG_WARPED_MOTION + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); - if (is_comp_pred) { - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_singleref_comp_mode) { - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_comp_mv[refs[0]].as_int == INVALID_MV) - return INT64_MAX; -#endif // CONFIG_COMPOUND_SINGLEREF - } + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col); + // Ref DV should not have sub-pel. 
+ assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; - mbmi->motion_mode = SIMPLE_TRANSLATION; - if (have_newmv_in_inter_mode(this_mode)) { - const int64_t ret_val = - handle_newmv(cpi, x, bsize, mode_mv, -#if CONFIG_COMPOUND_SINGLEREF - mode_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &rate_mv, single_newmv, args); - if (ret_val != 0) - return ret_val; - else - rd_stats->rate += rate_mv; - } - for (i = 0; i < is_comp_pred + 1; ++i) { - cur_mv[i] = frame_mv[refs[i]]; - // Clip "next_nearest" so that it does not extend to far out of image - if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; - mbmi->mv[i].as_int = cur_mv[i].as_int; + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL, + num_planes); + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; } -#if CONFIG_COMPOUND_SINGLEREF - if (!is_comp_pred && is_singleref_comp_mode) { - cur_mv[1] = frame_comp_mv[refs[0]]; - // Clip "next_nearest" so that it does not extend to far out of image - if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } -#endif // CONFIG_COMPOUND_SINGLEREF + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; - if (this_mode == NEAREST_NEARESTMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdcost = *rd_cost; + int best_skip = x->skip; - for (i = 0; i < 2; ++i) { - clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; - 
mbmi->mv[i].as_int = cur_mv[i].as_int; - } + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + const MvLimits tmp_mv_limits = x->mv_limits; + switch (dir) { + case IBC_MOTION_ABOVE: + x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; + x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; + x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; + x->mv_limits.row_max = + (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; + break; + case IBC_MOTION_LEFT: + x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; + x->mv_limits.col_max = + (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; + // TODO(aconverse@google.com): Minimize the overlap between above and + // left areas. + x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; + int bottom_coded_mi_edge = + AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end); + x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; + break; + default: assert(0); } - } - - if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { -#if CONFIG_COMPOUND_SINGLEREF - if (this_mode == NEAREST_NEWMV || // this_mode == SR_NEAREST_NEWMV || - this_mode == SR_NEAREST_NEARMV) -#else // !CONFIG_COMPOUND_SINGLEREF - if (this_mode == NEAREST_NEWMV) -#endif // CONFIG_COMPOUND_SINGLEREF - { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + assert(x->mv_limits.col_min >= tmp_mv_limits.col_min); + assert(x->mv_limits.col_max <= tmp_mv_limits.col_max); + assert(x->mv_limits.row_min >= tmp_mv_limits.row_min); + assert(x->mv_limits.row_max <= tmp_mv_limits.row_max); + av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv); -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); -#endif - 
clamp_mv2(&cur_mv[0].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; - mbmi->mv[0].as_int = cur_mv[0].as_int; + if (x->mv_limits.col_max < x->mv_limits.col_min || + x->mv_limits.row_max < x->mv_limits.row_min) { + x->mv_limits = tmp_mv_limits; + continue; } - if (this_mode == NEW_NEARESTMV) { - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + int step_param = cpi->mv_step_param; + MV mvp_full = dv_ref.as_mv; + mvp_full.col >>= 3; + mvp_full.row >>= 3; + int sadpb = x->sadperbit16; + int cost_list[5]; + int bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } - } + x->mv_limits = tmp_mv_limits; + if (bestsme == INT_MAX) continue; + mvp_full = x->best_mv.as_mv; + MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 }; + if (mv_check_bounds(&x->mv_limits, &dv)) continue; + if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params.mib_size_log2)) + continue; - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = mbmi->ref_mv_idx + 1; - if (this_mode == NEAR_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - this_mode == SR_NEAR_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF - this_mode == NEAR_NEARMV) { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + // DV should not have sub-pel. 
+ assert((dv.col & 7) == 0); + assert((dv.row & 7) == 0); + memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->use_intrabc = 1; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->mv[0].as_mv = dv; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->skip = 0; + x->skip = 0; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[0].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; - mbmi->mv[0].as_int = cur_mv[0].as_int; + int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX], + (int *)&cpi->dv_cost[1][MV_MAX] }; + // TODO(aconverse@google.com): The full motion field defining discount + // in MV_COST_WEIGHT is too large. Explore other values. 
+ int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost, + dvcost, MV_COST_WEIGHT_SUB); + const int rate_mode = x->intrabc_cost[1]; + RD_STATS rd_stats, rd_stats_uv; + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // Intrabc + select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); + } else { + super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats.skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (num_planes > 1) { + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + av1_merge_rd_stats(&rd_stats, &rd_stats_uv); } +#if CONFIG_RD_DEBUG + mbmi->rd_stats = rd_stats; +#endif - if (this_mode == NEW_NEARMV || -#if CONFIG_COMPOUND_SINGLEREF - this_mode == SR_NEAREST_NEARMV || -#endif // CONFIG_COMPOUND_SINGLEREF - this_mode == NEAR_NEARMV) { -#if CONFIG_COMPOUND_SINGLEREF - if (this_mode == SR_NEAREST_NEARMV) - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - else -#endif // CONFIG_COMPOUND_SINGLEREF - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + const int skip_ctx = av1_get_skip_context(xd); -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; + RD_STATS rdc_noskip; + av1_init_rd_stats(&rdc_noskip); + rdc_noskip.rate = + rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0]; + rdc_noskip.dist = rd_stats.dist; + rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); + if (rdc_noskip.rdcost < best_rd) { + best_rd = rdc_noskip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; 
+ best_rdcost = rdc_noskip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); } - } - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary. - for (i = 0; i < MAX_MB_PLANE; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; - } - for (i = 0; i < MAX_MB_PLANE; i++) { - orig_dst.plane[i] = xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; + if (!xd->lossless[mbmi->segment_id]) { + x->skip = 1; + mbmi->skip = 1; + RD_STATS rdc_skip; + av1_init_rd_stats(&rdc_skip); + rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1]; + rdc_skip.dist = rd_stats.sse; + rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); + if (rdc_skip.rdcost < best_rd) { + best_rd = rdc_skip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_skip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } + } } + *mbmi = best_mbmi; + *rd_cost = best_rdcost; + x->skip = best_skip; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + return best_rd; +} + +void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int y_skip = 0, uv_skip = 0; + int64_t dist_y = 0, dist_uv = 0; + TX_SIZE max_uv_tx_size; + + ctx->skip = 0; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->use_intrabc = 
0; + mbmi->mv[0].as_int = 0; + + const int64_t intra_yrd = + rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd, ctx); + + if (intra_yrd < best_rd) { + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. + memcpy(x->blk_skip, ctx->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, + cpi->optimize_seg_arr[mbmi->segment_id], + mi_row, mi_col); + xd->cfl.store_y = 0; + } + if (num_planes > 1) { + max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + init_sbuv_mode(mbmi); + if (!x->skip_chroma_rd) + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, + &uv_skip, bsize, max_uv_tx_size); + } - // We don't include the cost of the second reference here, because there - // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other - // words if you present them in that order, the second one is always known - // if the first is known. - // - // Under some circumstances we discount the cost of new mv mode to encourage - // initiation of a motion field. - if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, - refs[0])) { - rd_stats->rate += AOMMIN( - cost_mv_ref(x, this_mode, mode_ctx), - cost_mv_ref(x, is_comp_pred ? 
NEAREST_NEARESTMV : NEARESTMV, mode_ctx)); + if (y_skip && (uv_skip || x->skip_chroma_rd)) { + rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + + x->skip_cost[av1_get_skip_context(xd)][1]; + rd_cost->dist = dist_y + dist_uv; + } else { + rd_cost->rate = + rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0]; + rd_cost->dist = dist_y + dist_uv; + } + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); } else { - rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx); + rd_cost->rate = INT_MAX; } - if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && - mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) - return INT64_MAX; + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { + ctx->skip = x->skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + assert(rd_cost->rate != INT_MAX); + } + if (rd_cost->rate == INT_MAX) return; - int64_t ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, - &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) return ret_val; + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; +} -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - best_bmc_mbmi = *mbmi; - rate2_bmc_nocoeff = rd_stats->rate; - if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs; - rate_mv_bmc = rate_mv; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->sb_type; + int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + int *const data = 
x->palette_buffer->kmeans_data_buf; + int centroids[2 * PALETTE_MAX_SIZE]; + uint8_t *const color_map = xd->plane[1].color_index_map; + int r, c; + const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); + const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred || is_singleref_comp_mode) -#else - if (is_comp_pred) -#endif // CONFIG_COMPOUND_SINGLEREF - { - int rate_sum, rs2; - int64_t dist_sum; - int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; - INTERINTER_COMPOUND_DATA best_compound_data; - int_mv best_mv[2]; - int best_tmp_rate_mv = rate_mv; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { pred1 }; - int strides[1] = { bw }; - int tmp_rate_mv; - int masked_compound_used = is_any_masked_compound_used(bsize); -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - masked_compound_used = masked_compound_used && cm->allow_masked_compound; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - COMPOUND_TYPE cur_type; - int best_compmode_interinter_cost = 0; - - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; - memset(&best_compound_data, 0, sizeof(best_compound_data)); -#if CONFIG_COMPOUND_SEGMENT - uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE]; - best_compound_data.seg_mask = tmp_mask_buf; -#endif // CONFIG_COMPOUND_SEGMENT - -#if CONFIG_COMPOUND_SINGLEREF - // TODO(zoeliu): To further check whether the following setups are needed. - // Single ref compound mode: Prepare the 2nd ref frame predictor the same as - // the 1st one. 
- if (!is_comp_pred && is_singleref_comp_mode) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF - - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides); - } - - for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { - if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; - if (!is_interinter_compound_used(cur_type, bsize)) continue; - tmp_rate_mv = rate_mv; - best_rd_cur = INT64_MAX; - mbmi->interinter_compound_type = cur_type; - int masked_type_cost = 0; - if (masked_compound_used) { -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - masked_type_cost += av1_cost_literal(1); - else -#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - masked_type_cost += - x->compound_type_cost[bsize][mbmi->interinter_compound_type]; - } - rs2 = av1_cost_literal(get_interinter_compound_type_bits( - bsize, mbmi->interinter_compound_type)) + - masked_type_cost; - - switch (cur_type) { - case COMPOUND_AVERAGE: - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); - best_rd_compound = best_rd_cur; - break; -#if CONFIG_WEDGE - case COMPOUND_WEDGE: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, - &tmp_rate_mv, 
preds0, preds1, strides, mi_row, mi_col); - } - break; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, - &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); - } - break; -#endif // CONFIG_COMPOUND_SEGMENT - default: assert(0); return 0; - } - - if (best_rd_cur < best_rd_compound) { - best_rd_compound = best_rd_cur; -#if CONFIG_WEDGE - best_compound_data.wedge_index = mbmi->wedge_index; - best_compound_data.wedge_sign = mbmi->wedge_sign; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - best_compound_data.mask_type = mbmi->mask_type; - memcpy(best_compound_data.seg_mask, xd->seg_mask, - 2 * MAX_SB_SQUARE * sizeof(uint8_t)); -#endif // CONFIG_COMPOUND_SEGMENT - best_compound_data.interinter_compound_type = - mbmi->interinter_compound_type; - best_compmode_interinter_cost = rs2; - if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { - best_tmp_rate_mv = tmp_rate_mv; - best_mv[0].as_int = mbmi->mv[0].as_int; - best_mv[1].as_int = mbmi->mv[1].as_int; - } else { - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; - } - } + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (cpi->common.use_highbitdepth) { + data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; + } else { + data[(r * cols + c) * 2] = src_u[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c]; } - // reset to original mvs for next iteration - mbmi->mv[0].as_int = cur_mv[0].as_int; - mbmi->mv[1].as_int = cur_mv[1].as_int; } -#if CONFIG_WEDGE - mbmi->wedge_index = best_compound_data.wedge_index; - mbmi->wedge_sign = best_compound_data.wedge_sign; -#endif // CONFIG_WEDGE -#if 
CONFIG_COMPOUND_SEGMENT - mbmi->mask_type = best_compound_data.mask_type; - memcpy(xd->seg_mask, best_compound_data.seg_mask, - 2 * MAX_SB_SQUARE * sizeof(uint8_t)); -#endif // CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type = - best_compound_data.interinter_compound_type; - if (have_newmv_in_inter_mode(this_mode)) { - mbmi->mv[0].as_int = best_mv[0].as_int; - mbmi->mv[1].as_int = best_mv[1].as_int; - xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_compound_type)) { - rd_stats->rate += best_tmp_rate_mv - rate_mv; - rate_mv = best_tmp_rate_mv; - } + } + + for (r = 1; r < 3; ++r) { + for (c = 0; c < pmi->palette_size[1]; ++c) { + centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c]; } + } + + av1_calc_indices(data, centroids, color_map, rows * cols, + pmi->palette_size[1], 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); +} + +static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int mi_row, + int mi_col, const uint8_t *above, + int above_stride, const uint8_t *left, + int left_stride); - if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; - } +static const int ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + +static void rd_pick_skip_mode(RD_STATS *rd_cost, + InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; - pred_exists = 0; + x->compound_idx = 1; // 
COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); - compmode_interinter_cost = best_compmode_interinter_cost; + if (cm->ref_frame_idx_0 == INVALID_IDX || + cm->ref_frame_idx_1 == INVALID_IDX) { + return; } -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_INTERINTRA - if (is_comp_interintra_pred) { - INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t best_interintra_rd = INT64_MAX; - int rmode, rate_sum; - int64_t dist_sum; - int j; - int tmp_rate_mv = 0; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]); - uint8_t *intrapred; + const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + cm->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + cm->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const int mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - intrapred = CONVERT_TO_BYTEPTR(intrapred_); - else -#endif // CONFIG_HIGHBITDEPTH - intrapred = intrapred_; + if (mode_index == -1) { + return; + } - mbmi->ref_frame[1] = NONE_FRAME; - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; - xd->plane[j].dst.stride = bw; - } - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize); - restore_dst_buf(xd, orig_dst); - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - model_rd_for_sb(cpi, bsize, x, 
xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = mbmi->interintra_mode; - } - } - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); - best_interintra_rd = rd; + assert(this_mode == NEAREST_NEARESTMV); + if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) { + return; + } - if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) { - // Don't need to call restore_dst_buf here - return INT64_MAX; - } -#if CONFIG_WEDGE - if (is_interintra_wedge_used(bsize)) { - int64_t best_interintra_rd_nowedge = INT64_MAX; - int64_t best_interintra_rd_wedge = INT64_MAX; - int_mv tmp_mv; - int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum); - best_interintra_rd_nowedge = best_interintra_rd; - - // Disable wedge search if source variance is small - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { - mbmi->use_wedge_interintra = 1; - - rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + - av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1); - - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); - - best_interintra_rd_wedge += - RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); - // Refine motion vector. 
- if (have_newmv_in_inter_mode(this_mode)) { - // get negative of mask - const uint8_t *mask = av1_get_contiguous_soft_mask( - mbmi->interintra_wedge_index, 1, bsize); - tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int; - compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, - mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0); - mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - if (rd >= best_interintra_rd_wedge) { - tmp_mv.as_int = cur_mv[0].as_int; - tmp_rate_mv = rate_mv; - } - } else { - tmp_mv.as_int = cur_mv[0].as_int; - tmp_rate_mv = rate_mv; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - // Evaluate closer to true rd - av1_subtract_plane(x, bsize, 0); - rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - best_interintra_rd_wedge = rd; - if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { - mbmi->use_wedge_interintra = 1; - mbmi->mv[0].as_int = tmp_mv.as_int; - rd_stats->rate += tmp_rate_mv - rate_mv; - rate_mv = tmp_rate_mv; - } else { - mbmi->use_wedge_interintra = 0; - mbmi->mv[0].as_int = cur_mv[0].as_int; - } - } else { - mbmi->use_wedge_interintra = 0; - } - } -#endif // CONFIG_WEDGE + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + mbmi->comp_group_idx = 0; + mbmi->compound_idx = x->compound_idx; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = 0; + mbmi->skip_mode = mbmi->skip = 1; - pred_exists = 0; - compmode_interintra_cost = - 
av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) + - interintra_mode_cost[mbmi->interintra_mode]; - if (is_interintra_wedge_used(bsize)) { - compmode_interintra_cost += av1_cost_bit( - cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra); - if (mbmi->use_wedge_interintra) { - compmode_interintra_cost += - av1_cost_literal(get_interintra_wedge_bits(bsize)); - } - } - } else if (is_interintra_allowed(mbmi)) { - compmode_interintra_cost = - av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0); + set_default_interp_filters(mbmi, cm->interp_filter); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } -#endif // CONFIG_INTERINTRA - if (pred_exists == 0) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + BUFFER_SET orig_dst; + for (int i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; } - if (!is_comp_pred) - args->single_filter[this_mode][refs[0]] = - av1_extract_interp_filter(mbmi->interp_filters, 0); + // Obtain the rdcost for skip_mode. + skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst); + + // Compare the use of skip_mode with the best intra/inter mode obtained. + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + const int64_t best_intra_inter_mode_cost = + (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) + ? 
RDCOST(x->rdmult, + rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0], + rd_cost->dist) + : INT64_MAX; + + if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) { + assert(mode_index != -1); + search_state->best_mbmode.skip_mode = 1; + search_state->best_mbmode = *mbmi; + + search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1; + search_state->best_mbmode.mode = NEAREST_NEARESTMV; + search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0]; + search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1]; + search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int; + search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int; + search_state->best_mbmode.ref_mv_idx = 0; + + // Set up tx_size related variables for skip-specific loop filtering. + search_state->best_mbmode.tx_size = + block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode) + : max_txsize_rect_lookup[bsize]; + memset(search_state->best_mbmode.inter_tx_size, + search_state->best_mbmode.tx_size, + sizeof(search_state->best_mbmode.inter_tx_size)); + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h, + search_state->best_mbmode.skip && is_inter_block(mbmi), xd); + + // Set up color-related variables for skip mode. 
+ search_state->best_mbmode.uv_mode = UV_DC_PRED; + search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; + search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; + + search_state->best_mbmode.comp_group_idx = 0; + search_state->best_mbmode.compound_idx = x->compound_idx; + search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; + search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; + + search_state->best_mbmode.interintra_mode = + (INTERINTRA_MODE)(II_DC_PRED - 1); + search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; + + set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter); + + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = (skip_mode_rd_stats.sse == 0); - if (args->modelled_rd != NULL) { - if (is_comp_pred) { - const int mode0 = compound_ref0_mode(this_mode); - const int mode1 = compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; - } - } else if (!is_comp_interintra_pred) { - args->modelled_rd[this_mode][refs[0]] = rd; - } + x->skip = 1; } +} - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; +// speed feature: fast intra/inter transform type search +// Used for speed >= 2 +// When this speed feature is on, in rd mode search, only DCT is used. 
+// After the mode is determined, this function is called, to select +// transform types and get accurate rdcost. +static void sf_refine_fast_tx_type_search( + const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int best_mode_index, MB_MODE_INFO *best_mbmode, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y, + int best_rate_uv, int *best_skip2) { + const AV1_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + + if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 && + ((sf->tx_type_search.fast_inter_tx_type_search == 1 && + is_inter_mode(best_mbmode->mode)) || + (sf->tx_type_search.fast_intra_tx_type_search == 1 && + !is_inter_mode(best_mbmode->mode)))) { + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + + *mbmi = *best_mbmode; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. 
+ for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } - } -#if CONFIG_INTERINTRA - rd_stats->rate += compmode_interintra_cost; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rate2_bmc_nocoeff += compmode_interintra_cost; -#endif -#endif -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - rd_stats->rate += compmode_interinter_cost; -#endif + if (is_inter_mode(mbmi->mode)) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - ret_val = motion_mode_rd( - cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv, - mi_row, mi_col, args, ref_best_rd, refs, rate_mv, -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); - if (ret_val != 0) return ret_val; + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // av1_rd_pick_inter_mode_sb + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats_y.skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (num_planes > 1) { + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + } else { + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + if (num_planes > 1) { + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + } + + if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, + 
(rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = x->skip_cost[av1_get_skip_context(xd)][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += x->skip_cost[av1_get_skip_context(xd)][0]; + } - return 0; // The rate-distortion cost will be re-calculated by caller. + if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist))) { + best_mbmode->tx_size = mbmi->tx_size; + av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size); + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy(best_mbmode->txk_type, mbmi->txk_type); + rd_cost->rate += + (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + *best_skip2 = skip_blk; + } + } } -#if CONFIG_INTRABC -static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, BLOCK_SIZE bsize, - int64_t best_rd) { +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. 
+static void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], + uint32_t mode_skip_mask[REF_FRAMES], + unsigned int ref_costs_single[REF_FRAMES], + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *const cm = &cpi->common; - if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX; - + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - const TileInfo *tile = &xd->tile; - MODE_INFO *const mi = xd->mi[0]; - const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); - const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); - const int w = block_size_wide[bsize]; - const int h = block_size_high[bsize]; - const int sb_row = mi_row / MAX_MIB_SIZE; - const int sb_col = mi_col / MAX_MIB_SIZE; - + MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; - int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; - av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], - mbmi_ext->ref_mv_stack[ref_frame], - mbmi_ext->compound_mode_context, candidates, mi_row, mi_col, - NULL, NULL, mbmi_ext->mode_context); - - int_mv nearestmv, nearmv; - av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv); + const struct segmentation *const seg = &cm->seg; + const SPEED_FEATURES *const sf = &cpi->sf; + unsigned char segment_id = mbmi->segment_id; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int_mv dv_ref = nearestmv.as_int == 0 ? 
nearmv : nearestmv; - if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col); - mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref; + for (int i = 0; i < MB_MODE_COUNT; ++i) + for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE; - struct buf_2d yv12_mb[MAX_MB_PLANE]; - av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL); - for (int i = 0; i < MAX_MB_PLANE; ++i) { - xd->plane[i].pre[0] = yv12_mb[i]; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args->above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = x->above_pred_buf; + args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE; + args->left_pred_buf[0] = x->left_pred_buf; + args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE; } - enum IntrabcMotionDirection { - IBC_MOTION_ABOVE, - IBC_MOTION_LEFT, - IBC_MOTION_DIRECTIONS - }; + av1_collect_neighbors_ref_counts(xd); - MB_MODE_INFO *mbmi = &mi->mbmi; - MB_MODE_INFO best_mbmi = *mbmi; - RD_STATS best_rdcost = *rd_cost; - int best_skip = x->skip; + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); - for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; - dir < IBC_MOTION_DIRECTIONS; ++dir) { - const MvLimits tmp_mv_limits = x->mv_limits; - switch (dir) { - case IBC_MOTION_ABOVE: - x->mv_limits.col_min = (tile->mi_col_start - 
mi_col) * MI_SIZE; - x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; - x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; - x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h; - break; - case IBC_MOTION_LEFT: - x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; - x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w; - // TODO(aconverse@google.com): Minimize the overlap between above and - // left areas. - x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; - int bottom_coded_mi_edge = - AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end); - x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; - break; - default: assert(0); + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + x->mbmi_ext->mode_context[ref_frame] = 0; + x->mbmi_ext->compound_mode_context[ref_frame] = 0; + if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { + assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + yv12_mb); } - assert(x->mv_limits.col_min >= tmp_mv_limits.col_min); - assert(x->mv_limits.col_max <= tmp_mv_limits.col_max); - assert(x->mv_limits.row_min >= tmp_mv_limits.row_min); - assert(x->mv_limits.row_max <= tmp_mv_limits.row_max); - av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv); + } - if (x->mv_limits.col_max < x->mv_limits.col_min || - x->mv_limits.row_max < x->mv_limits.row_min) { - x->mv_limits = tmp_mv_limits; - continue; + // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector + // references for compound prediction, as not every pair of reference frames + // woud be examined for the RD evaluation. 
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + x->mbmi_ext->mode_context[ref_frame] = 0; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + } + + av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); + + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, + args->above_pred_buf, dst_width1, + dst_height1, args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, + args->left_pred_buf, dst_width2, + dst_height2, args->left_pred_stride); + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, mi_row, mi_col, args->above_pred_buf[0], + args->above_pred_stride[0], args->left_pred_buf[0], + args->left_pred_stride[0]); + } + + int min_pred_mv_sad = INT_MAX; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + + for (int i = 0; i < 2; ++i) { + ref_frame_skip_mask[i] = 0; + } + memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask)); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) { + // Skip checking missing references in both single and compound reference + // modes. Note that a mode will be skipped iff both reference frames + // are masked out. + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. 
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; } + } - int step_param = cpi->mv_step_param; - MV mvp_full = dv_ref.as_mv; - mvp_full.col >>= 3; - mvp_full.row >>= 3; - int sadpb = x->sadperbit16; - int cost_list[5]; -#if CONFIG_HASH_ME - int bestsme = av1_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, - (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); -#else - int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &dv_ref.as_mv, INT_MAX, 1); -#endif + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) | + (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | + (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME); + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + // TODO(zoeliu): To further explore whether following needs to be done for + // BWDREF_FRAME as well. 
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext); - x->mv_limits = tmp_mv_limits; - if (bestsme == INT_MAX) continue; - mvp_full = x->best_mv.as_mv; - MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 }; - if (mv_check_bounds(&x->mv_limits, &dv)) continue; - if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue; + if (near_mv.as_int != global_mv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } - memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); - mbmi->use_intrabc = 1; - mbmi->mode = DC_PRED; - mbmi->uv_mode = UV_DC_PRED; - mbmi->mv[0].as_mv = dv; - mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); - mbmi->skip = 0; - x->skip = 0; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->alt_ref_search_fp) { + assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]); + mode_skip_mask[ALTREF_FRAME] = 0; + ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + } + } - assert(x->mvcost == x->mv_cost_stack[0]); - // TODO(aconverse@google.com): The full motion field defining discount - // in MV_COST_WEIGHT is too large. Explore other values. 
- int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT_SUB); - const int rate_mode = x->intrabc_cost[1]; - RD_STATS rd_stats, rd_stats_uv; - av1_subtract_plane(x, bsize, 0); - super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - av1_merge_rd_stats(&rd_stats, &rd_stats_uv); -#if CONFIG_RD_DEBUG - mbmi->rd_stats = rd_stats; -#endif + if (sf->alt_ref_search_fp) + if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) + if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) + mode_skip_mask[ALTREF_FRAME] |= INTER_ALL; -#if CONFIG_VAR_TX - // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_high_log2[0]; - int idx, idy; - for (idy = 0; idy < height; ++idy) - for (idx = 0; idx < width; ++idx) - mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX + if (sf->adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) + mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; + } - const aom_prob skip_prob = av1_get_skip_prob(cm, xd); + if (bsize > sf->max_intra_bsize) { + ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); + ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); + } - RD_STATS rdc_noskip; - av1_init_rd_stats(&rdc_noskip); - rdc_noskip.rate = - rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); - rdc_noskip.dist = rd_stats.dist; - rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); - if (rdc_noskip.rdcost < best_rd) { - best_rd = rdc_noskip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_noskip; - } + mode_skip_mask[INTRA_FRAME] |= + 
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); - x->skip = 1; - mbmi->skip = 1; - RD_STATS rdc_skip; - av1_init_rd_stats(&rdc_skip); - rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); - rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); - if (rdc_skip.rdcost < best_rd) { - best_rd = rdc_skip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_skip; - } + if (cpi->sf.tx_type_search.fast_intra_tx_type_search) + x->use_default_intra_tx_type = 1; + else + x->use_default_intra_tx_type = 0; + + if (cpi->sf.tx_type_search.fast_inter_tx_type_search) + x->use_default_inter_tx_type = 1; + else + x->use_default_inter_tx_type = 0; + if (cpi->sf.skip_repeat_interpolation_filter_search) { + x->interp_filter_stats_idx[0] = 0; + x->interp_filter_stats_idx[1] = 0; } - *mbmi = best_mbmi; - *rd_cost = best_rdcost; - x->skip = best_skip; - return best_rd; } -#endif // CONFIG_INTRABC -void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd) { +static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, + PALETTE_MODE_INFO *const pmi, + unsigned int *ref_costs_single, + InterModeSearchState *search_state) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblockd_plane *const pd = xd->plane; - int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; - int y_skip = 0, uv_skip = 0; - int64_t dist_y = 0, dist_uv = 0; - TX_SIZE max_uv_tx_size; - const int unify_bsize = CONFIG_CB4X4; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd, + best_model_rd_palette = INT64_MAX; + int skippable = 0, rate_overhead_palette = 0; + 
RD_STATS rd_stats_y; + TX_SIZE uv_tx = TX_4X4; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; - ctx->skip = 0; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_INTRABC - mbmi->use_intrabc = 0; - mbmi->mv[0].as_int = 0; -#endif // CONFIG_INTRABC -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif - - const int64_t intra_yrd = - (bsize >= BLOCK_8X8 || unify_bsize) - ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, - &y_skip, bsize, best_rd) - : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, best_rd); - - if (intra_yrd < best_rd) { -#if CONFIG_CFL -#if CONFIG_CB4X4 - // Only store reconstructed luma when there's chroma RDO. When there's no - // chroma RDO, the reconstructed luma will be stored in encode_superblock(). 
- xd->cfl->store_y = !x->skip_chroma_rd; -#else - xd->cfl->store_y = 1; -#endif // CONFIG_CB4X4 - if (xd->cfl->store_y) { - // Perform one extra call to txfm_rd_in_plane(), with the values chosen - // during luma RDO, so we can store reconstructed luma values - RD_STATS this_rd_stats; - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, - mbmi->sb_type, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - xd->cfl->store_y = 0; - } -#endif // CONFIG_CFL - max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x] - [pd[1].subsampling_y]; - init_sbuv_mode(mbmi); -#if CONFIG_CB4X4 - if (!x->skip_chroma_rd) - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, - &uv_skip, bsize, max_uv_tx_size); -#else - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, - &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size); -#endif // CONFIG_CB4X4 - - if (y_skip && (uv_skip || x->skip_chroma_rd)) { - rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - rd_cost->dist = dist_y + dist_uv; - } else { - rd_cost->rate = - rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - rd_cost->dist = dist_y + dist_uv; + rate_overhead_palette = rd_pick_palette_intra_sby( + cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, + best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL, + NULL, NULL, NULL, ctx, best_blk_skip); + if (pmi->palette_size[0] == 0) return; + + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd); + if (rd_stats_y.rate == INT_MAX) return; + + skippable = rd_stats_y.skip; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + rate_overhead_palette; + rate2 += ref_costs_single[INTRA_FRAME]; + if 
(num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], + &search_state->rate_uv_tokenonly[uv_tx], + &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx], + &search_state->mode_uv[uv_tx]); + search_state->pmi_uv[uv_tx] = *pmi; + search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + } + mbmi->uv_mode = search_state->mode_uv[uv_tx]; + pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); } - rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); - } else { - rd_cost->rate = INT_MAX; + mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; + skippable = skippable && search_state->skip_uvs[uv_tx]; + distortion2 += search_state->dist_uvs[uv_tx]; + rate2 += search_state->rate_uv_intra[uv_tx]; } -#if CONFIG_INTRABC - if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) - best_rd = rd_cost->rdcost; - if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { - ctx->skip = x->skip; // FIXME where is the proper place to set this?! 
- assert(rd_cost->rate != INT_MAX); - rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx]; + rate2 += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + rate2 += x->skip_cost[av1_get_skip_context(xd)][0]; + } + this_rd = RDCOST(x->rdmult, rate2, distortion2); + if (this_rd < search_state->best_rd) { + search_state->best_mode_index = 3; + mbmi->mv[0].as_int = 0; + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + search_state->best_rd = this_rd; + search_state->best_mbmode = *mbmi; + search_state->best_skip2 = 0; + search_state->best_mode_skippable = skippable; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } -#endif - if (rd_cost->rate == INT_MAX) return; - - ctx->mic = *xd->mi[0]; - ctx->mbmi_ext = *x->mbmi_ext; } -// Do we have an internal image edge (e.g. formatting bars). -int av1_internal_image_edge(const AV1_COMP *cpi) { - return (cpi->oxcf.pass == 2) && - ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || - (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); -} +static void init_inter_mode_search_state(InterModeSearchState *search_state, + const AV1_COMP *cpi, + const TileDataEnc *tile_data, + const MACROBLOCK *x, BLOCK_SIZE bsize, + int64_t best_rd_so_far) { + search_state->best_rd = best_rd_so_far; -// Checks to see if a super block is on a horizontal image edge. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. -int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { - int top_edge = 0; - int bottom_edge = cpi->common.mi_rows; - int is_active_h_edge = 0; + av1_zero(search_state->best_mbmode); - // For two pass account for any formatting bars detected. 
- if (cpi->oxcf.pass == 2) { - const TWO_PASS *const twopass = &cpi->twopass; + search_state->best_rate_y = INT_MAX; - // The inactive region is specified in MBs not mi units. - // The image edge is in the following MB row. - top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + search_state->best_rate_uv = INT_MAX; - bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); - bottom_edge = AOMMAX(top_edge, bottom_edge); - } + search_state->best_mode_skippable = 0; - if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || - ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { - is_active_h_edge = 1; - } - return is_active_h_edge; -} + search_state->best_skip2 = 0; -// Checks to see if a super block is on a vertical image edge. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. -int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { - int left_edge = 0; - int right_edge = cpi->common.mi_cols; - int is_active_v_edge = 0; + search_state->best_mode_index = -1; - // For two pass account for any formatting bars detected. - if (cpi->oxcf.pass == 2) { - const TWO_PASS *const twopass = &cpi->twopass; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; - // The inactive region is specified in MBs not mi units. - // The image edge is in the following MB row. 
- left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + search_state->skip_intra_modes = 0; - right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); - right_edge = AOMMAX(left_edge, right_edge); - } + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); - if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || - ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { - is_active_v_edge = 1; - } - return is_active_v_edge; -} + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * tile_data->thresh_freq_fact[bsize][i]) >> 5; -// Checks to see if a super block is at the edge of the active image. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. 
-int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { - return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) || - av1_active_v_edge(cpi, mi_col, cpi->common.mib_size); -} + search_state->best_intra_mode = DC_PRED; + search_state->best_intra_rd = INT64_MAX; -static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - const BLOCK_SIZE bsize = mbmi->sb_type; - assert(bsize >= BLOCK_8X8); - int src_stride = x->plane[1].src.stride; - const uint8_t *const src_u = x->plane[1].src.buf; - const uint8_t *const src_v = x->plane[2].src.buf; - float *const data = x->palette_buffer->kmeans_data_buf; - float centroids[2 * PALETTE_MAX_SIZE]; - uint8_t *const color_map = xd->plane[1].color_index_map; - int r, c; -#if CONFIG_HIGHBITDEPTH - const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); - const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); -#endif // CONFIG_HIGHBITDEPTH - int plane_block_width, plane_block_height, rows, cols; - av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, - &plane_block_height, &rows, &cols); - (void)cpi; + search_state->angle_stats_ready = 0; - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) { -#if CONFIG_HIGHBITDEPTH - if (cpi->common.use_highbitdepth) { - data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; - data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; - } else { -#endif // CONFIG_HIGHBITDEPTH - data[(r * cols + c) * 2] = src_u[r * src_stride + c]; - data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } + search_state->best_pred_sse = UINT_MAX; - for (r = 1; r < 3; ++r) { - for (c = 0; c < pmi->palette_size[1]; ++c) { - centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c]; - } - } + for (int i = 0; i < TX_SIZES_ALL; i++) + 
search_state->rate_uv_intra[i] = INT_MAX; - av1_calc_indices(data, centroids, color_map, rows * cols, - pmi->palette_size[1], 2); - extend_palette_color_map(color_map, cols, rows, plane_block_width, - plane_block_height); -} + av1_zero(search_state->pmi_uv); -#if CONFIG_FILTER_INTRA -static void pick_filter_intra_interframe( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv, - int *skip_uv, UV_PREDICTION_MODE *mode_uv, - FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, -#if CONFIG_EXT_INTRA - int8_t *uv_angle_delta, -#endif // CONFIG_EXT_INTRA - PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask, - unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd, - PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2, - int *best_mode_skippable, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - const int try_palette = - av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); - int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i; - int dc_mode_index; - const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; - int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd; - int64_t distortion_uv, model_rd = INT64_MAX; - TX_SIZE uv_tx; + for (int i = 0; i < REFERENCE_MODES; ++i) + search_state->best_pred_rd[i] = INT64_MAX; - for (i = 0; i < MAX_MODES; ++i) - if (av1_mode_order[i].mode == DC_PRED && - av1_mode_order[i].ref_frame[0] == INTRA_FRAME) - break; - dc_mode_index = i; - assert(i < MAX_MODES); + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + 
av1_zero(search_state->single_newmv_valid); + for (int i = 0; i < MB_MODE_COUNT; ++i) + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + search_state->modelled_rd[i][ref_frame] = INT64_MAX; +} - // TODO(huisu): use skip_mask for further speedup. - (void)skip_mask; - mbmi->mode = DC_PRED; - mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->ref_frame[1] = NONE_FRAME; - if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y, - &skippable, bsize, intra_mode_cost[mbmi->mode], - &this_rd, &model_rd, 0)) { - return; +static int inter_mode_search_order_independent_skip( + const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, + int mi_row, int mi_col, uint32_t *mode_skip_mask, + uint16_t *ref_frame_skip_mask) { + const SPEED_FEATURES *const sf = &cpi->sf; + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; + const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + + if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && + !x->cb_partition_scan) { + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + int found = 0; + // Search in the stats table to see if the ref frames have been used in the + // first pass of partition search. 
+ for (int row = mi_row; row < mi_row + mi_width && !found; + row += FIRST_PARTITION_PASS_SAMPLE_REGION) { + for (int col = mi_col; col < mi_col + mi_height && !found; + col += FIRST_PARTITION_PASS_SAMPLE_REGION) { + const int index = av1_first_partition_pass_stats_index(row, col); + const FIRST_PARTITION_PASS_STATS *const stats = + &x->first_partition_pass_stats[index]; + if (stats->ref0_counts[ref_frame[0]] && + (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) { + found = 1; + break; + } + } + } + if (!found) return 1; } - if (rate_y == INT_MAX) return; - - uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] - [xd->plane[1].subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], - &skip_uv[uv_tx], &mode_uv[uv_tx]); - if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi; - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA - } - - rate_uv = rate_uv_tokenonly[uv_tx]; - distortion_uv = dist_uv[uv_tx]; - skippable = skippable && skip_uv[uv_tx]; - mbmi->uv_mode = mode_uv[uv_tx]; - if (cm->allow_screen_content_tools) { - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } - - rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - 
x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; - if (try_palette && mbmi->mode == DC_PRED) - rate2 += av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); - if (!xd->lossless[mbmi->segment_id]) { - // super_block_yrd above includes the cost of the tx_size in the - // tokenonly rate, but for intra blocks, tx_size is always coded - // (prediction granularity), so we account for it in the full rate, - // not the tokenonly rate. - rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - - rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0], - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); -#if CONFIG_EXT_INTRA - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && - av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } -#endif // CONFIG_EXT_INTRA - if (mbmi->mode == DC_PRED) { - rate2 += - av1_cost_bit(cpi->common.fc->filter_intra_probs[1], - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) - rate2 += - write_uniform_cost(FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[1]); - } - distortion2 = distortion_y + distortion_uv; - av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row, - mi_col); + if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) { + // Mode must by compatible + if (!is_interintra_allowed_mode(this_mode)) return 1; + if (!is_interintra_allowed_bsize(bsize)) return 1; + } - rate2 += ref_costs_single[INTRA_FRAME]; + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME) + return 1; - if (skippable) { - rate2 -= (rate_y + rate_uv); - rate_y = 0; - rate_uv = 0; - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + if (ref_frame[0] == INTRA_FRAME) { + if (this_mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } } else { - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1; + } + + const int comp_pred = ref_frame[1] > INTRA_FRAME; + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) return 1; + + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1; + + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1; + } + + if (sf->selective_ref_frame) { + if (sf->selective_ref_frame >= 2 || x->cb_partition_scan) { + if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[ALTREF2_FRAME - LAST_FRAME], + cm->frame_offset) < 0) + return 1; + if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[BWDREF_FRAME - LAST_FRAME], + cm->frame_offset) < 0) + return 1; + } + if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[LAST3_FRAME - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0) + return 1; + if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[LAST2_FRAME - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0) + return 1; + } + + // One-sided compound is used only when all reference frames are one-sided. 
+ if (sf->selective_ref_frame && comp_pred && !cpi->all_one_sided_refs) { + unsigned int ref_offsets[2]; + for (int i = 0; i < 2; ++i) { + const int buf_idx = cm->frame_refs[ref_frame[i] - LAST_FRAME].idx; + assert(buf_idx >= 0); + ref_offsets[i] = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + } + if ((get_relative_dist(cm, ref_offsets[0], cm->frame_offset) <= 0 && + get_relative_dist(cm, ref_offsets[1], cm->frame_offset) <= 0) || + (get_relative_dist(cm, ref_offsets[0], cm->frame_offset) > 0 && + get_relative_dist(cm, ref_offsets[1], cm->frame_offset) > 0)) + return 1; + } + + if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) { + return 1; } - this_rd = RDCOST(x->rdmult, rate2, distortion2); - if (this_rd < *best_intra_rd) { - *best_intra_rd = this_rd; - *best_intra_mode = mbmi->mode; + if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) && + (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) { + return 1; } - for (i = 0; i < REFERENCE_MODES; ++i) - best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); - if (this_rd < *best_rd) { - *best_mode_index = dc_mode_index; - mbmi->mv[0].as_int = 0; - rd_cost->rate = rate2; -#if CONFIG_SUPERTX - if (x->skip) - *returnrate_nocoef = rate2; - else - *returnrate_nocoef = rate2 - rate_y - rate_uv; - *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable); - *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), - mbmi->ref_frame[0] != INTRA_FRAME); -#endif // CONFIG_SUPERTX - rd_cost->dist = distortion2; - rd_cost->rdcost = this_rd; - *best_rd = this_rd; - *best_mbmode = *mbmi; - *best_skip2 = 0; - *best_mode_skippable = skippable; + if (skip_repeated_mv(cm, x, this_mode, ref_frame)) { + return 1; } + return 0; } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_MOTION_VAR -static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, int mi_row, - int mi_col, const uint8_t *above, - int above_stride, const uint8_t *left, - int left_stride); 
-#endif // CONFIG_MOTION_VAR +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + mbmi->ref_mv_idx = 0; + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0]; + mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->interp_filter); +} -void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, - MACROBLOCK *x, int mi_row, int mi_col, - RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd_so_far) { - const AV1_COMMON *const cm = &cpi->common; - const RD_OPT *const rd_opt = &cpi->rd; +static int handle_intra_mode(InterModeSearchState *search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv) { + const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int try_palette = av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const struct segmentation *const seg = &cm->seg; - PREDICTION_MODE this_mode; - 
MV_REFERENCE_FRAME ref_frame, second_ref_frame; - unsigned char segment_id = mbmi->segment_id; - int comp_pred, i, k; - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#if CONFIG_COMPOUND_SINGLEREF - int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_COMPOUND_SINGLEREF - struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; - int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; - int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; - int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; - static const int flag_list[TOTAL_REFS_PER_FRAME] = { - 0, - AOM_LAST_FLAG, -#if CONFIG_EXT_REFS - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, -#endif // CONFIG_EXT_REFS - AOM_GOLD_FLAG, -#if CONFIG_EXT_REFS - AOM_BWD_FLAG, - AOM_ALT2_FLAG, -#endif // CONFIG_EXT_REFS - AOM_ALT_FLAG - }; - int64_t best_rd = best_rd_so_far; - int best_rate_y = INT_MAX, best_rate_uv = INT_MAX; - int64_t best_pred_diff[REFERENCE_MODES]; - int64_t best_pred_rd[REFERENCE_MODES]; - MB_MODE_INFO best_mbmode; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - int best_mode_skippable = 0; - int midx, best_mode_index = -1; - unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_COMP_REFS - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; -#else - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_COMP_REFS - aom_prob comp_mode_p; - int64_t best_intra_rd = INT64_MAX; - unsigned int best_pred_sse = UINT_MAX; - PREDICTION_MODE best_intra_mode = DC_PRED; - int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL]; - int64_t dist_uvs[TX_SIZES_ALL]; - int skip_uvs[TX_SIZES_ALL]; - UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; - PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; -#if CONFIG_EXT_INTRA - int8_t uv_angle_delta[TX_SIZES_ALL]; - int is_directional_mode, angle_stats_ready = 0; - uint8_t directional_mode_skip_mask[INTRA_MODES]; -#endif // 
CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - int8_t dc_skipped = 1; - FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL]; -#endif // CONFIG_FILTER_INTRA + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; const int intra_cost_penalty = av1_get_intra_cost_penalty( cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); - const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; - int best_skip2 = 0; - uint16_t ref_frame_skip_mask[2] = { 0 }; - uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; -#if CONFIG_INTERINTRA - MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; - int64_t best_single_inter_rd = INT64_MAX; -#endif // CONFIG_INTERINTRA - int mode_skip_start = sf->mode_skip_start + 1; - const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; - int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; - const int mode_search_skip_flags = sf->mode_search_skip_flags; -#if CONFIG_PVQ - od_rollback_buffer pre_buf; -#endif // CONFIG_PVQ - - HandleInterModeArgs args = { -#if CONFIG_MOTION_VAR - { NULL }, - { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, - { NULL }, - { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, -#endif // CONFIG_MOTION_VAR - NULL, - NULL, - NULL, - { { 0 } }, - }; - const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; - int palette_ctx = 0; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; -#if CONFIG_MOTION_VAR - int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - 
args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); - args.above_pred_buf[1] = - CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); - args.above_pred_buf[2] = - CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len); - args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); - args.left_pred_buf[1] = - CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); - args.left_pred_buf[2] = - CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len); + const int num_planes = av1_num_planes(cm); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + av1_init_rd_stats(rd_stats_uv); + TX_SIZE uv_tx; + int is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && av1_use_angle_delta(bsize)) { + int rate_dummy; + int64_t model_rd = INT64_MAX; + if (!search_state->angle_stats_ready) { + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + highbd_angle_estimation(src, src_stride, rows, cols, bsize, + search_state->directional_mode_skip_mask); + else + angle_estimation(src, src_stride, rows, cols, bsize, + search_state->directional_mode_skip_mask); + search_state->angle_stats_ready = 1; + } + if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0; + rd_stats_y->rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, + intra_mode_cost[mbmi->mode], search_state->best_rd, + &model_rd); } else { -#endif // CONFIG_HIGHBITDEPTH - args.above_pred_buf[0] = x->above_pred_buf; - args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE; - args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE; - args.left_pred_buf[0] = x->left_pred_buf; - args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE; - args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_MOTION_VAR + 
mbmi->angle_delta[PLANE_TYPE_Y] = 0; + super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd); + } + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + + if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + int64_t best_rd_tmp = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + best_rd_tmp = RDCOST(x->rdmult, + rd_stats_y->rate + x->filter_intra_cost[bsize][0] + + intra_mode_cost[mbmi->mode], + rd_stats_y->dist); + } + + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; + fi_mode < FILTER_INTRA_MODES; ++fi_mode) { + int64_t this_rd_tmp; + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + + super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd); + if (rd_stats_y_fi.rate == INT_MAX) { + continue; + } + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, + intra_mode_cost[mbmi->mode]); + this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); - av1_zero(best_mbmode); + if (this_rd_tmp < best_rd_tmp) { + best_tx_size = mbmi->tx_size; + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_tmp = this_rd_tmp; + } + } - av1_zero(pmi_uv); - if (try_palette) { - if (above_mi) - palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - if (left_mi) - palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); + 
mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } } - estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, - &comp_mode_p); + if (rd_stats_y->rate == INT_MAX) return 0; - for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; - for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX; - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; - for (i = 0; i < MB_MODE_COUNT; ++i) { - for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) { - args.single_filter[i][k] = SWITCHABLE; + if (num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], + &search_state->rate_uv_tokenonly[uv_tx], + &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx], + &search_state->mode_uv[uv_tx]); + if (try_palette) search_state->pmi_uv[uv_tx] = *pmi; + search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; } - } - - rd_cost->rate = INT_MAX; -#if CONFIG_SUPERTX - *returnrate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - x->pred_mv_sad[ref_frame] = INT_MAX; - x->mbmi_ext->mode_context[ref_frame] = 0; - x->mbmi_ext->compound_mode_context[ref_frame] = 0; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { - assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); - setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); - } - frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; -#if 
CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][ref_frame].as_int = - gm_get_motion_vector(&cm->global_motion[ref_frame], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else // CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][ref_frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; -#if CONFIG_COMPOUND_SINGLEREF - frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; - frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_GLOBAL_MOTION - frame_mv[ZERO_ZEROMV][ref_frame].as_int = - gm_get_motion_vector(&cm->global_motion[ref_frame], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else // CONFIG_GLOBAL_MOTION - frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION + rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx]; + rd_stats_uv->dist = search_state->dist_uvs[uv_tx]; + rd_stats_uv->skip = search_state->skip_uvs[uv_tx]; + rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip; + mbmi->uv_mode = search_state->mode_uv[uv_tx]; + if (try_palette) { + pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; } - for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { - MODE_INFO *const mi = xd->mi[0]; - int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; - x->mbmi_ext->mode_context[ref_frame] = 0; - av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], - mbmi_ext->ref_mv_stack[ref_frame], - mbmi_ext->compound_mode_context, candidates, mi_row, - mi_col, NULL, NULL, mbmi_ext->mode_context); 
- if (mbmi_ext->ref_mv_count[ref_frame] < 2) { - MV_REFERENCE_FRAME rf[2]; - av1_set_ref_frame(rf, ref_frame); - if (mbmi_ext->ref_mvs[rf[0]][0].as_int != - frame_mv[ZEROMV][rf[0]].as_int || - mbmi_ext->ref_mvs[rf[0]][1].as_int != - frame_mv[ZEROMV][rf[0]].as_int || - mbmi_ext->ref_mvs[rf[1]][0].as_int != - frame_mv[ZEROMV][rf[1]].as_int || - mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int) - mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET); - } - } - -#if CONFIG_MOTION_VAR - av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); - - if (check_num_overlappable_neighbors(mbmi) && - is_motion_variation_allowed_bsize(bsize)) { - av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, - args.above_pred_buf, dst_width1, - dst_height1, args.above_pred_stride); - av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, - args.left_pred_buf, dst_width2, - dst_height2, args.left_pred_stride); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0], - args.above_pred_stride[0], args.left_pred_buf[0], - args.left_pred_stride[0]); + rd_stats->rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size); } -#endif // CONFIG_MOTION_VAR - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { - // Skip checking missing references in both single and compound reference - // modes. 
Note that a mode will be skipped iff both reference frames - // are masked out. - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - } else { - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - // Skip fixed mv modes for poor references - if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { - mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; - break; - } - } - } - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. - if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - } + if (num_planes > 1 && !x->skip_chroma_rd) { + const int uv_mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode]; + rd_stats->rate += + rd_stats_uv->rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); } + if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED) + rd_stats->rate += intra_cost_penalty; + rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist; - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. 
- if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - int_mv zeromv; - ref_frame_skip_mask[0] = (1 << LAST_FRAME) | -#if CONFIG_EXT_REFS - (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | -#endif // CONFIG_EXT_REFS - (1 << GOLDEN_FRAME); - ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; - // TODO(zoeliu): To further explore whether following needs to be done for - // BWDREF_FRAME as well. - mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; -#if CONFIG_GLOBAL_MOTION - zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - zeromv.as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); - if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); - if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); - if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); -#if CONFIG_COMPOUND_SINGLEREF - if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int || - frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != - zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV); -#endif // CONFIG_COMPOUND_SINGLEREF - } + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. 
+ rd_stats->rate += ref_frame_cost; + if (rd_stats->skip) { + // Back out the coefficient coding costs + rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate); + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + // Cost the skip mb case + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + // Add in the cost of the no skip flag. + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0]; } + // Calculate the final RD estimate for this mode. + int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (cpi->rc.is_src_frame_alt_ref) { - if (sf->alt_ref_search_fp) { - assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); - mode_skip_mask[ALTREF_FRAME] = 0; - ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); - ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; - } + // Keep record of best intra rd + if (this_rd < search_state->best_intra_rd) { + search_state->best_intra_rd = this_rd; + search_state->best_intra_mode = mbmi->mode; } - if (sf->alt_ref_search_fp) - if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) - if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) - mode_skip_mask[ALTREF_FRAME] |= INTER_ALL; - - if (sf->adaptive_mode_search) { - if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && - cpi->rc.frames_since_golden >= 3) - if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) - mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; + if (sf->skip_intra_in_interframe) { + if (search_state->best_rd < (INT64_MAX / 2) && + this_rd > (search_state->best_rd + (search_state->best_rd >> 1))) + search_state->skip_intra_modes = 1; } - if (bsize > sf->max_intra_bsize) { - ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); - ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); + if (!disable_skip) { + for (int i = 0; i < REFERENCE_MODES; ++i) + search_state->best_pred_rd[i] = + AOMMIN(search_state->best_pred_rd[i], this_rd); } + return 1; +} - mode_skip_mask[INTRA_FRAME] |= - 
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); +void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int try_palette = + av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct segmentation *const seg = &cm->seg; + PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mbmi->segment_id; + int i, k; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + int *mode_map = tile_data->mode_map[bsize]; + uint32_t mode_skip_mask[REF_FRAMES]; + uint16_t ref_frame_skip_mask[2]; - for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; - for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) - mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, + best_rd_so_far); - midx = sf->schedule_mode_search ? 
mode_skip_start : 0; - while (midx > 4) { - uint8_t end_pos = 0; - for (i = 5; i < midx; ++i) { - if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { - uint8_t tmp = mode_map[i]; - mode_map[i] = mode_map[i - 1]; - mode_map[i - 1] = tmp; - end_pos = i; - } - } - midx = end_pos; - } + HandleInterModeArgs args = { + { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, + NULL, NULL, + NULL, NULL, + { { 0 } }, INT_MAX, + INT_MAX + }; + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; - if (cpi->sf.tx_type_search.fast_intra_tx_type_search) - x->use_default_intra_tx_type = 1; - else - x->use_default_intra_tx_type = 0; + av1_invalid_rd_stats(rd_cost); - if (cpi->sf.tx_type_search.fast_inter_tx_type_search) - x->use_default_inter_tx_type = 1; - else - x->use_default_inter_tx_type = 0; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - for (i = 0; i < MB_MODE_COUNT; ++i) - for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) - modelled_rd[i][ref_frame] = INT64_MAX; - - for (midx = 0; midx < MAX_MODES; ++midx) { - int mode_index; - int mode_excluded = 0; + // init params, set frame modes, speed features + set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, + ref_frame_skip_mask, mode_skip_mask, + ref_costs_single, ref_costs_comp, yv12_mb); + +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t best_est_rd = INT64_MAX; +#endif + + for (int midx = 0; midx < MAX_MODES; ++midx) { + int mode_index = mode_map[midx]; int64_t this_rd = INT64_MAX; int disable_skip = 0; - int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - int64_t total_sse = INT64_MAX; uint8_t ref_frame_type; -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - mode_index = 
mode_map[midx]; + this_mode = av1_mode_order[mode_index].mode; ref_frame = av1_mode_order[mode_index].ref_frame[0]; second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; - mbmi->ref_mv_idx = 0; - - if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) { - // Mode must by compatible - if (!is_interintra_allowed_mode(this_mode)) continue; - if (!is_interintra_allowed_bsize(bsize)) continue; - } - - if (is_inter_compound_mode(this_mode)) { - frame_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; - frame_mv[this_mode][second_ref_frame].as_int = - frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int; -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(this_mode)) { - frame_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; - frame_comp_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int; -#endif // CONFIG_COMPOUND_SINGLEREF - } - - // Look at the reference frame of the best mode so far and set the - // skip mask to look at a subset of the remaining modes. 
- if (midx == mode_skip_start && best_mode_index >= 0) { - switch (best_mbmode.ref_frame[0]) { - case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case LAST2_FRAME: - ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; - case LAST3_FRAME: - ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#endif // CONFIG_EXT_REFS - case GOLDEN_FRAME: - ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case BWDREF_FRAME: - ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; - case ALTREF2_FRAME: - ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#endif // CONFIG_EXT_REFS - case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK; -#if CONFIG_EXT_REFS - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; -#endif // CONFIG_EXT_REFS - break; - case NONE_FRAME: - case TOTAL_REFS_PER_FRAME: - assert(0 && "Invalid Reference frame"); - break; - } - } - if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && - (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) - continue; + init_mbmi(mbmi, mode_index, cm); -#if CONFIG_EXT_COMP_REFS -// TODO(zoeliu): Following toggle between #if 0/1 and the bug will manifest -// itself. 
-#if 0 - if (!(cpi->ref_frame_flags & flag_list[ref_frame]) || - (second_ref_frame > INTRA_FRAME && - (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))) - printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, " - "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row, - mi_col, ref_frame, second_ref_frame); - - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (second_ref_frame > INTRA_FRAME && - (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))) + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index, + mi_row, mi_col, mode_skip_mask, + ref_frame_skip_mask)) continue; -#endif // 0 -#if !USE_UNI_COMP_REFS - // NOTE(zoeliu): Temporarily disable uni-directional comp refs - if (second_ref_frame > INTRA_FRAME) { - if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))) + if (ref_frame == INTRA_FRAME) { + if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) continue; } - assert(second_ref_frame <= INTRA_FRAME || - ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))); -#endif // !USE_UNI_COMP_REFS -#endif // CONFIG_EXT_COMP_REFS - - if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; - - // Test best rd so far against threshold for trying this mode. - if (best_mode_skippable && sf->schedule_mode_search) - mode_threshold[mode_index] <<= 1; - if (best_rd < mode_threshold[mode_index]) continue; - - // This is only used in motion vector unit test. 
- if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; - -#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS // Changes LL bitstream -#if CONFIG_EXT_REFS - if (cpi->oxcf.pass == 0) { - // Complexity-compression trade-offs - // if (ref_frame == ALTREF_FRAME) continue; - // if (ref_frame == BWDREF_FRAME) continue; - if (second_ref_frame == ALTREF_FRAME) continue; - // if (second_ref_frame == BWDREF_FRAME) continue; + if (sf->drop_ref) { + if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) { + if (search_state.num_available_refs > 2) { + if ((ref_frame == search_state.dist_order_refs[0] && + second_ref_frame == search_state.dist_order_refs[1]) || + (ref_frame == search_state.dist_order_refs[1] && + second_ref_frame == search_state.dist_order_refs[0])) + continue; + } + } } -#endif // CONFIG_EXT_REFS -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - comp_pred = second_ref_frame > INTRA_FRAME; - if (comp_pred) { - if (!cpi->allow_comp_inter_inter) continue; - // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (search_state.best_rd < search_state.mode_threshold[mode_index]) + continue; - // Do not allow compound prediction if the segment level reference frame - // feature is in use as in this case there can only be one reference. - if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + const int comp_pred = second_ref_frame > INTRA_FRAME; + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->reference_mode == REFERENCE_MODE_SELECT ? 
compmode_cost : 0; - if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && - best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME) + if (comp_pred) { + if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] == INTRA_FRAME) continue; - - mode_excluded = cm->reference_mode == SINGLE_REFERENCE; - } else { - if (ref_frame != INTRA_FRAME) - mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; } if (ref_frame == INTRA_FRAME) { if (sf->adaptive_mode_search) - if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > + search_state.best_pred_sse) continue; if (this_mode != DC_PRED) { - // Disable intra modes other than DC_PRED for blocks with low variance - // Threshold for intra skipping based on source variance - // TODO(debargha): Specialize the threshold for super block sizes - const unsigned int skip_intra_var_thresh = 64; - if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - x->source_variance < skip_intra_var_thresh) - continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes - if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME) + if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] > INTRA_FRAME) continue; } - if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(this_mode, best_intra_mode)) continue; + if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, search_state.best_intra_mode)) + continue; } } -#if CONFIG_GLOBAL_MOTION - } else if 
(cm->global_motion[ref_frame].wmtype == IDENTITY && - (!comp_pred || - cm->global_motion[second_ref_frame].wmtype == IDENTITY)) { -#else // CONFIG_GLOBAL_MOTION - } else { -#endif // CONFIG_GLOBAL_MOTION - const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; - if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context, - mbmi_ext->compound_mode_context, frame_mv, - this_mode, ref_frames, bsize, -1, mi_row, mi_col)) - continue; } - mbmi->mode = this_mode; - mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; - pmi->palette_size[0] = 0; - pmi->palette_size[1] = 0; -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - - set_default_interp_filters(mbmi, cm->interp_filter); - - mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; - mbmi->motion_mode = SIMPLE_TRANSLATION; - - x->skip = 0; - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - // Select prediction reference frames. 
- for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_INTERINTRA - mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); -#endif // CONFIG_INTERINTRA - if (ref_frame == INTRA_FRAME) { - RD_STATS rd_stats_y; - TX_SIZE uv_tx; - struct macroblockd_plane *const pd = &xd->plane[1]; -#if CONFIG_EXT_INTRA - is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); - if (is_directional_mode && av1_use_angle_delta(bsize)) { - int rate_dummy; - int64_t model_rd = INT64_MAX; - if (!angle_stats_ready) { - const int src_stride = x->plane[0].src.stride; - const uint8_t *src = x->plane[0].src.buf; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_angle_estimation(src, src_stride, rows, cols, bsize, - directional_mode_skip_mask); - else -#endif // CONFIG_HIGHBITDEPTH - angle_estimation(src, src_stride, rows, cols, bsize, - directional_mode_skip_mask); - angle_stats_ready = 1; - } - if (directional_mode_skip_mask[mbmi->mode]) continue; - rd_stats_y.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize, - intra_mode_cost[mbmi->mode], best_rd, - &model_rd); - } else { - mbmi->angle_delta[0] = 0; - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); - } -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); -#endif // CONFIG_EXT_INTRA - rate_y = rd_stats_y.rate; - distortion_y = rd_stats_y.dist; - skippable = rd_stats_y.skip; - - if (rate_y == INT_MAX) continue; - -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) dc_skipped = 0; -#endif // CONFIG_FILTER_INTRA - - 
uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x] - [pd->subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], - &skip_uvs[uv_tx], &mode_uv[uv_tx]); - if (try_palette) pmi_uv[uv_tx] = *pmi; - -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#endif // CONFIG_FILTER_INTRA - } - - rate_uv = rate_uv_tokenonly[uv_tx]; - distortion_uv = dist_uvs[uv_tx]; - skippable = skippable && skip_uvs[uv_tx]; - mbmi->uv_mode = mode_uv[uv_tx]; - if (try_palette) { - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } - -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_CB4X4 - rate2 = rate_y + intra_mode_cost[mbmi->mode]; - if (!x->skip_chroma_rd) - rate2 += rate_uv + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; -#else - rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; -#endif // CONFIG_CB4X4 - - if (try_palette && mbmi->mode == DC_PRED) { - rate2 += av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); - } - - if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { - // super_block_yrd above includes the cost of the tx_size in the - 
// tokenonly rate, but for intra blocks, tx_size is always coded - // (prediction granularity), so we account for it in the full rate, - // not the tokenonly rate. - rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } -#if CONFIG_EXT_INTRA - if (is_directional_mode) { -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; -#endif // CONFIG_INTRA_INTERP - if (av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } - } - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && - av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) { - rate2 += - av1_cost_bit(cm->fc->filter_intra_probs[0], - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[0]); - } + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + const int ret = handle_intra_mode( + &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip, + &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); + if (!ret) { + continue; } - if (mbmi->uv_mode == UV_DC_PRED) { - rate2 += - av1_cost_bit(cpi->common.fc->filter_intra_probs[1], - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[1]); - } -#endif // CONFIG_FILTER_INTRA - if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) 
- rate2 += intra_cost_penalty; - distortion2 = distortion_y + distortion_uv; + rate2 = intra_rd_stats.rate; + distortion2 = intra_rd_stats.dist; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + skippable = intra_rd_stats.skip; + rate_y = intra_rd_stats_y.rate; } else { - int_mv backup_ref_mv[2]; - - if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME) - continue; - - backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0]; - if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0]; -#if CONFIG_INTERINTRA - if (second_ref_frame == INTRA_FRAME) { - if (best_single_inter_ref != ref_frame) continue; - mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode]; -// TODO(debargha|geza.lore): -// Should we use ext_intra modes for interintra? -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; - mbmi->angle_delta[1] = 0; -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - } -#endif // CONFIG_INTERINTRA + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->ref_mv_idx = 0; ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - - if (comp_pred) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = 0; - // Special case: NEAR_NEWMV and NEW_NEARMV modes use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) - ref_mv_idx = 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - 
} - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - // TODO(zoeliu): To further investigate which ref_mv_idx should be - // chosen for the mode of SR_NEAR_NEWMV. - int ref_mv_idx = 0; - // Special case: SR_NEAR_NEWMV mode use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } - } -#endif // CONFIG_COMPOUND_SINGLEREF - } else { - if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref; - for (ref = 0; ref < 1 + comp_pred; ++ref) { - int_mv this_mv = - (ref == 0) ? 
mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv - : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; - } - } - } + int64_t ref_best_rd = search_state.best_rd; { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations - args.single_newmv = single_newmv; - args.single_newmv_rate = single_newmv_rate; - args.modelled_rd = modelled_rd; + args.single_newmv = search_state.single_newmv[0]; + args.single_newmv_rate = search_state.single_newmv_rate[0]; + args.single_newmv_valid = search_state.single_newmv_valid[0]; + args.modelled_rd = search_state.modelled_rd; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, + &rd_stats_uv, &disable_skip, mi_row, mi_col, + &args, ref_best_rd, &best_est_rd); +#else this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, - &rd_stats_uv, &disable_skip, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &args, best_rd); + &rd_stats_uv, &disable_skip, mi_row, mi_col, + &args, ref_best_rd); +#endif + if (this_rd < ref_best_rd) { + ref_best_rd = this_rd; + } rate2 = rd_stats.rate; skippable = rd_stats.skip; distortion2 = rd_stats.dist; - total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; } -// TODO(jingning): This needs some refactoring to improve code quality -// and reduce redundant steps. 
-#if CONFIG_COMPOUND_SINGLEREF - if ((have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV || - mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) -#else // !CONFIG_COMPOUND_SINGLEREF + // TODO(jingning): This needs some refactoring to improve code quality + // and reduce redundant steps. if ((have_nearmv_in_inter_mode(mbmi->mode) && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) -#endif // CONFIG_COMPOUND_SINGLEREF - { - int_mv backup_mv = frame_mv[NEARMV][ref_frame]; + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { MB_MODE_INFO backup_mbmi = *mbmi; int backup_skip = x->skip; int64_t tmp_ref_rd = this_rd; @@ -11290,40 +10243,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // TODO(jingning): This should be deprecated shortly. int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; int ref_set = - AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); - - uint8_t drl_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset); - // Dummy - int_mv backup_fmv[2]; - backup_fmv[0] = frame_mv[NEWMV][ref_frame]; - if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame]; - - rate2 += (rate2 < INT_MAX ? 
x->drl_mode_cost0[drl_ctx][0] : 0); - - if (this_rd < INT64_MAX) { - if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, 0, total_sse)) - tmp_ref_rd = RDCOST( - x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - distortion2); - else - tmp_ref_rd = - RDCOST(x->rdmult, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - rate_y - rate_uv, - total_sse); - } -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip_drl[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + AOMMIN(MAX_REF_MV_SERCH - 1, + mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); + memcpy(x->blk_skip_drl, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) { int64_t tmp_alt_rd = INT64_MAX; int dummy_disable_skip = 0; - int ref; int_mv cur_mv; RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; @@ -11333,80 +10260,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->ref_mv_idx = 1 + ref_idx; - if (comp_pred) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: NEAR_NEWMV and NEW_NEARMV modes use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) - ref_mv_idx = 1 + mbmi->ref_mv_idx; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } - - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - 
mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: SR_NEAR_NEWMV mode use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; - - // TODO(zoeliu): For the mode of SR_NEAREST_NEWMV, as it only runs - // the "if", not the "else if", - // mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the - // value for "NEWMV", instead of "NEARESTMV". - if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV || - compound_ref1_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } -#endif // CONFIG_COMPOUND_SINGLEREF - } else { - for (ref = 0; ref < 1 + comp_pred; ++ref) { - int_mv this_mv = - (ref == 0) - ? 
mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] - .this_mv - : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] - .comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; + if (cpi->sf.reduce_inter_modes) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + if (mbmi_ext + ->ref_mv_stack[ref_frame_type] + [mbmi->ref_mv_idx + idx_offset] + .weight < REF_CAT_LEVEL) { + *mbmi = backup_mbmi; + x->skip = backup_skip; + continue; + } } } @@ -11416,69 +10282,31 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, clamp_mv2(&cur_mv.as_mv, xd); if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) { - int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; - int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; - - frame_mv[NEARMV][ref_frame] = cur_mv; av1_init_rd_stats(&tmp_rd_stats); - // Point to variables that are not maintained between iterations - args.single_newmv = dummy_single_newmv; - args.single_newmv_rate = dummy_single_newmv_rate; args.modelled_rd = NULL; - tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, - &tmp_rd_stats_y, &tmp_rd_stats_uv, - &dummy_disable_skip, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &args, best_rd); - // Prevent pointers from escaping local scope - args.single_newmv = NULL; - args.single_newmv_rate = NULL; - } - - for (i = 0; i < mbmi->ref_mv_idx; ++i) { - uint8_t drl1_ctx = 0; - drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], - i + idx_offset); - tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? 
x->drl_mode_cost0[drl1_ctx][1] - : 0); - } - - if (mbmi_ext->ref_mv_count[ref_frame_type] > - mbmi->ref_mv_idx + idx_offset + 1 && - ref_idx < ref_set - 1) { - uint8_t drl1_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], - mbmi->ref_mv_idx + idx_offset); - tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][0] - : 0); - } - - if (tmp_alt_rd < INT64_MAX) { -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx]; + args.single_newmv_rate = + search_state.single_newmv_rate[mbmi->ref_mv_idx]; + args.single_newmv_valid = + search_state.single_newmv_valid[mbmi->ref_mv_idx]; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS tmp_alt_rd = - RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist); + handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, + &tmp_rd_stats_uv, &dummy_disable_skip, mi_row, + mi_col, &args, ref_best_rd, &best_est_rd); #else - if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, - tmp_rd_stats.dist) < - RDCOST(x->rdmult, 0, tmp_rd_stats.sse)) - tmp_alt_rd = - RDCOST(x->rdmult, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - tmp_rd_stats.dist); - else - tmp_alt_rd = - RDCOST(x->rdmult, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, - tmp_rd_stats.sse); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + tmp_alt_rd = handle_inter_mode( + cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, + &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd); +#endif + + // Prevent pointers from escaping local scope + args.single_newmv = search_state.single_newmv[0]; + args.single_newmv_rate = search_state.single_newmv_rate[0]; + args.single_newmv_valid = search_state.single_newmv_valid[0]; } if (tmp_ref_rd > tmp_alt_rd) { @@ -11488,192 +10316,61 @@ void 
av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, skippable = tmp_rd_stats.skip; rate_y = tmp_rd_stats_y.rate; rate_uv = tmp_rd_stats_uv.rate; - total_sse = tmp_rd_stats.sse; this_rd = tmp_alt_rd; tmp_ref_rd = tmp_alt_rd; backup_mbmi = *mbmi; backup_skip = x->skip; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip_drl[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + memcpy(x->blk_skip_drl, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } else { *mbmi = backup_mbmi; x->skip = backup_skip; } } - frame_mv[NEARMV][ref_frame] = backup_mv; - frame_mv[NEWMV][ref_frame] = backup_fmv[0]; - if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1]; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip[i], x->blk_skip_drl[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + memcpy(x->blk_skip, x->blk_skip_drl, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } - mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0]; - if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1]; - if (this_rd == INT64_MAX) continue; - if (is_comp_ref_allowed(mbmi->sb_type)) - compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); - - if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; - } - - // Estimate the reference frame signaling cost and add it - // to the rolling cost variable. - if (comp_pred) { -#if CONFIG_EXT_COMP_REFS - rate2 += ref_costs_comp[ref_frame][second_ref_frame]; -#else // !CONFIG_EXT_COMP_REFS - rate2 += ref_costs_comp[ref_frame]; -#if CONFIG_EXT_REFS - rate2 += ref_costs_comp[second_ref_frame]; -#endif // CONFIG_EXT_REFS -#endif // CONFIG_EXT_COMP_REFS - } else { - rate2 += ref_costs_single[ref_frame]; - } - -#if CONFIG_COMPOUND_SINGLEREF - // Add the cost to signal single/comp mode in single ref. 
- if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) { - aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd); - rate2 += av1_cost_bit(singleref_comp_mode_p, - is_inter_singleref_comp_mode(mbmi->mode)); - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (ref_frame == INTRA_FRAME) -#else - if (!disable_skip) -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - { - if (skippable) { - // Back out the coefficient coding costs - rate2 -= (rate_y + rate_uv); - rate_y = 0; - rate_uv = 0; - // Cost the skip mb case - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { - if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) < - RDCOST(x->rdmult, rate_skip1, total_sse)) { - // Add in the cost of the no skip flag. - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } else { - // FIXME(rbultje) make this work for splitmv also - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - distortion2 = total_sse; - assert(total_sse >= 0); - rate2 -= (rate_y + rate_uv); - this_skip2 = 1; - rate_y = 0; - rate_uv = 0; - } - } else { - // Add in the cost of the no skip flag. - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } - - // Calculate the final RD estimate for this mode. 
- this_rd = RDCOST(x->rdmult, rate2, distortion2); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - } else { this_skip2 = mbmi->skip; this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_skip2) { rate_y = 0; rate_uv = 0; } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - } - - if (ref_frame == INTRA_FRAME) { - // Keep record of best intra rd - if (this_rd < best_intra_rd) { - best_intra_rd = this_rd; - best_intra_mode = mbmi->mode; - } -#if CONFIG_INTERINTRA - } else if (second_ref_frame == NONE_FRAME) { - if (this_rd < best_single_inter_rd) { - best_single_inter_rd = this_rd; - best_single_inter_ref = mbmi->ref_frame[0]; - } -#endif // CONFIG_INTERINTRA - } - - if (!disable_skip && ref_frame == INTRA_FRAME) { - for (i = 0; i < REFERENCE_MODES; ++i) - best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); } // Did this mode help.. i.e. is it the new best mode - if (this_rd < best_rd || x->skip) { + if (this_rd < search_state.best_rd || x->skip) { + int mode_excluded = 0; + if (comp_pred) { + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + } if (!mode_excluded) { // Note index of best mode so far - best_mode_index = mode_index; + search_state.best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; } else { - best_pred_sse = x->pred_sse[ref_frame]; + search_state.best_pred_sse = x->pred_sse[ref_frame]; } rd_cost->rate = rate2; -#if CONFIG_SUPERTX - if (x->skip) - *returnrate_nocoef = rate2; - else - *returnrate_nocoef = rate2 - rate_y - rate_uv; - *returnrate_nocoef -= av1_cost_bit( - av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2); - *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), - mbmi->ref_frame[0] != INTRA_FRAME); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_WARPED_MOTION - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); -#endif -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - MODE_INFO *const mi = 
xd->mi[0]; - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); - if (motion_allowed == WARPED_CAUSAL) - *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; - else if (motion_allowed == OBMC_CAUSAL) - *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode]; -#else - *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#endif // CONFIG_SUPERTX rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; - best_rd = this_rd; - best_mbmode = *mbmi; - best_skip2 = this_skip2; - best_mode_skippable = skippable; - best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), - this_skip2 || skippable); - best_rate_uv = rate_uv; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + search_state.best_rd = this_rd; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = this_skip2; + search_state.best_mode_skippable = skippable; + search_state.best_rate_y = + rate_y + + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; + search_state.best_rate_uv = rate_uv; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } @@ -11693,458 +10390,136 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2); if (!comp_pred) { - if (single_rd < best_pred_rd[SINGLE_REFERENCE]) - best_pred_rd[SINGLE_REFERENCE] = single_rd; + if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE]) + search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd; } else { - if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) - best_pred_rd[COMPOUND_REFERENCE] = single_rd; + if (single_rd < 
search_state.best_pred_rd[COMPOUND_REFERENCE]) + search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd; } - if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) - best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT]) + search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } - if (x->skip && !comp_pred) break; - } - - if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 && - ((sf->tx_type_search.fast_inter_tx_type_search == 1 && - is_inter_mode(best_mbmode.mode)) || - (sf->tx_type_search.fast_intra_tx_type_search == 1 && - !is_inter_mode(best_mbmode.mode)))) { - int skip_blk = 0; - RD_STATS rd_stats_y, rd_stats_uv; - - x->use_default_inter_tx_type = 0; - x->use_default_intra_tx_type = 0; - - *mbmi = best_mbmode; - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - - // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; - if (has_second_ref(mbmi)) - xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; - } + if (sf->drop_ref) { + if (second_ref_frame == NONE_FRAME) { + const int idx = ref_frame - LAST_FRAME; + if (idx && distortion2 > search_state.dist_refs[idx]) { + search_state.dist_refs[idx] = distortion2; + search_state.dist_order_refs[idx] = ref_frame; + } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF + // Reach the last single ref prediction mode + if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { + // bubble sort dist_refs and the order index + for (i = 0; i < REF_FRAMES; ++i) { + for (k = i + 1; k < REF_FRAMES; ++k) { + if (search_state.dist_refs[i] < search_state.dist_refs[k]) { + int64_t tmp_dist = 
search_state.dist_refs[i]; + search_state.dist_refs[i] = search_state.dist_refs[k]; + search_state.dist_refs[k] = tmp_dist; + + int tmp_idx = search_state.dist_order_refs[i]; + search_state.dist_order_refs[i] = + search_state.dist_order_refs[k]; + search_state.dist_order_refs[k] = tmp_idx; + } + } + } - if (is_inter_mode(mbmi->mode)) { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { - av1_build_obmc_inter_prediction( - cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride, - args.left_pred_buf, args.left_pred_stride); - } -#endif // CONFIG_MOTION_VAR - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - assert(rd_stats_y.rate != INT_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + for (i = 0; i < REF_FRAMES; ++i) { + if (search_state.dist_refs[i] == -1) break; + search_state.num_available_refs = i; + } + search_state.num_available_refs++; + } } - - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif // CONFIG_VAR_TX - } else { - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - } - - if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { - skip_blk = 1; - rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - rd_stats_uv.rate = 0; - rd_stats_y.dist = 
rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - } else { - skip_blk = 0; - rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > - RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, - (rd_stats_y.dist + rd_stats_uv.dist))) { -#if CONFIG_VAR_TX - int idx, idy; -#endif // CONFIG_VAR_TX - best_mbmode.tx_type = mbmi->tx_type; - best_mbmode.tx_size = mbmi->tx_size; -#if CONFIG_LGT_FROM_PRED - best_mbmode.use_lgt = mbmi->use_lgt; -#endif -#if CONFIG_VAR_TX - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; - - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - - best_mbmode.min_tx_size = mbmi->min_tx_size; -#endif // CONFIG_VAR_TX - rd_cost->rate += - (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); - rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; - rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); - best_skip2 = skip_blk; - } + if (x->skip && !comp_pred) break; } - // Only try palette mode when the best mode so far is an intra mode. 
- if (try_palette && !is_inter_mode(best_mbmode.mode)) { - int rate2 = 0; -#if CONFIG_SUPERTX - int best_rate_nocoef; -#endif // CONFIG_SUPERTX - int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, - best_model_rd_palette = INT64_MAX; - int skippable = 0, rate_overhead_palette = 0; - RD_STATS rd_stats_y; - TX_SIZE uv_tx; - uint8_t *const best_palette_color_map = - x->palette_buffer->best_palette_color_map; - uint8_t *const color_map = xd->plane[0].color_index_map; - MB_MODE_INFO best_mbmi_palette = best_mbmode; - - mbmi->mode = DC_PRED; - mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->ref_frame[1] = NONE_FRAME; - rate_overhead_palette = rd_pick_palette_intra_sby( - cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED], - &best_mbmi_palette, best_palette_color_map, &best_rd_palette, - &best_model_rd_palette, NULL, NULL, NULL, NULL); - if (pmi->palette_size[0] == 0) goto PALETTE_EXIT; - memcpy(color_map, best_palette_color_map, - rows * cols * sizeof(best_palette_color_map[0])); - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); - if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT; - uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] - [xd->plane[1].subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], - &skip_uvs[uv_tx], &mode_uv[uv_tx]); - pmi_uv[uv_tx] = *pmi; -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#endif // CONFIG_FILTER_INTRA - } - mbmi->uv_mode = mode_uv[uv_tx]; - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - if (pmi->palette_size[1] > 0) { - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } -#if CONFIG_EXT_INTRA - 
mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } -#endif // CONFIG_FILTER_INTRA - skippable = rd_stats_y.skip && skip_uvs[uv_tx]; - distortion2 = rd_stats_y.dist + dist_uvs[uv_tx]; - rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx]; - rate2 += ref_costs_single[INTRA_FRAME]; - - if (skippable) { - rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); -#if CONFIG_SUPERTX - best_rate_nocoef = rate2; -#endif // CONFIG_SUPERTX - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - } else { -#if CONFIG_SUPERTX - best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); -#endif // CONFIG_SUPERTX - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } - this_rd = RDCOST(x->rdmult, rate2, distortion2); - if (this_rd < best_rd) { - best_mode_index = 3; - mbmi->mv[0].as_int = 0; - rd_cost->rate = rate2; -#if CONFIG_SUPERTX - *returnrate_nocoef = best_rate_nocoef; -#endif // CONFIG_SUPERTX - rd_cost->dist = distortion2; - rd_cost->rdcost = this_rd; - best_rd = this_rd; - best_mbmode = *mbmi; - best_skip2 = 0; - best_mode_skippable = skippable; - } - } -PALETTE_EXIT: - -#if CONFIG_FILTER_INTRA - // TODO(huisu): filter-intra is turned off in lossless mode for now to - // avoid a unit test failure - if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 && - !dc_skipped && best_mode_index >= 0 && - best_intra_rd < (best_rd + (best_rd >> 3))) { - pick_filter_intra_interframe( - cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly, - dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv, -#if CONFIG_EXT_INTRA - uv_angle_delta, -#endif // CONFIG_EXT_INTRA - pmi_uv, 
palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd, - &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable, -#if CONFIG_SUPERTX - returnrate_nocoef, -#endif // CONFIG_SUPERTX - best_pred_rd, &best_mbmode, rd_cost); - } -#endif // CONFIG_FILTER_INTRA - -// The inter modes' rate costs are not calculated precisely in some cases. -// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and -// ZEROMV. Here, checks are added for those cases, and the mode decisions -// are corrected. -#if CONFIG_COMPOUND_SINGLEREF -// NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref -// are surely different from each other. -#endif // CONFIG_COMPOUND_SINGLEREF - if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) { - const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], - best_mbmode.ref_frame[1] }; - int comp_pred_mode = refs[1] > INTRA_FRAME; - int_mv zeromv[2]; - const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame); -#if CONFIG_GLOBAL_MOTION - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; - zeromv[1].as_int = - comp_pred_mode - ? gm_get_motion_vector(&cm->global_motion[refs[1]], - cm->allow_high_precision_mv, bsize, mi_col, - mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int - : 0; -#else - zeromv[0].as_int = 0; - zeromv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (!comp_pred_mode) { - int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) - ? 
AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) - : INT_MAX; - - for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { - int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; - if (cur_mv.as_int == best_mbmode.mv[0].as_int) { - best_mbmode.mode = NEARMV; - best_mbmode.ref_mv_idx = i; - } - } - - if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int) - best_mbmode.mode = NEARESTMV; - else if (best_mbmode.mv[0].as_int == zeromv[0].as_int) - best_mbmode.mode = ZEROMV; - } else { - int_mv nearestmv[2]; - int_mv nearmv[2]; - - if (mbmi_ext->ref_mv_count[rf_type] > 1) { - nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv; - nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv; - } else { - nearmv[0] = frame_mv[NEARMV][refs[0]]; - nearmv[1] = frame_mv[NEARMV][refs[1]]; - } - if (mbmi_ext->ref_mv_count[rf_type] >= 1) { - nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv; - nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv; - } else { - nearestmv[0] = frame_mv[NEARESTMV][refs[0]]; - nearestmv[1] = frame_mv[NEARESTMV][refs[1]]; - } + // In effect only when speed >= 2. + sf_refine_fast_tx_type_search( + cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index, + &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, + search_state.best_rate_uv, &search_state.best_skip2); - if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAREST_NEARESTMV; - } else { - int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) - ? 
AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) - : INT_MAX; - - for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { - nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; - nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; - - // Try switching to the NEAR_NEARMV mode - if (nearmv[0].as_int == best_mbmode.mv[0].as_int && - nearmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAR_NEARMV; - best_mbmode.ref_mv_idx = i; - } - } + // Only try palette mode when the best mode so far is an intra mode. + if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) { + search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi, + ref_costs_single, &search_state); + } - if (best_mbmode.mode == NEW_NEWMV && - best_mbmode.mv[0].as_int == zeromv[0].as_int && - best_mbmode.mv[1].as_int == zeromv[1].as_int) - best_mbmode.mode = ZERO_ZEROMV; - } - } + search_state.best_mbmode.skip_mode = 0; + if (cm->skip_mode_flag && + !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + is_comp_ref_allowed(bsize)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col, + yv12_mb); } // Make sure that the ref_mv_idx is only nonzero when we're // using a mode which can support ref_mv_idx - if (best_mbmode.ref_mv_idx != 0 && -#if CONFIG_COMPOUND_SINGLEREF - !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV || - best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) -#else // !CONFIG_COMPOUND_SINGLEREF - !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) -#endif // CONFIG_COMPOUND_SINGLEREF - { - best_mbmode.ref_mv_idx = 0; - } - - if (best_mbmode.ref_frame[0] > INTRA_FRAME && - best_mbmode.ref_frame[1] <= INTRA_FRAME) { - int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame); - int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type]; - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - int_mv zeromv; -#if 
CONFIG_GLOBAL_MOTION - const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0]; - zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - zeromv.as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (best_mbmode.mv[0].as_int == zeromv.as_int) { - best_mbmode.mode = ZEROMV; - } - } + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; } - if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + if (search_state.best_mode_index < 0 || + search_state.best_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; } - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == - av1_extract_interp_filter(best_mbmode.interp_filters, 0)) || - !is_inter_block(&best_mbmode)); -#if CONFIG_DUAL_FILTER - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == - av1_extract_interp_filter(best_mbmode.interp_filters, 1)) || - !is_inter_block(&best_mbmode)); -#endif // CONFIG_DUAL_FILTER + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) || + !is_inter_block(&search_state.best_mbmode)); + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) || + !is_inter_block(&search_state.best_mbmode)); if (!cpi->rc.is_src_frame_alt_ref) av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, best_mode_index); + sf->adaptive_rd_thresh, bsize, + search_state.best_mode_index); // macroblock modes - *mbmi = best_mbmode; - x->skip |= best_skip2; - -// Note: this section is needed since the mode 
may have been forced to -// ZEROMV by the all-zero mode handling of ref-mv. -#if CONFIG_GLOBAL_MOTION - if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) { -#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR - // Correct the motion mode for ZEROMV - const MOTION_MODE last_motion_mode_allowed = - motion_mode_allowed(0, xd->global_motion, -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); - if (mbmi->motion_mode > last_motion_mode_allowed) - mbmi->motion_mode = last_motion_mode_allowed; -#endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR - - // Correct the interpolation filter for ZEROMV - if (is_nontrans_global_motion(xd)) { - mbmi->interp_filters = av1_broadcast_interp_filter( - av1_unswitchable_filter(cm->interp_filter)); + *mbmi = search_state.best_mbmode; + x->skip |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. + if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + assert(mbmi->interp_filters == + av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter))); } } -#endif // CONFIG_GLOBAL_MOTION - - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - if (mbmi->mode != NEWMV) - mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int; - else - mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int; - } for (i = 0; i < REFERENCE_MODES; ++i) { - if (best_pred_rd[i] == INT64_MAX) - best_pred_diff[i] = INT_MIN; + if (search_state.best_pred_rd[i] == INT64_MAX) + search_state.best_pred_diff[i] = INT_MIN; else - best_pred_diff[i] = best_rd - best_pred_rd[i]; + search_state.best_pred_diff[i] = + search_state.best_rd - search_state.best_pred_rd[i]; } - x->skip |= best_mode_skippable; + x->skip |= search_state.best_mode_skippable; - assert(best_mode_index >= 0); + assert(search_state.best_mode_index >= 0); - store_coding_context(x, ctx, 
best_mode_index, best_pred_diff, - best_mode_skippable); + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_pred_diff, + search_state.best_mode_skippable); if (pmi->palette_size[1] > 0) { assert(try_palette); @@ -12160,18 +10535,14 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, int64_t best_rd_so_far) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; unsigned char segment_id = mbmi->segment_id; const int comp_pred = 0; int i; int64_t best_pred_diff[REFERENCE_MODES]; - unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_COMP_REFS - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; -#else - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_COMP_REFS - aom_prob comp_mode_p; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; InterpFilter best_filter = SWITCHABLE; int64_t this_rd = INT64_MAX; int rate2 = 0; @@ -12179,12 +10550,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, (void)mi_row; (void)mi_col; - estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, - &comp_mode_p); + av1_collect_neighbors_ref_counts(xd); - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; - for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i) - x->pred_mv_sad[i] = INT_MAX; + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; rd_cost->rate = INT_MAX; @@ -12192,58 +10564,35 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] 
= 0; - -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - mbmi->mode = ZEROMV; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = LAST_FRAME; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_GLOBAL_MOTION mbmi->mv[0].as_int = gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + cm->cur_frame_force_integer_mv) .as_int; -#else // CONFIG_GLOBAL_MOTION - mbmi->mv[0].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION mbmi->tx_size = max_txsize_lookup[bsize]; x->skip = 1; mbmi->ref_mv_idx = 0; - mbmi->pred_mv[0].as_int = 0; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif mbmi->motion_mode = SIMPLE_TRANSLATION; -#if CONFIG_MOTION_VAR av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); -#endif -#if CONFIG_WARPED_MOTION if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#if WARPED_MOTION_SORT_SAMPLES - int pts_mv[SAMPLES_ARRAY_SIZE]; - mbmi->num_proj_ref[0] = - findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv); - // Rank the samples by motion vector difference - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts, - pts_inref, mbmi->num_proj_ref[0]); -#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES + // Select the samples according to motion vector 
difference + if (mbmi->num_proj_ref[0] > 1) + mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref[0], bsize); } -#endif set_default_interp_filters(mbmi, cm->interp_filter); @@ -12270,7 +10619,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, rate2 += av1_get_switchable_rate(cm, x, xd); if (cm->reference_mode == REFERENCE_MODE_SELECT) - rate2 += av1_cost_bit(comp_mode_p, comp_pred); + rate2 += comp_inter_cost[comp_pred]; // Estimate the reference frame signaling cost and add it // to the rolling cost variable. @@ -12292,15 +10641,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, av1_extract_interp_filter(mbmi->interp_filters, 0))); av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV); av1_zero(best_pred_diff); - store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0); + store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0); } -#if CONFIG_MOTION_VAR - struct calc_target_weighted_pred_ctxt { const MACROBLOCK *x; const uint8_t *tmp; @@ -12308,28 +10655,22 @@ struct calc_target_weighted_pred_ctxt { int overlap; }; -static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, - int rel_mi_col, - uint8_t nb_mi_width, - MODE_INFO *nb_mi, - void *fun_ctxt) { +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes) { (void)nb_mi; + (void)num_planes; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; -#if CONFIG_HIGHBITDEPTH - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH - const int bw = xd->n8_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; if (!is_hbd) { for (int row = 0; row < ctxt->overlap; ++row) { @@ -12343,7 +10684,6 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, mask += bw; tmp += ctxt->tmp_stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); @@ -12358,32 +10698,25 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, mask += bw; tmp16 += ctxt->tmp_stride; } -#endif // CONFIG_HIGHBITDEPTH } } -static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, - int rel_mi_row, - uint8_t nb_mi_height, - MODE_INFO *nb_mi, - void *fun_ctxt) { +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes) { (void)nb_mi; + (void)num_planes; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; -#if CONFIG_HIGHBITDEPTH - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH - const int bw = xd->n8_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw); const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; if (!is_hbd) { for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) { @@ -12398,7 +10731,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, mask += bw; tmp += ctxt->tmp_stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); @@ -12414,7 +10746,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, mask += bw; tmp16 += ctxt->tmp_stride; } -#endif // CONFIG_HIGHBITDEPTH } } @@ -12461,18 +10792,14 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int mi_col, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride) { - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int bw = xd->n8_w << MI_SIZE_LOG2; const int bh = xd->n8_h << MI_SIZE_LOG2; int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; - const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; -#if CONFIG_HIGHBITDEPTH const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH + const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; // plane 0 should not be subsampled assert(xd->plane[0].subsampling_x == 0); @@ -12488,7 +10815,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride, overlap }; foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col, - max_neighbor_obmc[b_width_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_wide_log2[bsize]], calc_target_weighted_pred_above, &ctxt); } @@ -12504,7 +10831,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride, overlap }; foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row, - max_neighbor_obmc[b_height_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_high_log2[bsize]], calc_target_weighted_pred_left, &ctxt); } @@ -12518,7 +10845,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, wsrc_buf += bw; src += x->plane[0].src.stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); @@ -12529,462 +10855,5 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, wsrc_buf += bw; src += x->plane[0].src.stride; } -#endif // CONFIG_HIGHBITDEPTH - } -} - -#if CONFIG_NCOBMC -void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MB_MODE_INFO backup_mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; - int ref, skip_blk, backup_skip = x->skip; - int64_t rd_causal; - RD_STATS rd_stats_y, rd_stats_uv; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - - // Recompute 
the best causal predictor and rd - mbmi->motion_mode = SIMPLE_TRANSLATION; - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - assert(cfg != NULL); - av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 0; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - skip_blk = 0; - } - backup_skip = skip_blk; - backup_mbmi = *mbmi; - rd_causal = RDCOST(x->rdmult, 
(rd_stats_y.rate + rd_stats_uv.rate), - (rd_stats_y.dist + rd_stats_uv.dist)); - rd_causal += - RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); - - // Check non-causal mode - mbmi->motion_mode = OBMC_CAUSAL; - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 0; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - skip_blk = 0; - } - - if (rd_causal > - RDCOST(x->rdmult, - rd_stats_y.rate + rd_stats_uv.rate + - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), - (rd_stats_y.dist + rd_stats_uv.dist))) { - x->skip = skip_blk; - } else { - *mbmi = backup_mbmi; - x->skip = backup_skip; - } -} -#endif // CONFIG_NCOBMC - -int64_t get_prediction_rd_cost(const struct 
AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col, int *skip_blk, - MB_MODE_INFO *backup_mbmi) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - RD_STATS rd_stats_y, rd_stats_uv; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - int64_t this_rd; - int ref; - -#if CONFIG_CB4X4 - x->skip_chroma_rd = - !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); -#endif - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - assert(cfg != NULL); - av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) -#endif - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_NCOBMC - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#else - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#endif - } -#endif // CONFIG_MOTION_VAR - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) - get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane); -#endif - av1_subtract_plane(x, bsize, 
0); - -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - *skip_blk = 1; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - *skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - *skip_blk = 0; - } - - if (backup_mbmi) *backup_mbmi = *mbmi; - - this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), - (rd_stats_y.dist + rd_stats_uv.dist)); -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - if (motion_allowed == NCOBMC_ADAPT_WEIGHT) { - assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT); - this_rd += - RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0); - } else if (motion_allowed == OBMC_CAUSAL) { - assert(mbmi->motion_mode <= OBMC_CAUSAL); - this_rd += - RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0); - } else { -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - this_rd += - 
RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0); -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - return this_rd; -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi, - struct macroblock *x, int mi_row, - int mi_col) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_VAR_TX - const int n4 = bsize_to_num_blk(bsize); - uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif - MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi; - int st_skip, obmc_skip, ncobmc_skip; - int64_t st_rd, obmc_rd, ncobmc_rd; -#if CONFIG_WARPED_MOTION - const AV1_COMMON *const cm = &cpi->common; - const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL; - const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0); - MB_MODE_INFO warp_mbmi; - int64_t warp_rd; - int warp_skip; -#endif - - // Recompute the rd for the motion mode decided in rd loop - mbmi->motion_mode = SIMPLE_TRANSLATION; - st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi); -#if CONFIG_WARPED_MOTION - st_rd += rs; -#endif -#if CONFIG_VAR_TX - memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4); -#endif - - mbmi->motion_mode = OBMC_CAUSAL; - obmc_rd = - get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi); -#if CONFIG_WARPED_MOTION - obmc_rd += rs; -#endif -#if CONFIG_VAR_TX - memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4); -#endif - - // Compute the rd cost for ncobmc adaptive weight - mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT; - ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip, - &ncobmc_mbmi); -#if CONFIG_WARPED_MOTION - ncobmc_rd += rs; -#endif - // Calculate the 
ncobmc mode costs - { - ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize]; - ncobmc_rd += - RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0); - if (mi_size_wide[bsize] != mi_size_high[bsize]) - ncobmc_rd += - RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0); - } -#if CONFIG_VAR_TX - memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4); -#endif - -#if CONFIG_WARPED_MOTION - if (is_warp_motion) { - mbmi->motion_mode = WARPED_CAUSAL; - warp_rd = - get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi); - } else { - warp_rd = INT64_MAX; - } -#endif - -#if CONFIG_WARPED_MOTION - if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) { - if (ncobmc_rd < warp_rd) { - x->skip = ncobmc_skip; - *mbmi = ncobmc_mbmi; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); -#endif - } else { - x->skip = warp_skip; - *mbmi = warp_mbmi; - } -#else - if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) { - x->skip = ncobmc_skip; - *mbmi = ncobmc_mbmi; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); -#endif -#endif // CONFIG_WARPED_MOTION - } else { - if (obmc_rd < st_rd) { - *mbmi = obmc_mbmi; - x->skip = obmc_skip; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4); -#endif - } else { - *mbmi = st_mbmi; - x->skip = st_skip; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4); -#endif - } - } -} - -int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col, - BLOCK_SIZE bsize, int plane, struct buf_2d *src) { - const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE, - (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col); - const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE, - (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row); - const int ss_x = xd->plane[plane].subsampling_x; - const int ss_y = xd->plane[plane].subsampling_y; - int row_offset 
= (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y; - int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x; - int dst_stride = xd->ncobmc_pred_buf_stride[plane]; - int dst_offset = row_offset * dst_stride + col_offset; - int src_stride = src->stride; - - int r, c; - int64_t tmp, error = 0; - - for (r = 0; r < (high >> ss_y); ++r) { - for (c = 0; c < (wide >> ss_x); ++c) { - tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] - - src->buf[r * src_stride + c]; - error += tmp * tmp; - } - } - return error; -} - -int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, - MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) { - const AV1_COMMON *const cm = &cpi->common; - uint8_t *pred_buf[4][MAX_MB_PLANE]; - - // TODO(weitinglin): stride size needs to be fixed for high-bit depth - int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - - // target block in pxl - int pxl_row = mi_row << MI_SIZE_LOG2; - int pxl_col = mi_col << MI_SIZE_LOG2; - int64_t error, best_error = INT64_MAX; - int plane, tmp_mode, best_mode = 0; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE, - len); - } else { -#endif // CONFIG_HIGHBITDEPTH - ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE); -#if CONFIG_HIGHBITDEPTH - } -#endif - - av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride); - 
av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride); - - for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) { - error = 0; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, - pred_stride, tmp_mode); - error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane, - &x->plane[plane].src); - } - if (error < best_error) { - best_mode = tmp_mode; - best_error = error; - } } - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, - pred_stride, best_mode); - } - - return best_mode; } - -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index dbc7527fb..1fa3d68ce 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -13,16 +13,20 @@ #define AV1_ENCODER_RDOPT_H_ #include "av1/common/blockd.h" +#include "av1/common/txb_common.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" #ifdef __cplusplus extern "C" { #endif +#define MAX_REF_MV_SERCH 3 + struct TileInfo; -struct AV1_COMP; struct macroblock; struct RD_STATS; @@ -35,7 +39,6 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane, (void)tx_size; rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; -#if CONFIG_VAR_TX { const int txb_h = tx_size_high_unit[tx_size]; const int txb_w = tx_size_wide_unit[tx_size]; @@ -48,113 +51,86 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane, } assert(blk_row < TXB_COEFF_COST_MAP_SIZE); assert(blk_col < TXB_COEFF_COST_MAP_SIZE); -#endif } #endif -typedef enum OUTPUT_STATUS { - OUTPUT_HAS_PREDICTED_PIXELS, - OUTPUT_HAS_DECODED_PIXELS -} OUTPUT_STATUS; - // Returns the number of colors in 'src'. 
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols); -#if CONFIG_HIGHBITDEPTH +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count); // Same as av1_count_colors(), but for high-bitdepth mode. int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth); -#endif // CONFIG_HIGHBITDEPTH - -void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, - TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, - OUTPUT_STATUS output_status); + int bit_depth, int *val_count); #if CONFIG_DIST_8X8 -int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, +int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, int bsh, int visible_w, int visible_h, int qindex); #endif -#if !CONFIG_PVQ || CONFIG_VAR_TX -int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int use_fast_coef_costing); +static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx, + int plane, TX_SIZE tx_size) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; +} + +static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, + int plane, int blk_row, int blk_col, + int block, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + int use_fast_coef_costing) { +#if TXCOEFF_COST_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + (void)use_fast_coef_costing; + const int cost = 
av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx); +#if TXCOEFF_COST_TIMER + AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + tmp_cm->txcoeff_cost_timer += elapsed_time; + ++tmp_cm->txcoeff_cost_count; #endif + return cost; +} + void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, - struct RD_STATS *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int mi_row, int mi_col, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd); -unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, +unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); -#if CONFIG_HIGHBITDEPTH -unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, +unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); -#endif void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, - struct RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd_so_far); + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); void av1_rd_pick_inter_mode_sb_seg_skip( const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -int av1_internal_image_edge(const struct AV1_COMP *cpi); -int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step); -int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step); -int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col); - -#if 
CONFIG_MOTION_VAR && CONFIG_NCOBMC -void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col); -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC - -#if CONFIG_SUPERTX -#if CONFIG_VAR_TX -void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, - int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#define INTER_MODE_RD_TEST 0 +void av1_inter_mode_data_init(); +void av1_inter_mode_data_fit(int rdmult); +void av1_inter_mode_data_show(const AV1_COMMON *cm); #endif -void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, - int64_t *distortion, int *skippable, - int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting); -#endif // CONFIG_SUPERTX - #ifdef __cplusplus } // extern "C" #endif -int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, TX_TYPE tx_type); - -int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col, int *skip_blk, - MB_MODE_INFO *backup_mbmi); - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi, - struct macroblock *x, int mi_row, - int mi_col); -int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, - MACROBLOCKD *xd, int mi_row, int mi_col, int bsize); - -#endif - #endif // AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c index 4f01fbba4..2e9102745 100644 --- a/third_party/aom/av1/encoder/segmentation.c +++ b/third_party/aom/av1/encoder/segmentation.c @@ -18,26 +18,21 @@ #include "av1/encoder/cost.h" #include "av1/encoder/segmentation.h" -#include "av1/encoder/subexp.h" void av1_enable_segmentation(struct segmentation *seg) 
{ seg->enabled = 1; seg->update_map = 1; seg->update_data = 1; + seg->temporal_update = 0; } void av1_disable_segmentation(struct segmentation *seg) { seg->enabled = 0; seg->update_map = 0; seg->update_data = 0; + seg->temporal_update = 0; } -void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data, - unsigned char abs_delta) { - seg->abs_delta = abs_delta; - - memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); -} void av1_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_mask[segment_id] &= ~(1 << feature_id); @@ -48,76 +43,8 @@ void av1_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } -// Based on set of segment counts calculate a probability tree -static void calc_segtree_probs(unsigned *segcounts, - aom_prob *segment_tree_probs, - const aom_prob *cur_tree_probs, - const int probwt) { - // Work out probabilities of each segment - const unsigned cc[4] = { segcounts[0] + segcounts[1], - segcounts[2] + segcounts[3], - segcounts[4] + segcounts[5], - segcounts[6] + segcounts[7] }; - const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] }; - int i; - - segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]); - segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]); - segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]); - segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]); - segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]); - segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]); - segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]); - - for (i = 0; i < 7; i++) { - const unsigned *ct = - i == 0 ? ccc : i < 3 ? 
cc + (i & 2) : segcounts + (i - 3) * 2; - av1_prob_diff_update_savings_search(ct, cur_tree_probs[i], - &segment_tree_probs[i], - DIFF_UPDATE_PROB, probwt); - } -} - -// Based on set of segment counts and probabilities calculate a cost estimate -static int cost_segmap(unsigned *segcounts, aom_prob *probs) { - const int c01 = segcounts[0] + segcounts[1]; - const int c23 = segcounts[2] + segcounts[3]; - const int c45 = segcounts[4] + segcounts[5]; - const int c67 = segcounts[6] + segcounts[7]; - const int c0123 = c01 + c23; - const int c4567 = c45 + c67; - - // Cost the top node of the tree - int cost = c0123 * av1_cost_zero(probs[0]) + c4567 * av1_cost_one(probs[0]); - - // Cost subsequent levels - if (c0123 > 0) { - cost += c01 * av1_cost_zero(probs[1]) + c23 * av1_cost_one(probs[1]); - - if (c01 > 0) - cost += segcounts[0] * av1_cost_zero(probs[3]) + - segcounts[1] * av1_cost_one(probs[3]); - if (c23 > 0) - cost += segcounts[2] * av1_cost_zero(probs[4]) + - segcounts[3] * av1_cost_one(probs[4]); - } - - if (c4567 > 0) { - cost += c45 * av1_cost_zero(probs[2]) + c67 * av1_cost_one(probs[2]); - - if (c45 > 0) - cost += segcounts[4] * av1_cost_zero(probs[5]) + - segcounts[5] * av1_cost_one(probs[5]); - if (c67 > 0) - cost += segcounts[6] * av1_cost_zero(probs[6]) + - segcounts[7] * av1_cost_one(probs[6]); - } - - return cost; -} - static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *tile, MODE_INFO **mi, + const TileInfo *tile, MB_MODE_INFO **mi, unsigned *no_pred_segcounts, unsigned (*temporal_predictor_count)[2], unsigned *t_unpred_seg_counts, int bw, int bh, @@ -127,29 +54,27 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; xd->mi = mi; - segment_id = xd->mi[0]->mbmi.segment_id; + segment_id = xd->mi[0]->segment_id; - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - 
cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; // Test to see if the segment id matches the predicted value. const int pred_segment_id = - get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); + cm->last_frame_seg_map + ? get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) + : 0; const int pred_flag = pred_segment_id == segment_id; const int pred_context = av1_get_pred_context_seg_id(xd); // Store the prediction status for this mb and update counts // as appropriate - xd->mi[0]->mbmi.seg_id_predicted = pred_flag; + xd->mi[0]->seg_id_predicted = pred_flag; temporal_predictor_count[pred_context][pred_flag]++; // Update the "unpredicted" segment count @@ -158,21 +83,15 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, } static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *tile, MODE_INFO **mi, + const TileInfo *tile, MB_MODE_INFO **mi, unsigned *no_pred_segcounts, unsigned (*temporal_predictor_count)[2], unsigned *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { const int mis = cm->mi_stride; const int bs = mi_size_wide[bsize], hbs = bs / 2; -#if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition; -#if CONFIG_EXT_PARTITION_TYPES_AB const int qbs = bs / 4; -#endif // CONFIG_EXT_PARTITION_TYPES_AB -#else - int bw, bh; -#endif // CONFIG_EXT_PARTITION_TYPES if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -181,7 +100,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \ (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff)); -#if 
CONFIG_EXT_PARTITION_TYPES if (bsize == BLOCK_8X8) partition = PARTITION_NONE; else @@ -196,28 +114,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, CSEGS(hbs, bs, 0, 0); CSEGS(hbs, bs, 0, hbs); break; -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - CSEGS(bs, qbs, 0, 0); - CSEGS(bs, qbs, qbs, 0); - CSEGS(bs, hbs, hbs, 0); - break; - case PARTITION_HORZ_B: - CSEGS(bs, hbs, 0, 0); - CSEGS(bs, qbs, hbs, 0); - if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); - break; - case PARTITION_VERT_A: - CSEGS(qbs, bs, 0, 0); - CSEGS(qbs, bs, 0, qbs); - CSEGS(hbs, bs, 0, hbs); - break; - case PARTITION_VERT_B: - CSEGS(hbs, bs, 0, 0); - CSEGS(qbs, bs, 0, hbs); - if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); - break; -#else case PARTITION_HORZ_A: CSEGS(hbs, hbs, 0, 0); CSEGS(hbs, hbs, 0, hbs); @@ -238,14 +134,24 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, CSEGS(hbs, hbs, 0, hbs); CSEGS(hbs, hbs, hbs, hbs); break; -#endif + case PARTITION_HORZ_4: + CSEGS(bs, qbs, 0, 0); + CSEGS(bs, qbs, qbs, 0); + CSEGS(bs, qbs, 2 * qbs, 0); + if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); + break; + + case PARTITION_VERT_4: + CSEGS(qbs, bs, 0, 0); + CSEGS(qbs, bs, 0, qbs); + CSEGS(qbs, bs, 0, 2 * qbs); + if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); + break; + case PARTITION_SPLIT: { - const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); int n; - assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs && - num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs); - for (n = 0; n < 4; n++) { const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); @@ -257,34 +163,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, } break; default: assert(0); } -#else - bw = mi_size_wide[mi[0]->mbmi.sb_type]; - bh = mi_size_high[mi[0]->mbmi.sb_type]; - - if (bw == bs && bh 
== bs) { - CSEGS(bs, bs, 0, 0); - } else if (bw == bs && bh < bs) { - CSEGS(bs, hbs, 0, 0); - CSEGS(bs, hbs, hbs, 0); - } else if (bw < bs && bh == bs) { - CSEGS(hbs, bs, 0, 0); - CSEGS(hbs, bs, 0, hbs); - } else { - const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; - int n; - - assert(bw < bs && bh < bs); - - for (n = 0; n < 4; n++) { - const int mi_dc = hbs * (n & 1); - const int mi_dr = hbs * (n >> 1); - - count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, - temporal_predictor_count, t_unpred_seg_counts, - mi_row + mi_dr, mi_col + mi_dc, subsize); - } - } -#endif // CONFIG_EXT_PARTITION_TYPES #undef CSEGS } @@ -292,83 +170,58 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; struct segmentation_probs *segp = &cm->fc->seg; - int no_pred_cost; int t_pred_cost = INT_MAX; - int tile_col, tile_row, mi_row, mi_col; - const int probwt = cm->num_tg; - - unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred; - unsigned *no_pred_segcounts = cm->counts.seg.tree_total; - unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred; - - aom_prob no_pred_tree[SEG_TREE_PROBS]; - aom_prob t_pred_tree[SEG_TREE_PROBS]; -#if !CONFIG_NEW_MULTISYMBOL - aom_prob t_nopred_prob[PREDICTION_PROBS]; -#endif - + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; + unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; (void)xd; - // We are about to recompute all the segment counts, so zero the accumulators. 
- av1_zero(cm->counts.seg); - // First of all generate stats regarding how well the last segment map // predicts this one for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { TileInfo tile_info; av1_tile_set_row(&tile_info, cm, tile_row); for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { - MODE_INFO **mi_ptr; + MB_MODE_INFO **mi_ptr; av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride + tile_info.mi_col_start; for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) { - MODE_INFO **mi = mi_ptr; + mi_row += cm->seq_params.mib_size, + mi_ptr += cm->seq_params.mib_size * cm->mi_stride) { + MB_MODE_INFO **mi = mi_ptr; for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->mib_size, mi += cm->mib_size) { + mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) { count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, - mi_col, cm->sb_size); + mi_col, cm->seq_params.sb_size); } } } } - // Work out probability tree for coding segments without prediction - // and the cost. - calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt); - no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); - - // Key frames cannot use temporal prediction - if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { - // Work out probability tree for coding those segments not - // predicted using the temporal method and the cost. - calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs, - probwt); - t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); -#if !CONFIG_NEW_MULTISYMBOL - // Add in the cost of the signaling for each prediction context. 
- int i; - for (i = 0; i < PREDICTION_PROBS; i++) { - const int count0 = temporal_predictor_count[i][0]; - const int count1 = temporal_predictor_count[i][1]; - - t_nopred_prob[i] = get_binary_prob(count0, count1); - av1_prob_diff_update_savings_search( - temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i], - DIFF_UPDATE_PROB, probwt); - - // Add in the predictor signaling cost - t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) + - count1 * av1_cost_one(t_nopred_prob[i]); + int seg_id_cost[MAX_SEGMENTS]; + av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL); + no_pred_cost = 0; + for (int i = 0; i < MAX_SEGMENTS; ++i) + no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i]; + + // Frames without past dependency cannot use temporal prediction + if (cm->primary_ref_frame != PRIMARY_REF_NONE) { + int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2]; + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) + av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL); + t_pred_cost = 0; + // Cost for signaling the prediction flag. + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + for (int j = 0; j < 2; ++j) + t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j]; } -#endif + // Cost for signaling the unpredicted segment id. + for (int i = 0; i < MAX_SEGMENTS; ++i) + t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i]; } // Now choose which coding method to use. diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h index 1d24ed1d1..a207b0f26 100644 --- a/third_party/aom/av1/encoder/segmentation.h +++ b/third_party/aom/av1/encoder/segmentation.h @@ -27,19 +27,6 @@ void av1_disable_segfeature(struct segmentation *seg, int segment_id, void av1_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -// The values given for each segment can be either deltas (from the default -// value chosen for the frame) or absolute values. 
-// -// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for -// SEGMENT_ALT_LF) -// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for -// SEGMENT_ALT_LF) -// -// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use -// the absolute values given). -void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data, - unsigned char abs_delta); - void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); void av1_reset_segment_features(AV1_COMMON *cm); diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 5608d031e..49740817c 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -17,6 +17,12 @@ #include "aom_dsp/aom_dsp_common.h" +// Setting this to 1 will disable trellis optimization completely. +// Setting this to 2 will disable trellis optimization within the +// transform search. Trellis optimization will still be applied +// in the final encode. 
+#define DISABLE_TRELLISQ_SEARCH 0 + #define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { @@ -28,23 +34,21 @@ static MESH_PATTERN { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 + 50, 50, 25, 15, 5, 1 }; -#if CONFIG_INTRABC -// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for +// TODO(huisu@google.com): These settings are pretty relaxed, tune them for // each speed setting static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, - { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, }; static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100, 25, 25, 10 }; -#endif // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality @@ -74,22 +78,18 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) { } } +// Do we have an internal image edge (e.g. formatting bars). +static int has_internal_image_edge(const AV1_COMP *cpi) { + return (cpi->oxcf.pass == 2) && + ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || + (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); +} + static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { AV1_COMMON *const cm = &cpi->common; - if (speed >= 1) { - if (AOMMIN(cm->width, cm->height) >= 720) { - sf->disable_split_mask = - cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 23); - } else { - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 21); - } - } - if (speed >= 2) { if (AOMMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = @@ -121,11 +121,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } // If this is a two pass clip that fits the criteria for animated or - // graphics content then reset disable_split_mask for speeds 1-4. + // graphics content then reset disable_split_mask for speeds 2+. // Also if the image edge is internal to the coded area. - if ((speed >= 1) && (cpi->oxcf.pass == 2) && + if ((speed >= 2) && (cpi->oxcf.pass == 2) && ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - (av1_internal_image_edge(cpi)))) { + (has_internal_image_edge(cpi)))) { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; } @@ -145,85 +145,83 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); + // Speed 0 for all speed features that give neutral coding performance change. 
+ sf->reduce_inter_modes = 1; + sf->prune_ext_partition_types_search_level = 1; + sf->ml_prune_ab_partition = 1; + sf->adaptive_txb_search_level = 1; + sf->jnt_comp_skip_mv_search = 1; + sf->model_based_prune_tx_search_level = 1; + sf->model_based_post_interp_filter_breakout = 1; + sf->inter_mode_rd_model_estimation = 1; + if (speed >= 1) { - sf->tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_type_search.fast_inter_tx_type_search = 1; + sf->gm_erroradv_type = GM_ERRORADV_TR_1; + sf->selective_ref_frame = 1; + sf->inter_tx_size_search_init_depth_rect = 1; + sf->inter_tx_size_search_init_depth_sqr = 1; + sf->intra_tx_size_search_init_depth_rect = 1; + sf->intra_tx_size_search_init_depth_sqr = 1; + sf->tx_size_search_lgr_block = 1; + sf->two_pass_partition_search = 1; + sf->mode_pruning_based_on_two_pass_partition_search = 1; + sf->prune_ext_partition_types_search_level = 2; + sf->use_fast_interpolation_filter_search = 1; + sf->skip_repeat_interpolation_filter_search = 1; + sf->tx_type_search.skip_tx_search = 1; + sf->tx_type_search.ml_tx_split_thresh = 40; + sf->model_based_prune_tx_search_level = 0; + sf->model_based_post_interp_filter_breakout = 0; + // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation + // on speed 1 + sf->inter_mode_rd_model_estimation = 0; + sf->adaptive_txb_search_level = 2; + sf->use_intra_txb_hash = 1; + sf->optimize_b_precheck = 1; + sf->dual_sgr_penalty_level = 1; } if (speed >= 2) { - if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - av1_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); - } else { - sf->use_square_partition_only = !frame_is_intra_only(cm); - } + sf->gm_erroradv_type = GM_ERRORADV_TR_2; - sf->less_rectangular_check = 1; + sf->selective_ref_frame = 2; + sf->fast_cdef_search = 1; sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 1; + sf->mv.auto_mv_step_size = 1; 
sf->mv.subpel_iters_per_step = 1; - sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; - - sf->recode_loop = ALLOW_RECODE_KFARFGF; -#if CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V; -#endif // CONFIG_CFL -#endif // CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; -#endif - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; -#endif + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->tx_size_search_breakout = 1; sf->partition_search_breakout_rate_thr = 80; - sf->tx_type_search.prune_mode = PRUNE_ONE; - // Use transform domain distortion. - // Note var-tx expt always uses pixel domain distortion. - sf->use_transform_domain_distortion = 1; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->allow_partition_search_skip = 1; sf->disable_wedge_search_var_thresh = 100; sf->fast_wedge_sign_estimate = 1; } if (speed >= 3) { - sf->tx_size_search_method = - frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) - ? 0 - : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; - sf->use_upsampled_references = 0; + sf->tx_size_search_method = boosted ? 
USE_FULL_RD : USE_LARGESTALL; + sf->less_rectangular_check = 1; + sf->mode_skip_start = 10; + sf->adaptive_pred_interp_filter = 1; + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->adaptive_motion_search = 1; + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->use_transform_domain_distortion = 1; + sf->use_accurate_subpel_search = 0; sf->adaptive_rd_thresh = 2; -#if CONFIG_EXT_TX - sf->tx_type_search.prune_mode = PRUNE_TWO; -#endif -#if CONFIG_GLOBAL_MOTION + sf->tx_type_search.prune_mode = PRUNE_2D_FAST; sf->gm_search_type = GM_DISABLE_SEARCH; -#endif // CONFIG_GLOBAL_MOTION } if (speed >= 4) { - sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_type_search.fast_inter_tx_type_search = 1; + sf->use_square_partition_only = !boosted; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; @@ -232,52 +230,44 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->cb_partition_search = !boosted; sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; - sf->recode_loop = ALLOW_RECODE_KFMAXBW; - sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; -#if CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC; -#endif // CONFIG_CFL -#endif // CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; -#endif // CONFIG_CFL sf->adaptive_interp_filter_search = 1; } if (speed >= 5) { + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + 
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->use_square_partition_only = 1; sf->tx_size_search_method = USE_LARGESTALL; sf->mv.search_method = BIGDIA; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->adaptive_rd_thresh = 4; - if (cm->frame_type != KEY_FRAME) - sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE; + sf->mode_search_skip_flags = + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; sf->disable_filter_search_var_thresh = 200; sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->partition_search_breakout_rate_thr = 300; + sf->use_transform_domain_distortion = 2; } if (speed >= 6) { int i; - sf->optimize_coefficients = 0; + sf->optimize_coefficients = NO_TRELLIS_OPT; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { sf->intra_y_mode_mask[i] = INTRA_DC; -#if CONFIG_CFL sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[i] = INTRA_DC; -#endif // CONFIG_CFL } sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; @@ -288,9 +278,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; sf->default_max_partition_size = BLOCK_32X32; sf->default_min_partition_size = BLOCK_8X8; -#if CONFIG_TX64X64 sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; -#endif // CONFIG_TX64X64 sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->mv.search_method = FAST_HEX; @@ -298,13 +286,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; -#if CONFIG_EXT_PARTITION sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; -#endif // CONFIG_EXT_PARTITION sf->partition_search_type = REFERENCE_PARTITION; - sf->default_min_partition_size = BLOCK_8X8; sf->reuse_inter_pred_sby = 1; sf->force_frame_boost = is_keyframe || @@ -324,31 +309,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - AV1_COMMON *const cm = &cpi->common; RD_OPT *const rd = &cpi->rd; int i; -// Limit memory usage for high resolutions -#if CONFIG_EXT_REFS - // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for - // ext-refs. Need to work with @yunqingwang to have a more - // effective solution. 
- if (AOMMIN(cm->width, cm->height) > 720) { - // Turn off the use of upsampled references for HD resolution - sf->use_upsampled_references = 0; - } else if ((AOMMIN(cm->width, cm->height) > 540) && - (oxcf->profile != PROFILE_0)) { - sf->use_upsampled_references = 0; - } -#else - if (AOMMIN(cm->width, cm->height) > 1080) { - sf->use_upsampled_references = 0; - } else if ((AOMMIN(cm->width, cm->height) > 720) && - (oxcf->profile != PROFILE_0)) { - sf->use_upsampled_references = 0; - } -#endif // CONFIG_EXT_REFS - if (oxcf->mode == GOOD) { set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } @@ -371,6 +334,52 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } +static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { + AV1_COMMON *const cm = &cpi->common; + + if (speed & TXFM_CODING_SF) { + sf->inter_tx_size_search_init_depth_rect = 1; + sf->inter_tx_size_search_init_depth_sqr = 1; + sf->intra_tx_size_search_init_depth_rect = 1; + sf->intra_tx_size_search_init_depth_sqr = 1; + sf->tx_size_search_method = USE_FAST_RD; + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_type_search.fast_inter_tx_type_search = 1; + } + + if (speed & INTER_PRED_SF) { + sf->selective_ref_frame = 2; + // sf->adaptive_motion_search = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 1; + sf->mv.subpel_iters_per_step = 1; + sf->adaptive_pred_interp_filter = 1; + } + + if (speed & INTRA_PRED_SF) { + sf->max_intra_bsize = BLOCK_32X32; + } + + if (speed & PARTITION_SF) { + if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || + has_internal_image_edge(cpi)) { + sf->use_square_partition_only = !frame_is_boosted(cpi); + } else { + sf->use_square_partition_only = !frame_is_intra_only(cm); + } + sf->less_rectangular_check = 1; + sf->prune_ext_partition_types_search_level = 2; + } + + if (speed & LOOP_FILTER_SF) { + sf->fast_cdef_search = 1; + } + + if (speed & 
RD_SKIP_SF) { + sf->use_rd_breakout = 1; + } +} + void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; @@ -378,7 +387,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; int i; - (void)cm; // best quality defaults sf->frame_parameter_update = 1; sf->mv.search_method = NSTEP; @@ -386,7 +394,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->mv.subpel_search_method = SUBPEL_TREE; sf->mv.subpel_iters_per_step = 2; sf->mv.subpel_force_stop = 0; - sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); +#if DISABLE_TRELLISQ_SEARCH == 2 + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; +#elif DISABLE_TRELLISQ_SEARCH == 1 + sf->optimize_coefficients = NO_TRELLIS_OPT; +#else + if (is_lossless_requested(&cpi->oxcf)) + sf->optimize_coefficients = NO_TRELLIS_OPT; + else + sf->optimize_coefficients = FULL_TRELLIS_OPT; +#endif // DISABLE_TRELLISQ_SEARCH + sf->gm_erroradv_type = GM_ERRORADV_TR_0; sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; @@ -394,6 +414,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->tx_size_search_method = USE_FULL_RD; + sf->inter_tx_size_search_init_depth_sqr = 0; + sf->inter_tx_size_search_init_depth_rect = 0; + sf->intra_tx_size_search_init_depth_rect = 0; + sf->intra_tx_size_search_init_depth_sqr = 0; + sf->tx_size_search_lgr_block = 0; + sf->model_based_prune_tx_search_level = 0; + sf->model_based_post_interp_filter_breakout = 0; + sf->reduce_inter_modes = 0; + sf->selective_ref_gm = 1; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; @@ -401,10 +430,13 @@ void 
av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->cb_partition_search = 0; sf->alt_ref_search_fp = 0; sf->partition_search_type = SEARCH_PARTITION; - sf->tx_type_search.prune_mode = NO_PRUNE; + sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE; + sf->tx_type_search.ml_tx_split_thresh = 30; sf->tx_type_search.use_skip_flag_prediction = 1; sf->tx_type_search.fast_intra_tx_type_search = 0; sf->tx_type_search.fast_inter_tx_type_search = 0; + sf->tx_type_search.skip_tx_search = 0; + sf->selective_ref_frame = 0; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = NOT_IN_USE; @@ -420,17 +452,25 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; sf->allow_partition_search_skip = 0; - sf->use_upsampled_references = 1; + sf->use_accurate_subpel_search = 1; sf->disable_wedge_search_var_thresh = 0; sf->fast_wedge_sign_estimate = 0; + sf->drop_ref = 0; + sf->skip_intra_in_interframe = 1; + sf->txb_split_cap = 1; + sf->adaptive_txb_search_level = 0; + sf->two_pass_partition_search = 0; + sf->mode_pruning_based_on_two_pass_partition_search = 0; + sf->use_intra_txb_hash = 0; + sf->use_inter_txb_hash = 1; + sf->use_mb_rd_hash = 1; + sf->optimize_b_precheck = 0; + sf->jnt_comp_fast_tx_search = 0; + sf->jnt_comp_skip_mv_search = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; -#if CONFIG_CFL sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; -#else - sf->intra_uv_mode_mask[i] = INTRA_ALL; -#endif // CONFIG_CFL } sf->use_rd_breakout = 0; sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; @@ -448,22 +488,28 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { // Recode loop tolerance %. 
sf->recode_tolerance = 25; sf->default_interp_filter = SWITCHABLE; - sf->tx_size_search_breakout = 0; sf->partition_search_breakout_dist_thr = 0; sf->partition_search_breakout_rate_thr = 0; sf->simple_model_rd_from_var = 0; + sf->prune_ext_partition_types_search_level = 0; + sf->ml_prune_ab_partition = 0; + sf->fast_cdef_search = 0; // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; -#if CONFIG_GLOBAL_MOTION sf->gm_search_type = GM_FULL_SEARCH; -#endif // CONFIG_GLOBAL_MOTION + sf->use_fast_interpolation_filter_search = 0; + sf->skip_repeat_interpolation_filter_search = 0; + sf->use_hash_based_trellis = 0; + + // Set decoder side speed feature to use less dual sgr modes + sf->dual_sgr_penalty_level = 0; + + sf->inter_mode_rd_model_estimation = 0; - if (oxcf->mode == GOOD -#if CONFIG_XIPHRC - || oxcf->pass == 1 -#endif - ) + set_dev_sf(cpi, sf, oxcf->dev_sf); + + if (oxcf->mode == GOOD) set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); // sf->partition_search_breakout_dist_thr is set assuming max 64x64 @@ -472,7 +518,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6); } - cpi->full_search_sad = av1_full_search_sad; cpi->diamond_search_sad = av1_diamond_search_sad; sf->allow_exhaustive_searches = 1; @@ -490,7 +535,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->mesh_patterns[i].interval = good_quality_mesh_patterns[speed][i].interval; } -#if CONFIG_INTRABC if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) && (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->oxcf.content == AOM_CONTENT_SCREEN)) { @@ -500,18 +544,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { } sf->max_exaustive_pct = intrabc_max_mesh_pct[speed]; } -#endif // CONFIG_INTRABC -#if !CONFIG_XIPHRC // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are 
always turned off. - if (oxcf->pass == 1) sf->optimize_coefficients = 0; -#endif + if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT; // No recode for 1 pass. if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; - sf->optimize_coefficients = 0; + sf->optimize_coefficients = NO_TRELLIS_OPT; } if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -524,12 +565,11 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore; } -#if !CONFIG_AOM_QM - x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; -#else + cpi->optimize_speed_feature = + oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT; // FIXME: trellis not very efficient for quantisation matrices - x->optimize = 0; -#endif + if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT; + if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT; x->min_partition_size = sf->default_min_partition_size; x->max_partition_size = sf->default_max_partition_size; @@ -543,4 +583,8 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; else if (cpi->oxcf.motion_vector_unit_test == 2) cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; + +#if CONFIG_DIST_8X8 + if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0; +#endif // CONFIG_DIST_8X8 } diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index edd79cd16..59cb6be58 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -20,64 +20,51 @@ extern "C" { enum { INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | - (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | - (1 << D207_PRED) | (1 << D63_PRED) | (1 << SMOOTH_PRED) | -#if CONFIG_SMOOTH_HV - (1 << SMOOTH_V_PRED) | (1 << 
SMOOTH_H_PRED) | -#endif // CONFIG_SMOOTH_HV - (1 << TM_PRED), -#if CONFIG_CFL - UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | - (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | - (1 << UV_D117_PRED) | (1 << UV_D153_PRED) | - (1 << UV_D207_PRED) | (1 << UV_D63_PRED) | - (1 << UV_SMOOTH_PRED) | -#if CONFIG_SMOOTH_HV - (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) | -#endif // CONFIG_SMOOTH_HV - (1 << UV_TM_PRED) | (1 << UV_CFL_PRED), + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC = (1 << UV_DC_PRED), UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), - UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_TM_PRED), - UV_INTRA_DC_TM_CFL = - (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | (1 << UV_CFL_PRED), - UV_INTRA_DC_TM_H_V = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | - (1 << UV_V_PRED) | (1 << UV_H_PRED), - UV_INTRA_DC_TM_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | - (1 << UV_V_PRED) | (1 << UV_H_PRED) | - (1 << UV_CFL_PRED), -#endif // CONFIG_CFL + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | 
(1 << UV_H_PRED) | + (1 << UV_CFL_PRED), INTRA_DC = (1 << DC_PRED), - INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), - INTRA_DC_TM_H_V = - (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED) + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) }; enum { -#if CONFIG_COMPOUND_SINGLEREF -// TODO(zoeliu): To further consider following single ref comp modes: -// SR_NEAREST_NEARMV, SR_NEAREST_NEWMV, SR_NEAR_NEWMV, -// SR_ZERO_NEWMV, and SR_NEW_NEWMV. -#endif // CONFIG_COMPOUND_SINGLEREF - INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | - (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | - (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV), + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), - INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), - INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 
<< NEW_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), @@ -86,8 +73,8 @@ enum { (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), - INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), @@ -105,6 +92,17 @@ enum { (1 << THR_ALTR) | (1 << THR_GOLD) }; +typedef enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} DEV_SPEED_FEATURES; + typedef enum { DIAMOND = 0, NSTEP = 1, @@ -141,8 +139,8 @@ typedef enum { typedef enum { USE_FULL_RD = 0, + USE_FAST_RD, USE_LARGESTALL, - USE_TX_8X8 } TX_SIZE_SEARCH_METHOD; typedef enum { @@ -190,10 +188,13 @@ typedef enum { NO_PRUNE = 0, // eliminates one tx type in vertical and horizontal direction PRUNE_ONE = 1, -#if CONFIG_EXT_TX // eliminates two tx types in each direction PRUNE_TWO = 2, -#endif + // adaptively prunes the least perspective tx types out of all 16 + // (tuned to provide negligible quality loss) + PRUNE_2D_ACCURATE = 3, + // similar, but applies much more aggressive pruning to get better speed-up + PRUNE_2D_FAST = 4, } TX_TYPE_PRUNE_MODE; typedef struct { @@ -204,6 +205,13 @@ typedef struct { // Use a skip flag prediction model to detect blocks with skip = 1 early // and avoid doing full TX type search for such blocks. int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. 
+ int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; } TX_TYPE_SEARCH; typedef enum { @@ -261,13 +269,29 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; -#if CONFIG_GLOBAL_MOTION typedef enum { GM_FULL_SEARCH, GM_REDUCED_REF_SEARCH, GM_DISABLE_SEARCH } GM_SEARCH_TYPE; -#endif // CONFIG_GLOBAL_MOTION + +typedef enum { + GM_ERRORADV_TR_0, + GM_ERRORADV_TR_1, + GM_ERRORADV_TR_2, + GM_ERRORADV_TR_TYPES, +} GM_ERRORADV_TYPE; + +typedef enum { + NO_TRELLIS_OPT, // No trellis optimization + FULL_TRELLIS_OPT, // Trellis optimization in all stages + FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass +} TRELLIS_OPT_TYPE; + +typedef enum { + FULL_TXFM_RD, + LOW_TXFM_RD, +} TXFM_RD_MODEL; typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -277,8 +301,11 @@ typedef struct SPEED_FEATURES { RECODE_LOOP_TYPE recode_loop; - // Trellis (dynamic programming) optimization of quantized values (+1, 0). - int optimize_coefficients; + // Trellis (dynamic programming) optimization of quantized values + TRELLIS_OPT_TYPE optimize_coefficients; + + // Global motion warp error threshold + GM_ERRORADV_TYPE gm_erroradv_type; // Always set to 0. If on it enables 0 cost background transmission // (except for the initial transmission of the segmentation). The feature is @@ -287,6 +314,14 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // Do not compute the global motion parameters for a LAST2_FRAME or + // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity + // global model. + int selective_ref_gm; + // If 1 we iterate finding a best reference for 2 ref frames together - via // a log search that iterates 4 times (check around mv for last for best // error of combined predictor then check around mv for alt). 
If 0 we @@ -309,6 +344,17 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // Init search depth for square and rectangular transform partitions. + // Values: + // 0 - search full tree, 1: search 1 level, 2: search the highest level only + int inter_tx_size_search_init_depth_sqr; + int inter_tx_size_search_init_depth_rect; + int intra_tx_size_search_init_depth_sqr; + int intra_tx_size_search_init_depth_rect; + // If any dimension of a coding block size above 64, always search the + // largest transform only, since the largest transform block size is 64x64. + int tx_size_search_lgr_block; + // After looking at the first set of modes (set by index here), skip // checking modes for reference frames that don't match the reference frame // of the best so far. @@ -318,9 +364,51 @@ typedef struct SPEED_FEATURES { TX_TYPE_SEARCH tx_type_search; + // Skip split transform block partition when the collocated bigger block + // is selected as all zero coefficients. + int txb_split_cap; + + // Shortcut the transform block partition and type search when the target + // rdcost is relatively lower. + // Values are 0 (not used) , or 1 - 2 with progressively increasing + // aggressiveness + int adaptive_txb_search_level; + + // Prune level for tx_size_type search for inter based on rd model + // 0: no pruning + // 1-2: progressively increasing aggressiveness of pruning + int model_based_prune_tx_search_level; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + // Used if partition_search_type = FIXED_SIZE_PARTITION BLOCK_SIZE always_this_block_size; + // Drop less likely to be picked reference frames in the RD search. + // Has three levels for now: 0, 1 and 2, where higher levels prune more + // aggressively than lower ones. (0 means no pruning). 
+ int selective_ref_frame; + + // Prune extended partition types search + // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing + // aggressiveness of pruning in order. + int prune_ext_partition_types_search_level; + + // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. + int ml_prune_ab_partition; + + int fast_cdef_search; + + // 2-pass coding block partition search + int two_pass_partition_search; + + // Use the mode decisions made in the initial partition search to prune mode + // candidates, e.g. ref frames. + int mode_pruning_based_on_two_pass_partition_search; + // Skip rectangular partition test when partition type none gives better // rd than partition type split. int less_rectangular_check; @@ -427,7 +515,7 @@ typedef struct SPEED_FEATURES { // by only looking at counts from 1/2 the bands. FAST_COEFF_UPDATE use_fast_coef_updates; - // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV + // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV // modes are used in order from LSB to MSB for each BLOCK_SIZE. int inter_mode_mask[BLOCK_SIZES_ALL]; @@ -456,10 +544,6 @@ typedef struct SPEED_FEATURES { // default interp filter choice InterpFilter default_interp_filter; - // Early termination in transform size search, which only applies while - // tx_size_search_method is USE_FULL_RD. - int tx_size_search_breakout; - // adaptive interp_filter search to allow skip of certain filter types. int adaptive_interp_filter_search; @@ -476,16 +560,67 @@ typedef struct SPEED_FEATURES { // Fast approximation of av1_model_rd_from_var_lapndz int simple_model_rd_from_var; - // Do sub-pixel search in up-sampled reference frames - int use_upsampled_references; + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. 
+ int use_accurate_subpel_search; // Whether to compute distortion in the image domain (slower but // more accurate), or in the transform domain (faster but less acurate). + // 0: use image domain + // 1: use transform domain in tx_type search, and use image domain for + // RD_STATS + // 2: use transform domain int use_transform_domain_distortion; -#if CONFIG_GLOBAL_MOTION GM_SEARCH_TYPE gm_search_type; -#endif // CONFIG_GLOBAL_MOTION + + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Save results of interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are same with previous + // saved results, it can be skipped. + int skip_repeat_interpolation_filter_search; + + // Use a hash table to store previously computed optimized qcoeffs from + // expensive calls to optimize_txb. + int use_hash_based_trellis; + + // flag to drop some ref frames in compound motion search + int drop_ref; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // Use hash table to store intra(keyframe only) txb transform search results + // to avoid repeated search on the same residue signal. + int use_intra_txb_hash; + + // Use hash table to store inter txb transform search results + // to avoid repeated search on the same residue signal. + int use_inter_txb_hash; + + // Use hash table to store macroblock RD search results + // to avoid repeated search on the same residue signal. + int use_mb_rd_hash; + + // Calculate RD cost before doing optimize_b, and skip if the cost is large. + int optimize_b_precheck; + + // Use model rd instead of transform search in jnt_comp + int jnt_comp_fast_tx_search; + + // Skip mv search in jnt_comp + int jnt_comp_skip_mv_search; + + // Decoder side speed feature to add penalty for use of dual-sgr filters. 
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level + // adding a penalty of 1% + int dual_sgr_penalty_level; + + // Dynamically estimate final rd from prediction error and mode cost + int inter_mode_rd_model_estimation; } SPEED_FEATURES; struct AV1_COMP; diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c deleted file mode 100644 index dc96d712a..000000000 --- a/third_party/aom/av1/encoder/subexp.c +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#include "aom_dsp/bitwriter.h" - -#include "av1/common/common.h" -#include "av1/common/entropy.h" -#include "av1/encoder/cost.h" -#include "av1/encoder/subexp.h" - -static const uint8_t update_bits[255] = { - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 0, -}; -#define MIN_DELP_BITS 5 - -static int recenter_nonneg(int v, int m) { - if (v > (m << 1)) - return v; - else if (v >= m) - return ((v - m) << 1); - else - return ((m - v) << 1) - 1; -} - -static int remap_prob(int v, int m) { - int i; - static const uint8_t map_table[MAX_PROB - 1] = { - // generated by: - // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); - 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, - 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, - 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 
114, 115, 116, - 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, - 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, - 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, - 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, - 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, - 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, - 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, - 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, - 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, - 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, - }; - v--; - m--; - if ((m << 1) <= MAX_PROB) - i = recenter_nonneg(v, m) - 1; - else - i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; - - i = map_table[i]; - return i; -} - -static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) { - int delp = remap_prob(newp, oldp); - return update_bits[delp] << AV1_PROB_COST_SHIFT; -} - -static void encode_uniform(aom_writer *w, int v) { - const int l = 8; - const int m = (1 << l) - 190; - if (v < m) { - aom_write_literal(w, v, l - 1); - } else { - aom_write_literal(w, m + ((v - m) >> 1), l - 1); - aom_write_literal(w, (v - m) & 1, 1); - } -} - -static INLINE int write_bit_gte(aom_writer *w, int word, int test) { - aom_write_literal(w, word >= test, 1); - return word >= test; -} - -static void encode_term_subexp(aom_writer *w, int word) { - if (!write_bit_gte(w, word, 16)) { - aom_write_literal(w, word, 4); - } else if (!write_bit_gte(w, word, 32)) { - aom_write_literal(w, word - 16, 4); - } else if (!write_bit_gte(w, word, 64)) { - aom_write_literal(w, word - 32, 5); - } else { - encode_uniform(w, word - 64); - } -} - -void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) { - const int delp = remap_prob(newp, oldp); - 
encode_term_subexp(w, delp); -} - -int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int probwt) { - const uint32_t old_b = cost_branch256(ct, oldp); - int bestsavings = 0; - aom_prob newp, bestnewp = oldp; - const int step = *bestp > oldp ? -1 : 1; - const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); - - if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) { - for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = cost_branch256(ct, newp); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = (int)((int64_t)old_b - new_b - update_b * probwt); - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - } - *bestp = bestnewp; - return bestsavings; -} - -void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, - const unsigned int ct[2], int probwt) { - const aom_prob upd = DIFF_UPDATE_PROB; - aom_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = - av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); - assert(newp >= 1); - if (savings > 0) { - aom_write(w, 1, upd); - av1_write_prob_diff_update(w, newp, *oldp); - *oldp = newp; - } else { - aom_write(w, 0, upd); - } -} - -int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], - int probwt) { - const aom_prob upd = DIFF_UPDATE_PROB; - aom_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = - av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); - return savings; -} diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h deleted file mode 100644 index 580edabdb..000000000 --- a/third_party/aom/av1/encoder/subexp.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_SUBEXP_H_ -#define AV1_ENCODER_SUBEXP_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "aom_dsp/bitwriter.h" -#include "aom_dsp/prob.h" - -void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm); - -void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, - const unsigned int ct[2], int probwt); - -int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int probwt); - -int av1_prob_diff_update_savings_search_model(const unsigned int *ct, - const aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int stepsize, int probwt); - -int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], - int probwt); -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_SUBEXP_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index daa647689..250feab81 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/alloccommon.h" #include "av1/common/onyxc_int.h" #include "av1/common/quant_common.h" @@ -35,26 +36,17 @@ static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct 
scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, + int can_use_previous) { const int which_mv = 0; const MV mv = { mv_row, mv_col }; enum mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly - ConvolveParams conv_params = get_conv_params(which_mv, which_mv, 0); - -#if USE_TEMPORALFILTER_12TAP - const InterpFilters interp_filters = - av1_broadcast_interp_filter(TEMPORALFILTER_12TAP); - (void)xd; -#else - const InterpFilters interp_filters = xd->mi[0]->mbmi.interp_filters; -#endif // USE_TEMPORALFILTER_12TAP - -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd); + const InterpFilters interp_filters = xd->mi[0]->interp_filters; WarpTypesAllowed warp_types; memset(&warp_types, 0, sizeof(WarpTypesAllowed)); -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION if (uv_block_width == 8) { uv_stride = (stride + 1) >> 1; @@ -64,55 +56,36 @@ static void temporal_filter_predictors_mb_c( mv_precision_uv = MV_PRECISION_Q3; } -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 0, MV_PRECISION_Q3, x, y, xd); - - av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 1, mv_precision_uv, x, y, xd); - - av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif 
// CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 2, mv_precision_uv, x, y, xd); + &warp_types, x, y, 0, MV_PRECISION_Q3, x, + y, xd, can_use_previous); + + av1_highbd_build_inter_predictor( + u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, + x, y, 1, mv_precision_uv, x, y, xd, can_use_previous); + + av1_highbd_build_inter_predictor( + v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, + x, y, 2, mv_precision_uv, x, y, xd, can_use_previous); return; } -#endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 0, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MV_PRECISION_Q3, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 0, + 0, MV_PRECISION_Q3, x, y, xd, can_use_previous); av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 1, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - mv_precision_uv, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 1, + 0, mv_precision_uv, x, y, xd, can_use_previous); av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 2, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - mv_precision_uv, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 2, + 0, mv_precision_uv, x, y, xd, can_use_previous); } void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, @@ -176,7 +149,6 @@ void 
av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, } } -#if CONFIG_HIGHBITDEPTH void av1_highbd_temporal_filter_apply_c( uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, @@ -238,7 +210,6 @@ void av1_highbd_temporal_filter_apply_c( byte += stride - block_width; } } -#endif // CONFIG_HIGHBITDEPTH static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, uint8_t *arf_frame_buf, @@ -255,7 +226,7 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, int cost_list[5]; MvLimits tmp_mv_limits = x->mv_limits; - MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1 = kZeroMv; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ // Save input state @@ -276,8 +247,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - x->mvcost = x->mv_cost_stack[0]; - x->nmvjointcost = x->nmv_vec_cost[0]; + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; // Use mv costing from x->mvcost directly av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, @@ -286,9 +257,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, x->mv_limits = tmp_mv_limits; -// Ignore mv costing by sending NULL pointer instead of cost array -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level == 1) { + // Ignore mv costing by sending NULL pointer instead of cost array + if (cpi->common.cur_frame_force_integer_mv == 1) { const uint8_t *const src_address = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; const uint8_t *const y = xd->plane[0].pre[0].buf; @@ -301,17 +271,15 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address, src_stride, &sse); } else { -#endif bestsme = cpi->find_fractional_mv_step( - x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + x, &cpi->common, 0, 0, 
&best_ref_mv1, + cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); -#if CONFIG_AMVR } -#endif - x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv; + x->e_mbd.mi[0]->mv[0] = x->best_mv; // Restore input state x->plane[0].src = src; @@ -321,13 +289,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, } static void temporal_filter_iterate_c(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE YV12_BUFFER_CONFIG **frames, int frame_count, int alt_ref_index, int strength, struct scale_factors *scale) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); int byte; int frame; int mb_col, mb_row; @@ -341,28 +308,22 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]); + DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]); uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); -#endif const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; // Save input state uint8_t *input_buffer[MAX_MB_PLANE]; int i; -#if CONFIG_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); } else { predictor = predictor8; } -#endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; for (mb_row = 0; mb_row < mb_rows; mb_row++) { // Source frames are extended to 16 pixels. 
This is different than @@ -399,8 +360,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, if (frames[frame] == NULL) continue; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; + mbd->mi[0]->mv[0].as_mv.row = 0; + mbd->mi[0]->mv[0].as_mv.col = 0; + mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; if (frame == alt_ref_index) { filter_weight = 2; @@ -422,60 +384,51 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, mbd, frames[frame]->y_buffer + mb_y_offset, frames[frame]->u_buffer + mb_uv_offset, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row, + mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16, + mb_row * 16, cm->allow_warped_motion); -// Apply the filter (YUV) -#if CONFIG_HIGHBITDEPTH + // Apply the filter (YUV) if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); av1_highbd_temporal_filter_apply( f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, adj_strength, filter_weight, accumulator, count); - av1_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - av1_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); + if (num_planes > 1) { + av1_highbd_temporal_filter_apply( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + av1_highbd_temporal_filter_apply( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, 
filter_weight, + accumulator + 512, count + 512); + } } else { -#endif // CONFIG_HIGHBITDEPTH av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - av1_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - av1_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); -#if CONFIG_HIGHBITDEPTH + if (num_planes > 1) { + av1_temporal_filter_apply_c( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 256, count + 256); + av1_temporal_filter_apply_c( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 512, count + 512); + } } -#endif // CONFIG_HIGHBITDEPTH } } -// Normalize filter output to produce AltRef frame -#if CONFIG_HIGHBITDEPTH + // Normalize filter output to produce AltRef frame if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *dst1_16; uint16_t *dst2_16; -#if CONFIG_BGSPRITE - dst1 = target->y_buffer; -#else dst1 = cpi->alt_ref_buffer.y_buffer; -#endif // CONFIG_BGSPRITE dst1_16 = CONVERT_TO_SHORTPTR(dst1); -#if CONFIG_BGSPRITE - stride = target->y_stride; -#else stride = cpi->alt_ref_buffer.y_stride; -#endif // CONFIG_BGSPRITE byte = mb_y_offset; for (i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { @@ -488,40 +441,31 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, byte += stride - 16; } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - 
for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - dst1_16[byte] = - (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // V - dst2_16[byte] = - (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); - - // move to next pixel - byte++; + if (num_planes > 1) { + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + // U + dst1_16[byte] = + (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + // V + dst2_16[byte] = + (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; } - - byte += stride - mb_uv_width; } } else { -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_BGSPRITE - dst1 = target->y_buffer; - stride = target->y_stride; -#else - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; -#endif // CONFIG_BGSPRITE + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; for (i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { @@ -533,36 +477,27 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } byte += stride - 16; } -#if CONFIG_BGSPRITE - dst1 = target->u_buffer; - dst2 = target->v_buffer; - stride = target->uv_stride; -#else - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; -#endif // CONFIG_BGSPRITE - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - dst1[byte] = - (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // V - dst2[byte] = - (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); - - // move 
to next pixel - byte++; + if (num_planes > 1) { + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + // U + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + // V + dst2[byte] = + (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; } - byte += stride - mb_uv_width; } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH mb_y_offset += 16; mb_uv_offset += mb_uv_width; } @@ -571,7 +506,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; + for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; } // Apply buffer limits and context specific adjustments to arnr filter. @@ -633,11 +568,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, *arnr_strength = strength; } -void av1_temporal_filter(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE - int distance) { +void av1_temporal_filter(AV1_COMP *cpi, int distance) { RATE_CONTROL *const rc = &cpi->rc; int frame; int frames_to_blur; @@ -647,17 +578,14 @@ void av1_temporal_filter(AV1_COMP *cpi, int frames_to_blur_forward; struct scale_factors sf; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; -#if CONFIG_EXT_REFS const GF_GROUP *const gf_group = &cpi->twopass.gf_group; -#endif // CONFIG_EXT_REFS // Apply context specific adjustments to the arnr filter parameters. adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); -// TODO(weitinglin): Currently, we enforce the filtering strength on -// extra ARFs' to be zeros. 
We should investigate in which -// case it is more beneficial to use non-zero strength -// filtering. -#if CONFIG_EXT_REFS + // TODO(weitinglin): Currently, we enforce the filtering strength on + // extra ARFs' to be zeros. We should investigate in which + // case it is more beneficial to use non-zero strength + // filtering. if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { strength = 0; frames_to_blur = 1; @@ -685,7 +613,7 @@ void av1_temporal_filter(AV1_COMP *cpi, cpi->is_arf_filter_off[which_arf] = 1; else cpi->is_arf_filter_off[which_arf] = 0; -#endif // CONFIG_EXT_REFS + cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf]; frames_to_blur_backward = (frames_to_blur / 2); frames_to_blur_forward = ((frames_to_blur - 1) / 2); @@ -694,40 +622,20 @@ void av1_temporal_filter(AV1_COMP *cpi, // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; -#if CONFIG_BGSPRITE - if (frame == frames_to_blur_backward && bg != NULL) { - // Insert bg into frames at ARF index. - frames[frames_to_blur - 1 - frame] = bg; - } else { -#endif // CONFIG_BGSPRITE - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, which_buffer); - frames[frames_to_blur - 1 - frame] = &buf->img; -#if CONFIG_BGSPRITE - } -#endif // CONFIG_BGSPRITE + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; } if (frames_to_blur > 0) { -// Setup scaling factors. Scaling on each of the arnr frames is not -// supported. -// ARF is produced at the native frame size and resized when coded. -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, - frames[0]->y_crop_width, frames[0]->y_crop_height, - cpi->common.use_highbitdepth); -#else + // Setup scaling factors. Scaling on each of the arnr frames is not + // supported. 
+ // ARF is produced at the native frame size and resized when coded. av1_setup_scale_factors_for_frame( &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); -#endif // CONFIG_HIGHBITDEPTH } - temporal_filter_iterate_c(cpi, -#if CONFIG_BGSPRITE - target, -#endif // CONFIG_BGSPRITE - frames, frames_to_blur, frames_to_blur_backward, - strength, &sf); + temporal_filter_iterate_c(cpi, frames, frames_to_blur, + frames_to_blur_backward, strength, &sf); } diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h index 7dd9fad58..bc0863a63 100644 --- a/third_party/aom/av1/encoder/temporal_filter.h +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -16,11 +16,7 @@ extern "C" { #endif -void av1_temporal_filter(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE - int distance); +void av1_temporal_filter(AV1_COMP *cpi, int distance); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c index a2e24d66b..16a6a9a35 100644 --- a/third_party/aom/av1/encoder/tokenize.c +++ b/third_party/aom/av1/encoder/tokenize.c @@ -23,314 +23,13 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP -#include "av1/encoder/encodetxb.c" -#endif +#include "av1/encoder/encodetxb.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" -static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { - { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 }, - { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 }, - { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 }, - { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 }, - { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 }, - { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 
}, { 8, 17 }, { 8, 15 }, { 8, 13 }, - { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 }, - { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 }, - { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 }, - { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, - { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 }, - { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 }, - { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 }, - { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 }, - { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 }, - { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 }, - { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 }, - { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 }, - { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 } -}; -const TOKENVALUE *av1_dct_cat_lt_10_value_tokens = - dct_cat_lt_10_value_tokens + - (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) / - 2; -// The corresponding costs of the extrabits for the tokens in the above table -// are stored in the table below. The values are obtained from looking up the -// entry for the specified extrabits in the table corresponding to the token -// (as defined in cost element av1_extra_bits) -// e.g. 
{9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1] -static const int dct_cat_lt_10_value_cost[] = { - 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282, - 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772, - 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742, - 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195, - 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512, - 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652, - 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476, - 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622, - 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136, - 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681, - 3704, 3750, 3773, -}; -const int *av1_dct_cat_lt_10_value_cost = - dct_cat_lt_10_value_cost + - (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2; - -// Array indices are identical to previously-existing CONTEXT_NODE indices -/* clang-format off */ -const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { - -EOB_TOKEN, 2, // 0 = EOB - -ZERO_TOKEN, 4, // 1 = ZERO - -ONE_TOKEN, 6, // 2 = ONE - 8, 12, // 3 = LOW_VAL - -TWO_TOKEN, 10, // 4 = TWO - -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE - 14, 16, // 6 = HIGH_LOW - -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE - 18, 20, // 8 = CAT_THREEFOUR - -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE - -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE -}; -/* clang-format on */ - -static const int16_t zero_cost[] = { 0 }; -static const int16_t sign_cost[1] = { 512 }; -static const int16_t cat1_cost[1 << 1] = { 864, 1229 }; -static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 }; -static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023, - 2195, 2334, 2427, 2566 }; -static 
const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476, - 2534, 2615, 2661, 2742, 2800, 2881, - 2977, 3058, 3116, 3197 }; -static const int16_t cat5_cost[1 << 5] = { - 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, - 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363, - 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773 -}; -const int16_t av1_cat6_low_cost[256] = { - 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574, - 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822, - 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053, - 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301, - 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253, - 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461, - 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708, - 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, - 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, - 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, - 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, - 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455, - 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722, - 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945, - 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886, - 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094, - 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352, - 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609, - 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, - 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 
-}; -const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = { - 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, - 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, - 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, - 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, - 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955, - 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, -#if CONFIG_HIGHBITDEPTH - 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, - 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, - 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, - 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, - 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, - 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356, - 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, - 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, - 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, - 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, - 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, - 17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, - 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, - 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, - 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, - 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, - 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, - 23822, 22080, 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416, - 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771, - 9934, 10410, 12573, 
10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, - 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470, - 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, - 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, - 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, - 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, - 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, - 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, - 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, - 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, - 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, - 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, - 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, - 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, - 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, - 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, - 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, - 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, - 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832, - 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, - 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, - 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, - 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 
16572, 17048, - 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, - 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, - 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, - 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, - 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, - 22080, 24243, 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, - 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, - 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, - 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, - 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, - 24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, - 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, - 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, - 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, - 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, - 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, - 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, - 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, - 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, - 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, - 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, - 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, - 15018, 
17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, - 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, - 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, - 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, - 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, - 25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, - 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, - 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, - 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, - 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, - 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, - 27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, - 22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, - 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, - 26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, - 27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, - 26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, - 32429, 32905, 35068 -#endif -}; - -const uint8_t av1_cat6_skipped_bits_discount[8] = { - 0, 3, 6, 9, 12, 18, 24, 30 -}; - -#if CONFIG_NEW_MULTISYMBOL -const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { - { 0, 0, 0, zero_cost }, // ZERO_TOKEN - { 0, 0, 1, sign_cost }, // ONE_TOKEN - { 0, 0, 2, sign_cost }, // TWO_TOKEN - { 0, 0, 3, sign_cost }, // THREE_TOKEN - { 0, 0, 4, sign_cost }, // FOUR_TOKEN - { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN - { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN - { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN - { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost }, // 
CATEGORY4_TOKEN - { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN - { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN - { 0, 0, 0, zero_cost } // EOB_TOKEN -}; -#else -const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { - { 0, 0, 0, zero_cost }, // ZERO_TOKEN - { 0, 0, 1, sign_cost }, // ONE_TOKEN - { 0, 0, 2, sign_cost }, // TWO_TOKEN - { 0, 0, 3, sign_cost }, // THREE_TOKEN - { 0, 0, 4, sign_cost }, // FOUR_TOKEN - { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN - { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN - { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN - { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN - { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN - { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN - { 0, 0, 0, zero_cost } // EOB_TOKEN -}; -#endif - -#if !CONFIG_PVQ || CONFIG_VAR_TX -static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - const AV1_COMP *const cpi = args->cpi; - const AV1_COMMON *cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - const PLANE_TYPE type = pd->plane_type; - const TX_TYPE tx_type = - av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int rate = av1_cost_coeffs( - cpi, x, plane, blk_row, blk_col, block, tx_size, scan_order, - pd->above_context + blk_col, pd->left_context + blk_row, 0); - args->this_rate += rate; - (void)plane_bsize; - av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, - blk_row); -} - -static void set_entropy_context_b(int plane, int block, 
int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - (void)plane_bsize; - av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, - blk_row); -} - -static INLINE void add_token(TOKENEXTRA **t, - aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)], - aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)], - int eob_val, int first_val, int32_t extra, - uint8_t token) { - (*t)->token = token; - (*t)->extra = extra; - (*t)->tail_cdf = tail_cdf; - (*t)->head_cdf = head_cdf; - (*t)->eob_val = eob_val; - (*t)->first_val = first_val; - (*t)++; - - if (token == BLOCK_Z_TOKEN) { - update_cdf(*head_cdf, 0, HEAD_TOKENS + 1); - } else { - if (eob_val != LAST_EOB) { - const int symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + first_val; - update_cdf(*head_cdf, symb, HEAD_TOKENS + first_val); - } - if (token > ONE_TOKEN) - update_cdf(*tail_cdf, token - TWO_TOKEN, TAIL_TOKENS); - } -} -#endif // !CONFIG_PVQ || CONFIG_VAR_TX - static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, - int calc_rate) { + int plane, int calc_rate, int allow_update_cdf, + FRAME_COUNTS *counts) { const uint8_t *const color_map = param->color_map; MapCdf map_cdf = param->map_cdf; ColorCost color_cost = param->color_cost; @@ -338,28 +37,37 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, const int rows = param->rows; const int cols = param->cols; const int n = param->n_colors; - + const int palette_size_idx = n - PALETTE_MIN_SIZE; int this_rate = 0; uint8_t color_order[PALETTE_MAX_SIZE]; -#if CONFIG_PALETTE_THROUGHPUT + + (void)plane; + (void)counts; + for (int k = 1; k < rows + cols - 1; ++k) { for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { int i = k 
- j; -#else - for (int i = 0; i < rows; ++i) { - for (int j = (i == 0 ? 1 : 0); j < cols; ++j) { -#endif // CONFIG_PALETTE_THROUGHPUT int color_new_idx; const int color_ctx = av1_get_palette_color_index_context( color_map, plane_block_width, i, j, n, color_order, &color_new_idx); assert(color_new_idx >= 0 && color_new_idx < n); if (calc_rate) { - this_rate += - (*color_cost)[n - PALETTE_MIN_SIZE][color_ctx][color_new_idx]; + this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx]; } else { (*t)->token = color_new_idx; - (*t)->color_map_cdf = map_cdf[n - PALETTE_MIN_SIZE][color_ctx]; + (*t)->color_map_cdf = map_cdf[palette_size_idx][color_ctx]; ++(*t); + if (allow_update_cdf) + update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n); +#if CONFIG_ENTROPY_STATS + if (plane) { + ++counts->palette_uv_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } else { + ++counts->palette_y_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } +#endif } } } @@ -370,7 +78,7 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, static void get_palette_params(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, Av1ColorMapParam *params) { const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; params->color_map = xd->plane[plane].color_index_map; params->map_cdf = plane ? 
xd->tile_ctx->palette_uv_color_index_cdf @@ -382,263 +90,62 @@ static void get_palette_params(const MACROBLOCK *const x, int plane, ¶ms->rows, ¶ms->cols); } -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK -static void get_mrc_params(const MACROBLOCK *const x, int block, - TX_SIZE tx_size, Av1ColorMapParam *params) { - memset(params, 0, sizeof(*params)); - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); - params->color_map = BLOCK_OFFSET(xd->mrc_mask, block); - params->map_cdf = is_inter ? xd->tile_ctx->mrc_mask_inter_cdf - : xd->tile_ctx->mrc_mask_intra_cdf; - params->color_cost = - is_inter ? &x->mrc_mask_inter_cost : &x->mrc_mask_intra_cost; - params->n_colors = 2; - params->plane_width = tx_size_wide[tx_size]; - params->rows = tx_size_high[tx_size]; - params->cols = tx_size_wide[tx_size]; -} -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - static void get_color_map_params(const MACROBLOCK *const x, int plane, - int block, BLOCK_SIZE bsize, TX_SIZE tx_size, + BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type, Av1ColorMapParam *params) { - (void)block; (void)tx_size; memset(params, 0, sizeof(*params)); switch (type) { case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break; -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - case MRC_MAP: get_mrc_params(x, block, tx_size, params); break; -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK default: assert(0 && "Invalid color map type"); return; } } -int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block, - BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type) { +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type) { assert(plane == 0 || plane == 1); Av1ColorMapParam color_map_params; - get_color_map_params(x, plane, block, bsize, tx_size, type, - &color_map_params); - return cost_and_tokenize_map(&color_map_params, NULL, 1); + 
get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL); } -void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block, +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size, - COLOR_MAP_TYPE type) { + COLOR_MAP_TYPE type, int allow_update_cdf, + FRAME_COUNTS *counts) { assert(plane == 0 || plane == 1); -#if CONFIG_MRC_TX - if (type == MRC_MAP) { - const int is_inter = is_inter_block(&x->e_mbd.mi[0]->mbmi); - if ((is_inter && !SIGNAL_MRC_MASK_INTER) || - (!is_inter && !SIGNAL_MRC_MASK_INTRA)) - return; - } -#endif // CONFIG_MRC_TX Av1ColorMapParam color_map_params; - get_color_map_params(x, plane, block, bsize, tx_size, type, - &color_map_params); + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); // The first color index does not use context or entropy. (*t)->token = color_map_params.color_map[0]; (*t)->color_map_cdf = NULL; ++(*t); - cost_and_tokenize_map(&color_map_params, t, 0); -} - -#if CONFIG_PVQ -static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x, - PVQ_INFO *pvq) { - PVQ_QUEUE *q = x->pvq_q; - if (q->curr_pos >= q->buf_len) { - int new_buf_len = 2 * q->buf_len + 1; - PVQ_INFO *new_buf; - CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO))); - memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO)); - aom_free(q->buf); - q->buf = new_buf; - q->buf_len = new_buf_len; - } - OD_COPY(q->buf + q->curr_pos, pvq, 1); - ++q->curr_pos; -} - -// NOTE: This does not actually generate tokens, instead we store the encoding -// decisions made for PVQ in a queue that we will read from when -// actually writing the bitstream in write_modes_b -static void tokenize_pvq(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - const AV1_COMP *cpi = 
args->cpi; - const AV1_COMMON *const cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - PVQ_INFO *pvq_info; - - (void)block; - (void)blk_row; - (void)blk_col; - (void)plane_bsize; - (void)tx_size; - - assert(block < MAX_PVQ_BLOCKS_IN_SB); - pvq_info = &x->pvq[block][plane]; - add_pvq_block((AV1_COMMON * const) cm, x, pvq_info); -} -#endif // CONFIG_PVQ - -static void tokenize_b(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { -#if !CONFIG_PVQ - struct tokenize_b_args *const args = arg; - const AV1_COMP *cpi = args->cpi; - const AV1_COMMON *const cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - TOKENEXTRA **tp = args->tp; - uint8_t token_cache[MAX_TX_SQUARE]; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int pt; /* near block/prev token context index */ - int c; - TOKENEXTRA *t = *tp; /* store tokens starting here */ - const int eob = p->eobs[block]; - const PLANE_TYPE type = pd->plane_type; - const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); -#if CONFIG_SUPERTX - const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx); -#else - const int segment_id = mbmi->segment_id; -#endif // CONFIG_SUEPRTX - const int16_t *scan, *nb; - const TX_TYPE tx_type = - av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int ref = is_inter_block(mbmi); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - aom_cdf_prob( - *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref]; - aom_cdf_prob( - *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref]; - int eob_val; - int 
first_val = 1; - const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); - const uint8_t *const band = get_band_translate(tx_size); - int16_t token; - EXTRABIT extra; - (void)plane_bsize; - pt = get_entropy_context(tx_size, pd->above_context + blk_col, - pd->left_context + blk_row); - scan = scan_order->scan; - nb = scan_order->neighbors; - c = 0; - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - if (tx_type == MRC_DCT) - av1_tokenize_color_map(x, plane, block, &t, plane_bsize, tx_size, MRC_MAP); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - if (eob == 0) - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1, - 1, 0, BLOCK_Z_TOKEN); - - while (c < eob) { - int v = qcoeff[scan[c]]; - first_val = (c == 0); - - if (!v) { - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], - 0, first_val, 0, ZERO_TOKEN); - token_cache[scan[c]] = 0; - } else { - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - av1_get_token_extra(v, &token, &extra); - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], - eob_val, first_val, extra, (uint8_t)token); - token_cache[scan[c]] = av1_pt_energy_class[token]; - } - ++c; - pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1)); - } - -#if CONFIG_COEF_INTERLEAVE - t->token = EOSB_TOKEN; - t++; -#endif - - *tp = t; - -#if CONFIG_ADAPT_SCAN - // Since dqcoeff is not available here, we pass qcoeff into - // av1_update_scan_count_facade(). The update behavior should be the same - // because av1_update_scan_count_facade() only cares if coefficients are zero - // or not. 
- av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, - qcoeff, c); -#endif - - av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row); -#else // !CONFIG_PVQ - tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#endif // !CONFIG_PVQ -} - -struct is_skippable_args { - uint16_t *eobs; - int *skippable; -}; -static void is_skippable(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { - struct is_skippable_args *args = argv; - (void)plane; - (void)plane_bsize; - (void)tx_size; - (void)blk_row; - (void)blk_col; - args->skippable[0] &= (!args->eobs[block]); + cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf, + counts); } -// TODO(yaowu): rewrite and optimize this function to remove the usage of -// av1_foreach_transform_block() and simplify is_skippable(). -int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { - int result = 1; - struct is_skippable_args args = { x->plane[plane].eobs, &result }; - av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, - &args); - return result; -} - -#if CONFIG_VAR_TX void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, int plane, void *arg) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - TX_SIZE plane_tx_size; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - 
plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; - if (tx_size == plane_tx_size) { - plane_bsize = get_plane_block_size(mbmi->sb_type, pd); -#if CONFIG_LV_MAP + if (tx_size == plane_tx_size || plane) { + plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); if (!dry_run) { av1_update_and_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); @@ -649,120 +156,71 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); assert(0); } -#else - if (!dry_run) - tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); - else if (dry_run == DRY_RUN_NORMAL) - set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize, - tx_size, arg); - else if (dry_run == DRY_RUN_COSTCOEFFS) - cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#endif } else { -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; -#else // Half the block size in transform block unit. const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; -#endif - const int bsl = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsw * bsh; - assert(bsl > 0); + assert(bsw > 0 && bsh > 0); - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + ((i >> 1) * bsl); - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + ((i & 0x01) * bsl); -#else - const int offsetr = blk_row + ((i >> 1) * bsl); - const int offsetc = blk_col + ((i & 0x01) * bsl); -#endif + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; - int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc, - block, plane, arg); - block += step; + tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc, + block, plane, arg); + block += step; + } } } } void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *rate) { + BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if CONFIG_LV_MAP + MB_MODE_INFO *const mbmi = xd->mi[0]; (void)t; -#else - TOKENEXTRA *t_backup = *t; -#endif - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - int plane; + struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf }; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); -#if !CONFIG_LV_MAP - if (dry_run) *t = t_backup; -#endif + av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); return; } - if (!dry_run) 
td->counts->skip[ctx][0] += skip_inc; -#if !CONFIG_LV_MAP - else - *t = t_backup; -#endif - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 + for (int plane = 0; plane < num_planes; ++plane) { if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - if (!dry_run) { - (*t)->token = EOSB_TOKEN; - (*t)++; - } -#endif continue; } -#endif const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + int bh = block_size_high[txb_size] >> tx_size_high_log2[0]; int idx, idy; int block = 0; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -785,144 +243,6 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, 
ThreadData *td, TOKENEXTRA **t, } } } -#if !CONFIG_LV_MAP - if (!dry_run) { - (*t)->token = EOSB_TOKEN; - (*t)++; - } -#endif - } - if (rate) *rate += arg.this_rate; -} -#endif // CONFIG_VAR_TX - -void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - const int mi_row, const int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - return; - } - - if (!dry_run) { -#if CONFIG_COEF_INTERLEAVE - td->counts->skip[ctx][0] += skip_inc; - av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg); -#else - int plane; - - td->counts->skip[ctx][0] += skip_inc; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) { -#if !CONFIG_PVQ - (*t)->token = EOSB_TOKEN; - (*t)++; -#endif - continue; - } -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, - &arg); -#if !CONFIG_PVQ - (*t)->token = EOSB_TOKEN; - (*t)++; -#endif // !CONFIG_PVQ - } -#endif - } -#if !CONFIG_PVQ - else if (dry_run == DRY_RUN_NORMAL) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, - set_entropy_context_b, &arg); - } - } else if (dry_run == 
DRY_RUN_COSTCOEFFS) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, - &arg); - } - } -#endif // !CONFIG_PVQ - - if (rate) *rate += arg.this_rate; -} - -#if CONFIG_SUPERTX -void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &td->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TOKENEXTRA *t_backup = *t; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - if (dry_run) *t = t_backup; - return; - } - - if (!dry_run) { - int plane; - td->counts->skip[ctx][0] += skip_inc; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, - &arg); - (*t)->token = EOSB_TOKEN; - (*t)++; - } - } else if (dry_run == DRY_RUN_NORMAL) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) - av1_foreach_transformed_block_in_plane(xd, bsize, plane, - set_entropy_context_b, &arg); - *t = t_backup; - } else if (dry_run == DRY_RUN_COSTCOEFFS) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) - av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, - &arg); } if (rate) *rate += arg.this_rate; } -#endif // CONFIG_SUPERTX diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index 20000e502..de1cbe99c 100644 --- 
a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -13,51 +13,29 @@ #define AV1_ENCODER_TOKENIZE_H_ #include "av1/common/entropy.h" - #include "av1/encoder/block.h" -#include "av1/encoder/treewriter.h" +#include "aom_dsp/bitwriter.h" #ifdef __cplusplus extern "C" { #endif -#define EOSB_TOKEN 127 // Not signalled, encoder only - -#if CONFIG_HIGHBITDEPTH -typedef int32_t EXTRABIT; -#else -typedef int16_t EXTRABIT; -#endif - -typedef struct { - int16_t token; - EXTRABIT extra; -} TOKENVALUE; - typedef struct { - aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; - aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; aom_cdf_prob *color_map_cdf; - int eob_val; - int first_val; - const aom_prob *context_tree; - EXTRABIT extra; + // TODO(yaowu: use packed enum type if appropriate) uint8_t token; } TOKENEXTRA; -extern const aom_tree_index av1_coef_tree[]; -extern const aom_tree_index av1_coef_con_tree[]; - -int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); - struct AV1_COMP; struct ThreadData; +struct FRAME_COUNTS; struct tokenize_b_args { const struct AV1_COMP *cpi; struct ThreadData *td; TOKENEXTRA **tp; int this_rate; + uint8_t allow_update_cdf; }; typedef enum { @@ -69,78 +47,22 @@ typedef enum { // Note in all the tokenize functions rate if non NULL is incremented // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS, // otherwise rate is not incremented. 
-#if CONFIG_VAR_TX void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); -#endif + int mi_col, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf); -int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block, - BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type); +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type); -void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block, +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size, - COLOR_MAP_TYPE type); - -void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, - int *rate, const int mi_row, const int mi_col); -#if CONFIG_SUPERTX -void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); -#endif - -extern const int16_t *av1_dct_value_cost_ptr; -/* TODO: The Token field should be broken out into a separate char array to - * improve cache locality, since it's needed for costing when the rest of the - * fields are not. 
- */ -extern const TOKENVALUE *av1_dct_value_tokens_ptr; -extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens; -extern const int *av1_dct_cat_lt_10_value_cost; -extern const int16_t av1_cat6_low_cost[256]; -#if CONFIG_HIGHBITDEPTH -#define CAT6_HIGH_COST_ENTRIES 1024 -#else -#define CAT6_HIGH_COST_ENTRIES 64 -#endif -extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES]; -extern const uint8_t av1_cat6_skipped_bits_discount[8]; - -static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { - *token = CATEGORY6_TOKEN; - if (v >= CAT6_MIN_VAL) - *extra = 2 * v - 2 * CAT6_MIN_VAL; - else - *extra = -2 * v - 2 * CAT6_MIN_VAL + 1; - return; - } - *token = av1_dct_cat_lt_10_value_tokens[v].token; - *extra = av1_dct_cat_lt_10_value_tokens[v].extra; -} -static INLINE int16_t av1_get_token(int v) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10; - return av1_dct_cat_lt_10_value_tokens[v].token; -} - -static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { - EXTRABIT extrabits; - *token = CATEGORY6_TOKEN; - extrabits = abs(v) - CAT6_MIN_VAL; - return av1_cat6_low_cost[extrabits & 0xff] + - av1_cat6_high_cost[extrabits >> 8] - - av1_cat6_skipped_bits_discount[18 - cat6_bits]; - } - *token = av1_dct_cat_lt_10_value_tokens[v].token; - return av1_dct_cat_lt_10_value_cost[v]; -} + COLOR_MAP_TYPE type, int allow_update_cdf, + struct FRAME_COUNTS *counts); static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { - const int eob_max = tx_size_2d[tx_size]; + const int eob_max = av1_get_max_eob(tx_size); return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; } diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c deleted file mode 100644 index 50be72413..000000000 --- a/third_party/aom/av1/encoder/treewriter.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "av1/encoder/treewriter.h" - -static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree, - int i, int v, int l) { - v += v; - ++l; - - do { - const aom_tree_index j = tree[i++]; - if (j <= 0) { - tokens[-j].value = v; - tokens[-j].len = l; - } else { - tree2tok(tokens, tree, j, v, l); - } - } while (++v & 1); -} - -void av1_tokens_from_tree(struct av1_token *tokens, - const aom_tree_index *tree) { - tree2tok(tokens, tree, 0, 0, 0); -} - -static unsigned int convert_distribution(unsigned int i, aom_tree tree, - unsigned int branch_ct[][2], - const unsigned int num_events[]) { - unsigned int left, right; - - if (tree[i] <= 0) - left = num_events[-tree[i]]; - else - left = convert_distribution(tree[i], tree, branch_ct, num_events); - - if (tree[i + 1] <= 0) - right = num_events[-tree[i + 1]]; - else - right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); - - branch_ct[i >> 1][0] = left; - branch_ct[i >> 1][1] = right; - return left + right; -} - -void av1_tree_probs_from_distribution(aom_tree tree, - unsigned int branch_ct[/* n-1 */][2], - const unsigned int num_events[/* n */]) { - convert_distribution(0, tree, branch_ct, num_events); -} diff --git 
a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h deleted file mode 100644 index 9a4cb86cb..000000000 --- a/third_party/aom/av1/encoder/treewriter.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_TREEWRITER_H_ -#define AV1_ENCODER_TREEWRITER_H_ - -#include "aom_dsp/bitwriter.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void av1_tree_probs_from_distribution(aom_tree tree, - unsigned int branch_ct[/* n - 1 */][2], - const unsigned int num_events[/* n */]); - -struct av1_token { - int value; - int len; -}; - -void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *); - -static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree, - const aom_prob *probs, - const struct av1_token *token) { - aom_write_tree(w, tree, probs, token->value, token->len, 0); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_TREEWRITER_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 000000000..69063b801 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,2086 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_layer0[32] = { + 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f, + -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f, + 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f, + -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f, + 1.33417f, 1.72899f, 1.17983f, 0.10552f, +}; + +static const float av1_tx_type_nn_bias_4x4_layer0[8] = { + 1.96273f, -0.69845f, -0.10999f, -1.11311f, + 1.35101f, 0.43842f, -0.29264f, -1.15376f, +}; + +static const float av1_tx_type_nn_weights_4x4_layer1[32] = { + 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f, + -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f, + 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f, + -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f, + 0.06517f, 0.72885f, 0.24387f, 1.28535f, +}; + +static const float av1_tx_type_nn_bias_4x4_layer1[4] = { + 1.23769f, + 1.40308f, + 0.09871f, + 1.82070f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4 = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x4_layer0, + av1_tx_type_nn_weights_4x4_layer1, + }, + { + av1_tx_type_nn_bias_4x4_layer0, + av1_tx_type_nn_bias_4x4_layer1, + }, +}; 
+/******************************************************************************/ + +// Tx type model for 4x8 block. +static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f, + 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f, + 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f, + -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f, + 0.07575f, 0.34687f, 0.20518f, -0.87850f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 1.14049f, -0.18583f, 1.92114f, -0.72057f, + 1.32715f, 0.96713f, 1.09877f, -0.64345f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f, + -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f, + 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f, + 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f, + 1.01163f, 2.06308f, 1.23331f, -0.15481f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + 2.14443f, + 1.98356f, + 0.74616f, + 2.58795f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1, + }, + { + av1_tx_type_nn_bias_4x8_hor_layer0, + av1_tx_type_nn_bias_4x8_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f, + 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f, + -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f, + -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f, + 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f, + 
0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f, + 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f, + 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f, + 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f, + 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f, + 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f, + 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f, + 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f, + -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f, + 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f, + -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f, + -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f, + 0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f, + 0.73600f, -0.11233f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f, + -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f, + -0.63015f, -0.20008f, 1.24048f, -0.21265f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.79795f, 0.45973f, -0.54188f, -1.05095f, 0.64404f, -0.56470f, -0.57018f, + 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f, + 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f, + 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f, + 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, -0.97669f, -1.06351f, + 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f, + 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f, + -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f, + -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f, + 
0.50359f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + 0.39274f, + 1.27276f, + 0.30322f, + 2.55238f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1, + }, + { + av1_tx_type_nn_bias_4x8_ver_layer0, + av1_tx_type_nn_bias_4x8_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. +static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f, + -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f, + -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f, + -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f, + 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f, + 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f, + -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f, + 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f, + -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f, + 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f, + 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f, + 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f, + 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f, + -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f, + 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f, + -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f, + 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f, + 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f, 
+ 0.25223f, -0.06761f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f, + -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f, + -0.65392f, -0.25881f, 1.67995f, -0.03550f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f, + 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f, + 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f, + 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f, + 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f, + 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f, + 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f, + -0.42115f, -0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f, + -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f, + 0.44232f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + 0.14889f, + 1.74197f, + 0.53696f, + 2.87574f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1, + }, + { + av1_tx_type_nn_bias_8x4_hor_layer0, + av1_tx_type_nn_bias_8x4_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f, + 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f, + 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f, + -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f, + 0.08220f, 0.34338f, 0.20245f, -0.88920f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 1.14995f, -0.16021f, 2.38325f, 
-0.65179f, + 1.09624f, 1.07662f, 0.63837f, -0.64847f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f, + -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f, + 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f, + 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f, + 1.46345f, 1.84714f, 1.06804f, -0.13610f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 2.41028f, + 1.95675f, + 0.82387f, + 2.41923f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1, + }, + { + av1_tx_type_nn_bias_8x4_ver_layer0, + av1_tx_type_nn_bias_8x4_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static const float av1_tx_type_nn_weights_8x8_layer0[128] = { + 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f, + 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f, + -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f, + -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f, + 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f, + 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f, + 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f, + 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f, + -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f, + 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f, + 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f, + -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f, + 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f, + 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f, + 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f, + -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f, + -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f, + 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f, + 1.18941f, 0.10346f, +}; + +static const float av1_tx_type_nn_bias_8x8_layer0[16] = { + 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f, + -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f, + 0.64806f, -0.52520f, 1.38838f, 0.15519f, +}; + +static const float av1_tx_type_nn_weights_8x8_layer1[64] = { + -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f, + -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f, + 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f, + 0.12256f, -1.39594f, -1.02829f, 
0.06134f, 2.25646f, -1.25196f, -2.65317f, + -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f, + -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f, + -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f, + -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f, + -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f, + -1.11974f, +}; + +static const float av1_tx_type_nn_bias_8x8_layer1[4] = { + 0.53574f, + 1.57736f, + -0.13698f, + 2.64613f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x8_layer0, + av1_tx_type_nn_weights_8x8_layer1, + }, + { + av1_tx_type_nn_bias_8x8_layer0, + av1_tx_type_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. +static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f, + 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f, + -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f, + 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f, + 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f, + 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f, + 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f, + -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f, + -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f, + 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f, + 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f, + 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f, + 0.48759f, 0.55672f, 0.62597f, -0.38846f, 
1.72124f, 0.08214f, -0.06650f, + 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f, + 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f, + -0.11344f, -0.79108f, 0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f, + 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f, + 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f, + 1.07698f, 0.02531f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f, + -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f, + 1.46564f, -0.42959f, 1.55720f, 0.18479f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f, + 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f, + 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f, + 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f, + 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f, + 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f, + -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f, + 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f, + -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f, + 1.16772f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 1.25783f, + 1.19452f, + 0.69964f, + 2.41982f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1, + }, + { + av1_tx_type_nn_bias_8x16_hor_layer0, + av1_tx_type_nn_bias_8x16_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.90888f, 0.86305f, 0.81674f, 
0.75352f, 1.07834f, 0.99048f, 0.96355f, + 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f, + 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f, + 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f, + 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f, + 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f, + -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f, + 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 0.54576f, + -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f, + 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f, + 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f, + 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f, + 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f, + -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f, + 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f, + -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f, + 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f, + 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f, + 0.25269f, 0.01562f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f, + -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f, + -0.25956f, -0.29608f, 1.24359f, -0.09167f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f, + 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f, + 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f, + 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f, + -0.42349f, 0.15375f, 0.52132f, -0.52421f, 
0.89586f, -0.73778f, -0.10911f, + 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f, + 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 1.50938f, + 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f, + -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f, + 0.44198f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + 0.68329f, + 1.33555f, + 0.25943f, + 3.23439f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1, + }, + { + av1_tx_type_nn_bias_8x16_ver_layer0, + av1_tx_type_nn_bias_8x16_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. +static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, + 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, + -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, + 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, + 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, + 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, + 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, + 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, + -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, + 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, + 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, + -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, + 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, + 0.26021f, 0.37354f, 0.23593f, 
0.16335f, 1.70681f, 0.31800f, -0.00964f, + 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, + -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, + 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, + 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, + 0.43289f, -0.00362f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, + -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, + -0.14741f, -0.43954f, 1.72137f, -0.21704f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, + 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, + 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, + 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, + -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, + 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, + -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, + 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, + -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, + 0.63304f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 0.71765f, + 1.40400f, + 0.32221f, + 3.07234f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1, + }, + { + av1_tx_type_nn_bias_16x8_hor_layer0, + av1_tx_type_nn_bias_16x8_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f, + 0.17179f, 0.68612f, 
0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f, + -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f, + -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f, + 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f, + 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f, + 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f, + -0.00317f, -0.06403f, 0.95442f, 1.59636f, 0.30602f, -0.05515f, 0.05467f, + -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f, + 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f, + 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f, + 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f, + 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f, + 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f, + 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f, + -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f, + -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f, + 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f, + 0.72124f, 0.12769f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f, + -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f, + -0.55834f, -0.36938f, 1.56759f, 0.07238f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f, + 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f, + 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f, + 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f, + -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f, + 1.05440f, 0.62580f, 
-0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f, + -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f, + 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f, + -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f, + 0.99208f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 0.68774f, + 0.88572f, + 0.77462f, + 3.05667f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1, + }, + { + av1_tx_type_nn_bias_16x8_ver_layer0, + av1_tx_type_nn_bias_16x8_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. +static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, 
-0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x32 block. 
+static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = { + 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, + 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, + -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, + 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, + 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, + 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, + 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, + 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, + -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, + 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, + 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, + -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, + 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, + 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, + 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, + -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, + 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, + 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, + 0.43289f, -0.00362f, +}; + +static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = { + -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, + -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, + -0.14741f, -0.43954f, 1.72137f, -0.21704f, +}; + +static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = { + -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, + 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, + 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, + 0.45571f, 
-0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, + -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, + 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, + -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, + 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, + -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, + 0.63304f, +}; + +static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = { + 0.71765f, + 1.40400f, + 0.32221f, + 3.07234f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x32_hor_layer0, + av1_tx_type_nn_weights_16x32_hor_layer1, + }, + { + av1_tx_type_nn_bias_16x32_hor_layer0, + av1_tx_type_nn_bias_16x32_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = { + -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f, + -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f, + 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f, + 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f, + 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f, + 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f, + 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f, + 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, -0.48812f, 0.72875f, + 0.06585f, 0.18818f, -0.02109f, -0.10996f, 0.00187f, -0.02078f, 0.04484f, + -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f, + 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f, + -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f, + -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f, + -0.67644f, -0.08559f, -0.36382f, 
-0.24515f, -0.01899f, 0.09012f, 0.19723f, + 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f, + 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f, + 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f, + -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f, + 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f, + 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f, + -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f, + 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f, + -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f, + -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f, + -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f, + 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f, + 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f, + -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f, + 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f, + 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f, + -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f, + -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f, + 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f, + -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f, + 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f, + 0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f, + 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f, + -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f, + -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f, + 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f, 
+ 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f, + 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f, + 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f, + -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f, + -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f, + -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f, + 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f, + -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f, + -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f, + -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f, + 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f, + 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f, + -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f, + 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f, + -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, -2.43802f, 0.41834f, + 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f, + -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f, + 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f, + -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f, + 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f, + -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f, + 0.57665f, 0.14171f, 0.54693f, -0.22144f, -0.59664f, 0.13295f, 0.07057f, + -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f, + 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f, + -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f, + 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f, + -0.41329f, 0.02632f, 0.23411f, 
0.25344f, 0.16468f, 0.31007f, 0.21845f, + 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f, + 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f, + 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f, + -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f, + -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f, + -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f, + -0.01752f, +}; + +static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = { + 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f, + -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f, + 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f, + 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f, + -0.11834f, -0.24998f, -0.33976f, 0.35497f, +}; + +static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = { + 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f, + 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f, + 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f, + -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f, + 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f, + -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f, + 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f, + 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f, + 0.14139f, 0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f, + -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f, + -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f, + 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f, + 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f, + 0.12213f, 
0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f, + -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f, + -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f, + 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f, + -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f, + 0.14978f, 0.91325f, +}; + +static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = { + 1.03659f, + 1.80249f, + 1.25710f, + 1.32000f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = { + 16, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x32_ver_layer0, + av1_tx_type_nn_weights_16x32_ver_layer1, + }, + { + av1_tx_type_nn_bias_16x32_ver_layer0, + av1_tx_type_nn_bias_16x32_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 32x16 block. +static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = { + -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f, + -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f, + 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f, + 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f, + 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f, + 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f, + 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f, + -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f, + -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f, + -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f, + 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f, + 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f, + 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f, 
+ 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f, + 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f, + -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f, + 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f, + -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f, + 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f, + -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f, + -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f, + 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f, + -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f, + 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f, + 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f, + 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 0.56870f, + 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f, + 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f, + -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f, + -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f, + 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f, + -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f, + 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f, + 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f, + 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f, + 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f, + 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f, + -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f, + 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f, + 0.04214f, 0.00030f, 0.02410f, 
0.19966f, -0.04246f, 0.00442f, 0.23121f, + 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f, + 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f, + -0.72638f, 0.02774f, 0.48540f, -0.72016f, -0.27721f, 0.31559f, 0.07322f, + 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f, + 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f, + 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f, + 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f, + 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f, + 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f, + 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f, + 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f, + -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f, + 0.17597f, 0.28668f, 0.31193f, -0.43281f, 0.43267f, -0.50495f, 0.01969f, + 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f, + -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f, + 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f, + -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f, + 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f, + -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f, + -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f, + 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f, + 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f, + -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f, + 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f, + -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f, + 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f, + -0.18461f, 
-0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f, + 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f, + 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f, + -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f, + -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f, + -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f, + -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f, + -0.14533f, +}; + +static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = { + 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f, + -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f, + 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f, + 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f, + -0.14988f, -0.67044f, 0.79659f, 0.70610f, +}; + +static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = { + 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f, + -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f, + 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f, + 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f, + -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f, + 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f, + 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f, + -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f, + 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f, + -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 0.10969f, + 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f, + -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f, + 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 
0.01641f, + 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f, + -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f, + -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f, + -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f, + -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f, + 0.89060f, 0.75948f, +}; + +static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = { + 0.95359f, + 1.56043f, + 1.06017f, + 2.54520f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = { + 16, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_32x16_hor_layer0, + av1_tx_type_nn_weights_32x16_hor_layer1, + }, + { + av1_tx_type_nn_bias_32x16_hor_layer0, + av1_tx_type_nn_bias_32x16_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = { + 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f, + 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f, + -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f, + -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f, + 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f, + 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f, + -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f, + 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f, + 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f, + 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f, + -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f, + 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f, + 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f, + -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f, + 
-0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f, + -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f, + -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f, + 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f, + 0.59360f, 0.25468f, +}; + +static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = { + -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f, + -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f, + 0.83852f, -0.25536f, 1.33200f, -0.24444f, +}; + +static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = { + -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f, + 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f, + 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f, + 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f, + -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f, + 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f, + 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f, + 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f, + 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f, + 1.01058f, +}; + +static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = { + 0.83082f, + 2.03845f, + 0.59627f, + 2.31341f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_32x16_ver_layer0, + av1_tx_type_nn_weights_32x16_ver_layer1, + }, + { + av1_tx_type_nn_bias_32x16_ver_layer0, + av1_tx_type_nn_bias_32x16_ver_layer1, + }, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. 
+static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4, // 4x4 + &av1_tx_type_nnconfig_8x8, // 8x8 + &av1_tx_type_nnconfig_16x16, // 16x16 + NULL, // 32x32 + NULL, // 64x64 + &av1_tx_type_nnconfig_4x8_hor, // 4x8 + &av1_tx_type_nnconfig_8x4_hor, // 8x4 + &av1_tx_type_nnconfig_8x16_hor, // 8x16 + &av1_tx_type_nnconfig_16x8_hor, // 16x8 + &av1_tx_type_nnconfig_16x32_hor, // 16x32 + &av1_tx_type_nnconfig_32x16_hor, // 32x16 + NULL, // 32x64 + NULL, // 64x32 + NULL, // 4x16 + NULL, // 16x4 + NULL, // 8x32 + NULL, // 32x8 + NULL, // 16x64 + NULL, // 64x16 +}; + +static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4, // 4x4 transform + &av1_tx_type_nnconfig_8x8, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform + &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + NULL, // 4x16 transform + NULL, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +// Tx split model for 4x8 block. 
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + 
-0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. +static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, 
-0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, 
-0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 
0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + 
-0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // 
num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, 
-0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, 
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 
0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, 
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; 
+/******************************************************************************/ + +// Tx split model for 64x64 block. +static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, 
-0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, 
-0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + 
av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. +static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, 
-1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. +static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, 
-0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float 
av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. 
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 
0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + 
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. +static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, 
-0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; 
+/******************************************************************************/ + +// Tx split model for 16x32 block. +static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = { + -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f, + -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f, + -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f, + 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f, + -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f, + -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f, + -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f, + 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f, + -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f, + -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f, + -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f, + 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f, + -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f, + -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f, + 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f, + 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f, + -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f, + -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f, + -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f, + -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f, + 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f, + 0.101996f, 0.120878f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer0[16] = { + 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f, + -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f, + -0.299866f, -0.103079f, -0.190352f, -0.048121f, +}; + +static const float 
av1_tx_split_nn_weights_16x64_layer1[16] = { + -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f, + 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f, + 0.348337f, -0.205082f, 0.347129f, -0.322277f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer1[1] = { + 0.04230947f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x64_layer0, + av1_tx_split_nn_weights_16x64_layer1, + }, + { + av1_tx_split_nn_bias_16x64_layer0, + av1_tx_split_nn_bias_16x64_layer1, + }, +}; +/******************************************************************************/ + +// Map block size to its corresponding neural net model for tx split prediction. +static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..84065d6de --- /dev/null +++ 
b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1205 @@ +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[0], input[31]); + buf1[31] = _mm_sub_epi32(input[0], input[31]); + buf1[1] = _mm_add_epi32(input[1], input[30]); + buf1[30] = _mm_sub_epi32(input[1], input[30]); + buf1[2] = _mm_add_epi32(input[2], input[29]); + buf1[29] = _mm_sub_epi32(input[2], input[29]); + buf1[3] = _mm_add_epi32(input[3], input[28]); + buf1[28] = _mm_sub_epi32(input[3], input[28]); + buf1[4] = _mm_add_epi32(input[4], input[27]); + buf1[27] = _mm_sub_epi32(input[4], input[27]); + buf1[5] = _mm_add_epi32(input[5], input[26]); + buf1[26] = _mm_sub_epi32(input[5], input[26]); + buf1[6] = _mm_add_epi32(input[6], input[25]); + buf1[25] = _mm_sub_epi32(input[6], input[25]); + buf1[7] = _mm_add_epi32(input[7], input[24]); + buf1[24] = _mm_sub_epi32(input[7], input[24]); + buf1[8] = _mm_add_epi32(input[8], input[23]); + buf1[23] = _mm_sub_epi32(input[8], input[23]); + buf1[9] = _mm_add_epi32(input[9], input[22]); + buf1[22] = _mm_sub_epi32(input[9], input[22]); + buf1[10] = _mm_add_epi32(input[10], input[21]); + buf1[21] = _mm_sub_epi32(input[10], input[21]); + buf1[11] = _mm_add_epi32(input[11], input[20]); + buf1[20] = _mm_sub_epi32(input[11], input[20]); + buf1[12] = _mm_add_epi32(input[12], input[19]); + buf1[19] = _mm_sub_epi32(input[12], input[19]); + buf1[13] = _mm_add_epi32(input[13], input[18]); + buf1[18] = _mm_sub_epi32(input[13], input[18]); + buf1[14] = _mm_add_epi32(input[14], input[17]); + buf1[17] = _mm_sub_epi32(input[14], input[17]); + buf1[15] = _mm_add_epi32(input[15], input[16]); + buf1[16] = _mm_sub_epi32(input[15], input[16]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], 
buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], 
buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + 
// stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], 
buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + 
buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + 
output[30] = buf0[15]; + output[31] = buf0[31]; +} + +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = 
_mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); + 
__m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[0], input[63]); + x1[63] = _mm_sub_epi32(input[0], input[63]); + x1[1] = _mm_add_epi32(input[1], input[62]); + x1[62] = _mm_sub_epi32(input[1], input[62]); + x1[2] = _mm_add_epi32(input[2], input[61]); + x1[61] = _mm_sub_epi32(input[2], input[61]); + x1[3] = _mm_add_epi32(input[3], input[60]); + x1[60] = 
_mm_sub_epi32(input[3], input[60]); + x1[4] = _mm_add_epi32(input[4], input[59]); + x1[59] = _mm_sub_epi32(input[4], input[59]); + x1[5] = _mm_add_epi32(input[5], input[58]); + x1[58] = _mm_sub_epi32(input[5], input[58]); + x1[6] = _mm_add_epi32(input[6], input[57]); + x1[57] = _mm_sub_epi32(input[6], input[57]); + x1[7] = _mm_add_epi32(input[7], input[56]); + x1[56] = _mm_sub_epi32(input[7], input[56]); + x1[8] = _mm_add_epi32(input[8], input[55]); + x1[55] = _mm_sub_epi32(input[8], input[55]); + x1[9] = _mm_add_epi32(input[9], input[54]); + x1[54] = _mm_sub_epi32(input[9], input[54]); + x1[10] = _mm_add_epi32(input[10], input[53]); + x1[53] = _mm_sub_epi32(input[10], input[53]); + x1[11] = _mm_add_epi32(input[11], input[52]); + x1[52] = _mm_sub_epi32(input[11], input[52]); + x1[12] = _mm_add_epi32(input[12], input[51]); + x1[51] = _mm_sub_epi32(input[12], input[51]); + x1[13] = _mm_add_epi32(input[13], input[50]); + x1[50] = _mm_sub_epi32(input[13], input[50]); + x1[14] = _mm_add_epi32(input[14], input[49]); + x1[49] = _mm_sub_epi32(input[14], input[49]); + x1[15] = _mm_add_epi32(input[15], input[48]); + x1[48] = _mm_sub_epi32(input[15], input[48]); + x1[16] = _mm_add_epi32(input[16], input[47]); + x1[47] = _mm_sub_epi32(input[16], input[47]); + x1[17] = _mm_add_epi32(input[17], input[46]); + x1[46] = _mm_sub_epi32(input[17], input[46]); + x1[18] = _mm_add_epi32(input[18], input[45]); + x1[45] = _mm_sub_epi32(input[18], input[45]); + x1[19] = _mm_add_epi32(input[19], input[44]); + x1[44] = _mm_sub_epi32(input[19], input[44]); + x1[20] = _mm_add_epi32(input[20], input[43]); + x1[43] = _mm_sub_epi32(input[20], input[43]); + x1[21] = _mm_add_epi32(input[21], input[42]); + x1[42] = _mm_sub_epi32(input[21], input[42]); + x1[22] = _mm_add_epi32(input[22], input[41]); + x1[41] = _mm_sub_epi32(input[22], input[41]); + x1[23] = _mm_add_epi32(input[23], input[40]); + x1[40] = _mm_sub_epi32(input[23], input[40]); + x1[24] = _mm_add_epi32(input[24], input[39]); + x1[39] = 
_mm_sub_epi32(input[24], input[39]); + x1[25] = _mm_add_epi32(input[25], input[38]); + x1[38] = _mm_sub_epi32(input[25], input[38]); + x1[26] = _mm_add_epi32(input[26], input[37]); + x1[37] = _mm_sub_epi32(input[26], input[37]); + x1[27] = _mm_add_epi32(input[27], input[36]); + x1[36] = _mm_sub_epi32(input[27], input[36]); + x1[28] = _mm_add_epi32(input[28], input[35]); + x1[35] = _mm_sub_epi32(input[28], input[35]); + x1[29] = _mm_add_epi32(input[29], input[34]); + x1[34] = _mm_sub_epi32(input[29], input[34]); + x1[30] = _mm_add_epi32(input[30], input[33]); + x1[33] = _mm_sub_epi32(input[30], input[33]); + x1[31] = _mm_add_epi32(input[31], input[32]); + x1[32] = _mm_sub_epi32(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = _mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = _mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = 
_mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, 
cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = _mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = 
_mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, 
cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], 
x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = _mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = 
_mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], 
x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = _mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = 
x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, 
x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], 
x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], 
__rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + 
output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 000000000..abb95f31e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + __m128i buf0[32]; + __m128i buf1[32]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + int j; + for (j = 0; j < 32; ++j) { + buf0[j] = input[j * col_num + col]; + } + av1_fdct32_new_sse4_1(buf0, buf1, cos_bit); + for (j = 0; j < 32; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
+ assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, + const __m128i *inputB, __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); + __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]); + __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]); + __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, 
temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); + + temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]); + temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]); + temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]); + temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]); + + output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + 
av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = 
output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < (32 / 4); ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static 
FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 000000000..6aae7ce1e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2889 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). + +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, 
__m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + 
x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = 
pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = 
pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); 
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = _mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], 
x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = 
pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], 
input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, 
x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] = _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], 
x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + 
btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = 
_mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] = x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = 
pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 
= pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i cospi_m53_p11 = 
pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = 
_mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = _mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = 
_mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = 
x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = _mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = 
_mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = _mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, 
cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], 
x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + 
x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = _mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + 
btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = _mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] = _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + 
btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + 
x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], 
x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + 
btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = 
pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i 
cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + 
btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + 
u_lo[2] = _mm_unpacklo_epi16(in7, __zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + 
v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, 
input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const 
__m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, 
input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); 
+ + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, 
cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST 
+ fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const 
transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT + 
fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int 
cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + 
row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const 
int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, 
buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; 
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * 
i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_4x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], 
buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + 
const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int 
ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + } + + for (int i = 0; i < 1; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + height); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, height); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, height); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, 
&ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + 
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; 
+ const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8); + } + } +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; 
j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * width * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8); + } + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, 
txfm_param->tx_type, + txfm_param->bd); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 000000000..aa14d3ade --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); + +static INLINE void fidentity4x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a = _mm_unpacklo_epi16(input[i], one); + const __m128i b = scale_round_sse2(a, NewSqrt2); + output[i] = _mm_packs_epi32(b, b); + } +} + +static INLINE void fidentity8x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = 
_mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm_adds_epi16(input[0], input[0]); + output[1] = _mm_adds_epi16(input[1], input[1]); + output[2] = _mm_adds_epi16(input[2], input[2]); + output[3] = _mm_adds_epi16(input[3], input[3]); + output[4] = _mm_adds_epi16(input[4], input[4]); + output[5] = _mm_adds_epi16(input[5], input[5]); + output[6] = _mm_adds_epi16(input[6], input[6]); + output[7] = _mm_adds_epi16(input[7], input[7]); +} + +static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm_slli_epi16(input[i], 2); + } +} + +static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { + fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fdct8x32_new_sse2, // V_DCT + fidentity8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +#ifdef __cplusplus +} +#endif + +#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ diff --git 
a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c index c8d4ccb70..b58911fcb 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -11,7 +11,8 @@ #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -32,7 +33,10 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - round = _mm_srai_epi16(round, log_scale); + if (log_scale) { + const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale)); + round = _mm_mulhrs_epi16(round, round_scale); + } const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); @@ -45,8 +49,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi32(*c); - __m256i q = _mm256_add_epi32(abs, qp[0]); + const __m256i abs_coeff = _mm256_abs_epi32(*c); + __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); @@ -56,6 +60,9 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); + const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); + const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); + q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); @@ -81,8 +88,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void av1_highbd_quantize_fp_avx2( - const tran_low_t 
*coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { @@ -90,14 +97,23 @@ void av1_highbd_quantize_fp_avx2( (void)zbin_ptr; (void)quant_shift_ptr; const unsigned int step = 8; + __m256i qp[3], coeff; - if (LIKELY(!skip_block)) { - __m256i qp[3], coeff; + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; - __m256i eob = _mm256_setzero_si256(); + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -105,39 +121,17 @@ void av1_highbd_quantize_fp_avx2( dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const 
__m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c index 8d717a083..40b3b460b 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -12,8 +12,10 @@ #include #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" // Coefficient quantization phase 1 // param[0-2] : rounding/quan/dequan constants @@ -36,6 +38,8 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); dquan[0] = _mm_srli_epi64(dquan[0], scale); + const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); + qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); } // Coefficient quantization phase 2 @@ -70,7 +74,8 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); dquan[0] = _mm_sign_epi32(dquan[0], *sign); - + qcoeff[0] = 
_mm_andnot_si128(qcoeff[2], qcoeff[0]); + dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); } @@ -108,12 +113,12 @@ static INLINE uint16_t get_accumulated_eob(__m128i *eob) { } void av1_highbd_quantize_fp_sse4_1( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { - __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign; + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; __m128i eob = _mm_setzero_si128(); const tran_low_t *src = coeff_ptr; tran_low_t *quanAddr = qcoeff_ptr; @@ -121,7 +126,6 @@ void av1_highbd_quantize_fp_sse4_1( const int shift = 16 - log_scale; const int coeff_stride = 4; const int quan_stride = coeff_stride; - (void)skip_block; (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; @@ -129,29 +133,54 @@ void av1_highbd_quantize_fp_sse4_1( memset(quanAddr, 0, count * sizeof(quanAddr[0])); memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); - if (!skip_block) { - coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, 
shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; - qparam[0] = - _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale, - round_ptr[1] >> log_scale, round_ptr[0] >> log_scale); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); - - // update round/quan/dquan for AC - qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); - // next 4 AC - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, 
shift, @@ -161,34 +190,6 @@ void av1_highbd_quantize_fp_sse4_1( find_eob(quanAddr, iscan, &eob); count -= 8; - - // loop for the rest of AC - while (count > 0) { - src += coeff_stride << 1; - quanAddr += quan_stride << 1; - dquanAddr += quan_stride << 1; - iscan += quan_stride << 1; - - coeff[0] = _mm_loadu_si128((__m128i const *)src); - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - - quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr, dquanAddr); - - quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr + quan_stride, - dquanAddr + quan_stride); - - find_eob(quanAddr, iscan, &eob); - - count -= 8; - } - *eob_ptr = get_accumulated_eob(&eob); - } else { - *eob_ptr = 0; } + *eob_ptr = get_accumulated_eob(&eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c index 078a67510..df22aaba7 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c @@ -11,7 +11,8 @@ #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -57,7 +58,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, init_one_qp(&round, &qp[0]); init_one_qp(&quant, &qp[1]); - if (log_scale > 0) { + if (log_scale == 1) { qp[1] = _mm256_slli_epi16(qp[1], log_scale); } @@ -94,16 +95,25 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) { } \ } while (0) +static INLINE uint16_t quant_gather_eob(__m256i eob) { + const __m128i eob_lo = _mm256_castsi256_si128(eob); + const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); + __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); 
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); + eob_s = _mm_minpos_epu16(eob_s); + return INT16_MAX - _mm_extract_epi16(eob_s, 0); +} + static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = _mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epi16(q, qp[1]); q = _mm256_sign_epi16(q, *c); const __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -123,8 +133,8 @@ static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, } void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -134,15 +144,26 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 0; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, 
dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -150,54 +171,21 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; - - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; } + *eob_ptr = quant_gather_eob(eob); } static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = 
_mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epu16(q, qp[1]); __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -221,8 +209,8 @@ static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, } void av1_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { @@ -231,15 +219,26 @@ void av1_quantize_fp_32x32_avx2( (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 1; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); - __m256i eob = _mm256_setzero_si256(); + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -247,40 +246,85 @@ void av1_quantize_fp_32x32_avx2( dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) 
{ + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, - &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + __m256i qh = _mm256_mulhi_epi16(q, qp[1]); + __m256i ql = _mm256_mullo_epi16(q, qp[1]); + qh = _mm256_slli_epi16(qh, 2); + ql = _mm256_srli_epi16(ql, 14); + q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2); + __m256i dq = _mm256_or_si256(dqh, dql); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += 
step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 2; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; } + *eob_ptr = quant_gather_eob(eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c index 4f7c09546..b07e7717f 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -12,7 +12,8 @@ #include #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, @@ -67,16 +68,80 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { } } +static INLINE void quantize(const int16_t *iscan_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 
+ const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + const __m128i *thr0, const __m128i *thr1, + __m128i *eob) { + __m128i coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i 
*)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - __m128i thr; - int16_t nzflag; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; @@ -86,167 +151,39 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = 
_mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - 
coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } else { - write_zero(qcoeff_ptr, n_coeffs); - write_zero(dqcoeff_ptr, n_coeffs); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = 
_mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - write_zero(dqcoeff_ptr, n_coeffs); - write_zero(qcoeff_ptr, n_coeffs); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm index dcc697ba3..faa2a232a 100644 --- a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ 
-47,6 +47,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 000000000..0adefecdb --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_TXMF1D_SSE2_H_ +#define AV1_TXMF1D_SSE2_H_ + +#include +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit); +void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit); + +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + 
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +static INLINE void transpose_32_4x4(int stride, const __m128i *input, + __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); + __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); + __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); + __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +// the entire input block can be represent by a grid of 4x4 blocks +// each 4x4 blocks can be represent by 4 vertical __m128i +// we first transpose each 4x4 block internally +// then transpose the grid +static INLINE void transpose_32(int txfm_size, const __m128i *input, + __m128i *output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r, 
c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m128i ww0 = _mm_set1_epi32(w0); \ + const __m128i ww1 = _mm_set1_epi32(w1); \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = av1_round_shift_32_sse4_1(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = av1_round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = _mm_add_epi32(out0, r); \ + out0 = _mm_srai_epi32(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm_add_epi32(out1, r); \ + out1 = _mm_srai_epi32(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AV1_TXMF1D_SSE2_H_ diff --git 
a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c index 179da0d28..381f757da 100644 --- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -5,7 +5,8 @@ #include -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/encoder/corner_match.h" diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c deleted file mode 100644 index e5b19a44c..000000000 --- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c +++ /dev/null @@ -1,3483 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include -#include // SSE2 - -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" - -static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - 
store_output(&out01, (output + 0 * 8)); - store_output(&out23, (output + 1 * 8)); -} - -static INLINE void transpose_4x4(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - 
transpose_4x4(in); -} - -static void fadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4(in); -} - -#if 
CONFIG_EXT_TX -static void fidtx4_sse2(__m128i *in) { - const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); - const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i v0, v1, v2, v3; - __m128i u0, u1, u2, u3; - - v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); - v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); - v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); - v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); - - u0 = _mm_madd_epi16(v0, k__sqrt2_epi16); - u1 = _mm_madd_epi16(v1, k__sqrt2_epi16); - u2 = _mm_madd_epi16(v2, k__sqrt2_epi16); - u3 = _mm_madd_epi16(v3, k__sqrt2_epi16); - - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u2); - in[1] = _mm_packs_epi32(u1, u3); - transpose_4x4(in); -} -#endif // CONFIG_EXT_TX - -void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[4]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - 
load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_ADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case IDTX: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -// load 8x8 array -static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - if (!flipud) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 
stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - in[4] = mm_reverse_epi16(in[4]); - in[5] = mm_reverse_epi16(in[5]); - in[6] = mm_reverse_epi16(in[6]); - in[7] = mm_reverse_epi16(in[7]); - } - - in[0] = _mm_slli_epi16(in[0], 2); - in[1] = _mm_slli_epi16(in[1], 2); - in[2] = _mm_slli_epi16(in[2], 2); - in[3] = _mm_slli_epi16(in[3], 2); - in[4] = _mm_slli_epi16(in[4], 2); - in[5] = _mm_slli_epi16(in[5], 2); - in[6] = _mm_slli_epi16(in[6], 2); - in[7] = _mm_slli_epi16(in[7], 2); -} - -// right shift and rounding -static INLINE void right_shift_8x8(__m128i *res, const int bit) { - __m128i sign0 = _mm_srai_epi16(res[0], 15); - __m128i sign1 = _mm_srai_epi16(res[1], 15); - __m128i sign2 = _mm_srai_epi16(res[2], 15); - __m128i sign3 = _mm_srai_epi16(res[3], 15); - __m128i sign4 = _mm_srai_epi16(res[4], 15); - __m128i sign5 = _mm_srai_epi16(res[5], 15); - __m128i sign6 = _mm_srai_epi16(res[6], 15); - __m128i sign7 = _mm_srai_epi16(res[7], 15); - - 
if (bit == 2) { - const __m128i const_rounding = _mm_set1_epi16(1); - res[0] = _mm_adds_epi16(res[0], const_rounding); - res[1] = _mm_adds_epi16(res[1], const_rounding); - res[2] = _mm_adds_epi16(res[2], const_rounding); - res[3] = _mm_adds_epi16(res[3], const_rounding); - res[4] = _mm_adds_epi16(res[4], const_rounding); - res[5] = _mm_adds_epi16(res[5], const_rounding); - res[6] = _mm_adds_epi16(res[6], const_rounding); - res[7] = _mm_adds_epi16(res[7], const_rounding); - } - - res[0] = _mm_sub_epi16(res[0], sign0); - res[1] = _mm_sub_epi16(res[1], sign1); - res[2] = _mm_sub_epi16(res[2], sign2); - res[3] = _mm_sub_epi16(res[3], sign3); - res[4] = _mm_sub_epi16(res[4], sign4); - res[5] = _mm_sub_epi16(res[5], sign5); - res[6] = _mm_sub_epi16(res[6], sign6); - res[7] = _mm_sub_epi16(res[7], sign7); - - if (bit == 1) { - res[0] = _mm_srai_epi16(res[0], 1); - res[1] = _mm_srai_epi16(res[1], 1); - res[2] = _mm_srai_epi16(res[2], 1); - res[3] = _mm_srai_epi16(res[3], 1); - res[4] = _mm_srai_epi16(res[4], 1); - res[5] = _mm_srai_epi16(res[5], 1); - res[6] = _mm_srai_epi16(res[6], 1); - res[7] = _mm_srai_epi16(res[7], 1); - } else { - res[0] = _mm_srai_epi16(res[0], 2); - res[1] = _mm_srai_epi16(res[1], 2); - res[2] = _mm_srai_epi16(res[2], 2); - res[3] = _mm_srai_epi16(res[3], 2); - res[4] = _mm_srai_epi16(res[4], 2); - res[5] = _mm_srai_epi16(res[5], 2); - res[6] = _mm_srai_epi16(res[6], 2); - res[7] = _mm_srai_epi16(res[7], 2); - } -} - -// write 8x8 array -static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, - int stride) { - store_output(&res[0], (output + 0 * stride)); - store_output(&res[1], (output + 1 * stride)); - store_output(&res[2], (output + 2 * stride)); - store_output(&res[3], (output + 3 * stride)); - store_output(&res[4], (output + 4 * stride)); - store_output(&res[5], (output + 5 * stride)); - store_output(&res[6], (output + 6 * stride)); - store_output(&res[7], (output + 7 * stride)); -} - -// perform in-place transpose -static INLINE 
void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 
56 66 76 - // 07 17 27 37 47 57 67 77 -} - -static void fdct8_sse2(__m128i *in) { - // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - // stage 1 - s0 = _mm_add_epi16(in[0], in[7]); - s1 = _mm_add_epi16(in[1], in[6]); - s2 = _mm_add_epi16(in[2], in[5]); - s3 = _mm_add_epi16(in[3], in[4]); - s4 = _mm_sub_epi16(in[3], in[4]); - s5 = _mm_sub_epi16(in[2], in[5]); - s6 = _mm_sub_epi16(in[1], in[6]); - s7 = _mm_sub_epi16(in[0], in[7]); - - u0 = _mm_add_epi16(s0, s3); - u1 = _mm_add_epi16(s1, s2); - u2 = _mm_sub_epi16(s1, s2); - u3 = _mm_sub_epi16(s0, s3); - // interleave and perform butterfly multiplication/addition - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpackhi_epi16(u0, u1); - v2 = _mm_unpacklo_epi16(u2, u3); - v3 = _mm_unpackhi_epi16(u2, u3); - - u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); - u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); - u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); - u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); - u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); - u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); - u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); - u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); - - // shift and rounding - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, 
k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u1); - in[2] = _mm_packs_epi32(u4, u5); - in[4] = _mm_packs_epi32(u2, u3); - in[6] = _mm_packs_epi32(u6, u7); - - // stage 2 - // interleave and perform butterfly multiplication/addition - u0 = _mm_unpacklo_epi16(s6, s5); - u1 = _mm_unpackhi_epi16(s6, s5); - v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - - u0 = _mm_packs_epi32(v0, v1); - u1 = _mm_packs_epi32(v2, v3); - - // stage 3 - s0 = _mm_add_epi16(s4, u0); - s1 = _mm_sub_epi16(s4, u0); - s2 = _mm_sub_epi16(s7, u1); - s3 = _mm_add_epi16(s7, u1); - - // stage 4 - u0 = _mm_unpacklo_epi16(s0, s3); - u1 = _mm_unpackhi_epi16(s0, s3); - u2 = _mm_unpacklo_epi16(s1, s2); - u3 = _mm_unpackhi_epi16(s1, s2); - - v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); - v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); - v2 = 
_mm_madd_epi16(u2, k__cospi_p12_p20); - v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); - v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); - v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); - v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); - v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v0, v1); - in[3] = _mm_packs_epi32(v4, v5); - in[5] = _mm_packs_epi32(v2, v3); - in[7] = _mm_packs_epi32(v6, v7); - - // transpose - array_transpose_8x8(in, in); -} - -static void fadst8_sse2(__m128i *in) { - // Constants - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i 
k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = 
_mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - v0 = _mm_add_epi32(w0, w4); - v1 = _mm_add_epi32(w1, w5); - v2 = _mm_add_epi32(w2, w6); - v3 = _mm_add_epi32(w3, w7); - v4 = _mm_sub_epi32(w0, w4); - v5 = _mm_sub_epi32(w1, w5); - v6 = _mm_sub_epi32(w2, w6); - v7 = _mm_sub_epi32(w3, w7); - - w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - w7 = 
_mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(w0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(w1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(w2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(w3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(w4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(w5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(w6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(w7, DCT_CONST_BITS); - - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_packs_epi32(v0, v1); - s1 = _mm_packs_epi32(v2, v3); - s2 = _mm_packs_epi32(v4, v5); - s3 = _mm_packs_epi32(v6, v7); - - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, 
DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - // FIXME(jingning): do subtract using bit inversion? 
- in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); - - // transpose - array_transpose_8x8(in, in); -} - -#if CONFIG_EXT_TX -static void fidtx8_sse2(__m128i *in) { - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); - - array_transpose_8x8(in, in); -} -#endif // CONFIG_EXT_TX - -void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 1); - fadst8_sse2(in); - 
fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_ADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case IDTX: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, - __m128i *in1, int stride, int flipud, - int fliplr) { - // Load 4 8x8 blocks - const int16_t *topL = input; - const int16_t *topR = input + 8; - const int16_t *botL = input + 8 * stride; - const int16_t *botR = input + 8 * stride + 8; - - const int16_t *tmp; - 
- if (flipud) { - // Swap left columns - tmp = topL; - topL = botL; - botL = tmp; - // Swap right columns - tmp = topR; - topR = botR; - botR = tmp; - } - - if (fliplr) { - // Swap top rows - tmp = topL; - topL = topR; - topR = tmp; - // Swap bottom rows - tmp = botL; - botL = botR; - botR = tmp; - } - - // load first 8 columns - load_buffer_8x8(topL, in0, stride, flipud, fliplr); - load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr); - - // load second 8 columns - load_buffer_8x8(topR, in1, stride, flipud, fliplr); - load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr); -} - -static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, - __m128i *in1, int stride) { - // write first 8 columns - write_buffer_8x8(output, in0, stride); - write_buffer_8x8(output + 8 * stride, in0 + 8, stride); - // write second 8 columns - output += 8; - write_buffer_8x8(output, in1, stride); - write_buffer_8x8(output + 8 * stride, in1 + 8, stride); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { - // perform rounding operations - right_shift_8x8(res0, 2); - right_shift_8x8(res0 + 8, 2); - right_shift_8x8(res1, 2); - right_shift_8x8(res1 + 8, 2); -} - -static void fdct16_8col(__m128i *in) { - // perform 16x16 1-D DCT for 8 columns - __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i 
k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // stage 1 - i[0] = _mm_add_epi16(in[0], in[15]); - i[1] = _mm_add_epi16(in[1], in[14]); - i[2] = _mm_add_epi16(in[2], in[13]); - i[3] = _mm_add_epi16(in[3], in[12]); - i[4] = _mm_add_epi16(in[4], in[11]); - i[5] = _mm_add_epi16(in[5], in[10]); - i[6] = _mm_add_epi16(in[6], in[9]); - i[7] = _mm_add_epi16(in[7], in[8]); - - s[0] = _mm_sub_epi16(in[7], in[8]); - s[1] = _mm_sub_epi16(in[6], in[9]); - s[2] = _mm_sub_epi16(in[5], in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - 
p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] 
= _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = 
_mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], 
t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = 
_mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = 
_mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -static void fadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, 
cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - 
u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = 
_mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - 
u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - v[0] = _mm_add_epi32(u[0], u[8]); - v[1] = _mm_add_epi32(u[1], u[9]); - v[2] = _mm_add_epi32(u[2], u[10]); - v[3] = _mm_add_epi32(u[3], u[11]); - v[4] = _mm_add_epi32(u[4], u[12]); - v[5] = _mm_add_epi32(u[5], u[13]); - v[6] = _mm_add_epi32(u[6], u[14]); - v[7] = _mm_add_epi32(u[7], u[15]); - - v[16] = _mm_add_epi32(v[0], v[4]); - v[17] = _mm_add_epi32(v[1], v[5]); - v[18] = _mm_add_epi32(v[2], v[6]); - v[19] = _mm_add_epi32(v[3], v[7]); - v[20] = _mm_sub_epi32(v[0], v[4]); - v[21] = _mm_sub_epi32(v[1], v[5]); - v[22] = _mm_sub_epi32(v[2], v[6]); - v[23] = _mm_sub_epi32(v[3], v[7]); - v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING); - v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - v[20] = _mm_srai_epi32(v[20], 
DCT_CONST_BITS); - v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - s[0] = _mm_packs_epi32(v[16], v[17]); - s[1] = _mm_packs_epi32(v[18], v[19]); - s[2] = _mm_packs_epi32(v[20], v[21]); - s[3] = _mm_packs_epi32(v[22], v[23]); - - v[8] = _mm_sub_epi32(u[0], u[8]); - v[9] = _mm_sub_epi32(u[1], u[9]); - v[10] = _mm_sub_epi32(u[2], u[10]); - v[11] = _mm_sub_epi32(u[3], u[11]); - v[12] = _mm_sub_epi32(u[4], u[12]); - v[13] = _mm_sub_epi32(u[5], u[13]); - v[14] = _mm_sub_epi32(u[6], u[14]); - v[15] = _mm_sub_epi32(u[7], u[15]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[8], v[9]); - s[5] = _mm_packs_epi32(v[10], v[11]); - s[6] = _mm_packs_epi32(v[12], v[13]); - s[7] = _mm_packs_epi32(v[14], v[15]); - // - - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], 
s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], 
k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[8] = _mm_add_epi32(u[0], u[4]); - v[9] = _mm_add_epi32(u[1], u[5]); - v[10] = _mm_add_epi32(u[2], u[6]); - v[11] = _mm_add_epi32(u[3], u[7]); - v[12] = _mm_sub_epi32(u[0], u[4]); - v[13] = _mm_sub_epi32(u[1], u[5]); - v[14] = _mm_sub_epi32(u[2], u[6]); - v[15] = _mm_sub_epi32(u[3], u[7]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - s[8] = _mm_packs_epi32(v[8], v[9]); - s[9] = _mm_packs_epi32(v[10], v[11]); - s[10] = _mm_packs_epi32(v[12], v[13]); - s[11] = _mm_packs_epi32(v[14], v[15]); - - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = 
_mm_unpacklo_epi16(s[4], s[5]); - u[1] = _mm_unpackhi_epi16(s[4], s[5]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = 
_mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = 
_mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = 
_mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void fdct16_sse2(__m128i *in0, __m128i *in1) { - fdct16_8col(in0); - fdct16_8col(in1); - array_transpose_16x16(in0, in1); -} - -static void fadst16_sse2(__m128i *in0, __m128i *in1) { - fadst16_8col(in0); - fadst16_8col(in1); - array_transpose_16x16(in0, in1); -} - -#if CONFIG_EXT_TX -static void fidtx16_sse2(__m128i *in0, __m128i *in1) { - idtx16_8col(in0); - idtx16_8col(in1); - array_transpose_16x16(in0, in1); -} -#endif // CONFIG_EXT_TX - -void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_DCT: - 
load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case IDTX: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - 
fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} - -static INLINE void prepare_4x8_row_first(__m128i *in) { - in[0] = _mm_unpacklo_epi64(in[0], in[2]); - in[1] = _mm_unpacklo_epi64(in[1], in[3]); - transpose_4x4(in); - in[4] = _mm_unpacklo_epi64(in[4], in[6]); - in[5] = _mm_unpacklo_epi64(in[5], in[7]); - transpose_4x4(in + 4); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. 
-static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - in[4] = _mm_shufflelo_epi16(in[4], 0x1b); - in[5] = _mm_shufflelo_epi16(in[5], 0x1b); - in[6] = _mm_shufflelo_epi16(in[6], 0x1b); - in[7] = _mm_shufflelo_epi16(in[7], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - in[4] = _mm_slli_epi16(in[4], shift); - in[5] = _mm_slli_epi16(in[5], shift); - in[6] = _mm_slli_epi16(in[6], shift); - in[7] = _mm_slli_epi16(in[7], shift); - - scale_sqrt2_8x4(in); - scale_sqrt2_8x4(in + 4); - prepare_4x8_row_first(in); 
-} - -static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) { - __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67; - const int shift = 1; - - // revert the 8x8 txfm's transpose - array_transpose_8x8(res, res); - - in01 = _mm_unpacklo_epi64(res[0], res[1]); - in23 = _mm_unpacklo_epi64(res[2], res[3]); - in45 = _mm_unpacklo_epi64(res[4], res[5]); - in67 = _mm_unpacklo_epi64(res[6], res[7]); - - sign01 = _mm_srai_epi16(in01, 15); - sign23 = _mm_srai_epi16(in23, 15); - sign45 = _mm_srai_epi16(in45, 15); - sign67 = _mm_srai_epi16(in67, 15); - - in01 = _mm_sub_epi16(in01, sign01); - in23 = _mm_sub_epi16(in23, sign23); - in45 = _mm_sub_epi16(in45, sign45); - in67 = _mm_sub_epi16(in67, sign67); - - in01 = _mm_srai_epi16(in01, shift); - in23 = _mm_srai_epi16(in23, shift); - in45 = _mm_srai_epi16(in45, shift); - in67 = _mm_srai_epi16(in67, shift); - - store_output(&in01, (output + 0 * 8)); - store_output(&in23, (output + 1 * 8)); - store_output(&in45, (output + 2 * 8)); - store_output(&in67, (output + 3 * 8)); -} - -void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case DCT_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_4x8(input, in, stride, 1, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; 
- case DCT_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_4x8(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case H_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_4x8(output, in); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. -// The input is split horizontally into two 4x4 -// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4 -// block of 'in' and 'r' is stored in the bottom-left block. -// This is to allow us to reuse 4x4 transforms. 
-static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - - scale_sqrt2_8x4(in); - - in[4] = _mm_shuffle_epi32(in[0], 0xe); - in[5] = _mm_shuffle_epi32(in[1], 0xe); - in[6] = _mm_shuffle_epi32(in[2], 0xe); - in[7] = _mm_shuffle_epi32(in[3], 0xe); -} - -static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) { - __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3; - const int shift = 1; - sign0 = _mm_srai_epi16(res[0], 15); - sign1 = _mm_srai_epi16(res[1], 15); - sign2 = _mm_srai_epi16(res[2], 15); - sign3 = _mm_srai_epi16(res[3], 15); - - out0 = _mm_sub_epi16(res[0], sign0); - out1 = _mm_sub_epi16(res[1], sign1); - out2 = _mm_sub_epi16(res[2], sign2); - out3 = _mm_sub_epi16(res[3], sign3); - - out0 = _mm_srai_epi16(out0, shift); - out1 = _mm_srai_epi16(out1, shift); - out2 = _mm_srai_epi16(out2, shift); - out3 = _mm_srai_epi16(out3, shift); - - store_output(&out0, (output + 0 * 8)); - store_output(&out1, (output + 1 * 8)); - store_output(&out2, (output + 2 * 8)); - store_output(&out3, (output + 3 * 
8)); -} - -void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case V_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - 
fidtx8_sse2(in); - break; - case H_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case V_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x4(output, in); -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *t = input; - const int16_t *b = input + 8 * stride; - - if (flipud) { - const int16_t *const tmp = t; - t = b; - b = tmp; - } - - load_buffer_8x8(t, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(b, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -static INLINE void round_power_of_two_signed(__m128i *x, int n) { - const __m128i rounding = _mm_set1_epi16((1 << n) >> 1); - const __m128i sign = _mm_srai_epi16(*x, 15); - const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign); - *x = _mm_srai_epi16(res, n); -} - -static void row_8x16_rounding(__m128i *in, int bits) { - int i; - for (i = 0; i < 16; i++) { - round_power_of_two_signed(&in[i], bits); - } -} - -void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const t = in; // Alias to top 8x8 sub block - __m128i *const b = in + 8; // Alias to bottom 8x8 sub block - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - 
load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x16(input, in, stride, 1, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - 
array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case H_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_FLIPADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x8(output, t, 8); - write_buffer_8x8(output + 64, b, 8); -} - -static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *l = input; - const int16_t *r = input + 8; - - if (fliplr) { - const int16_t *const tmp = l; - l = r; - r = tmp; - } - - // load first 8 columns - load_buffer_8x8(l, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(r, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -#define col_16x8_rounding row_8x16_rounding - -void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; 
-#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const l = in; // Alias to left 8x8 sub block - __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store - // in the second half of the array - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - 
idtx16_8col(in); - break; - case H_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case V_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case V_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#endif - default: assert(0); break; - } - array_transpose_8x8(l, l); - array_transpose_8x8(r, r); - write_buffer_8x8(output, l, 16); - write_buffer_8x8(output + 8, r, 16); -} - -// Note: The 16-column 32-element transforms expect their input to be -// split up into a 2x2 grid of 8x16 blocks -static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - fdct32_8col(tl, bl); - fdct32_8col(tr, br); - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - int i; - for (i = 0; i < 16; ++i) { - tl[i] = _mm_slli_epi16(tl[i], 2); - tr[i] = _mm_slli_epi16(tr[i], 2); - bl[i] = _mm_slli_epi16(bl[i], 2); - br[i] = _mm_slli_epi16(br[i], 2); - } - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} -#endif - -static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl, - __m128i *intr, __m128i *inbl, - __m128i *inbr, int stride, int flipud, - int fliplr) { - int i; - if (flipud) { - input = input + 31 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - intl[i] = _mm_slli_epi16( - _mm_load_si128((const 
__m128i *)(input + i * stride + 0)), 2); - intr[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - inbl[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2); - inbr[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2); - } - - if (fliplr) { - __m128i tmp; - for (i = 0; i < 16; ++i) { - tmp = intl[i]; - intl[i] = mm_reverse_epi16(intr[i]); - intr[i] = mm_reverse_epi16(tmp); - tmp = inbl[i]; - inbl[i] = mm_reverse_epi16(inbr[i]); - inbr[i] = mm_reverse_epi16(tmp); - } - } - - scale_sqrt2_8x16(intl); - scale_sqrt2_8x16(intr); - scale_sqrt2_8x16(inbl); - scale_sqrt2_8x16(inbr); -} - -static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl, - __m128i *restr, __m128i *resbl, - __m128i *resbr) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&restl[i], output + i * 16 + 0); - store_output(&restr[i], output + i * 16 + 8); - store_output(&resbl[i], output + (i + 16) * 16 + 0); - store_output(&resbr[i], output + (i + 16) * 16 + 8); - } -} - -static INLINE void round_signed_8x8(__m128i *in, const int bit) { - const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1); - __m128i sign0 = _mm_srai_epi16(in[0], 15); - __m128i sign1 = _mm_srai_epi16(in[1], 15); - __m128i sign2 = _mm_srai_epi16(in[2], 15); - __m128i sign3 = _mm_srai_epi16(in[3], 15); - __m128i sign4 = _mm_srai_epi16(in[4], 15); - __m128i sign5 = _mm_srai_epi16(in[5], 15); - __m128i sign6 = _mm_srai_epi16(in[6], 15); - __m128i sign7 = _mm_srai_epi16(in[7], 15); - - in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0); - in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1); - in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2); - in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3); - in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4); - in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5); - in[6] = 
_mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6); - in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7); - - in[0] = _mm_srai_epi16(in[0], bit); - in[1] = _mm_srai_epi16(in[1], bit); - in[2] = _mm_srai_epi16(in[2], bit); - in[3] = _mm_srai_epi16(in[3], bit); - in[4] = _mm_srai_epi16(in[4], bit); - in[5] = _mm_srai_epi16(in[5], bit); - in[6] = _mm_srai_epi16(in[6], bit); - in[7] = _mm_srai_epi16(in[7], bit); -} - -static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) { - const int bit = 4; - round_signed_8x8(in0, bit); - round_signed_8x8(in0 + 8, bit); - round_signed_8x8(in1, bit); - round_signed_8x8(in1 + 8, bit); -} - -// Note: -// suffix "t" indicates the transpose operation comes first -static void fdct16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fdct16_8col(in0); - fdct16_8col(in1); -} - -static void fadst16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fadst16_8col(in0); - fadst16_8col(in1); -} - -static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); - fdct32_8col(tl, bl); - fdct32_8col(tr, br); -} - -typedef enum transpose_indicator_ { - transpose, - no_transpose, -} transpose_indicator; - -static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br, transpose_indicator t) { - __m128i tmpl[16], tmpr[16]; - int i; - - // Copy the bottom half of the input to temporary storage - for (i = 0; i < 16; ++i) { - tmpl[i] = bl[i]; - tmpr[i] = br[i]; - } - - // Generate the bottom half of the output - for (i = 0; i < 16; ++i) { - bl[i] = _mm_slli_epi16(tl[i], 2); - br[i] = _mm_slli_epi16(tr[i], 2); - } - array_transpose_16x16(bl, br); - - // Copy the temporary storage back to the top half of the input - for (i = 0; i < 16; ++i) { - tl[i] = tmpl[i]; - tr[i] = tmpr[i]; - } - - // Generate the top half of the output - scale_sqrt2_8x16(tl); - 
scale_sqrt2_8x16(tr); - if (t == transpose) - fdct16t_sse2(tl, tr); - else - fdct16_sse2(tl, tr); -} - -// Note on data layout, for both this and the 32x16 transforms: -// So that we can reuse the 16-element transforms easily, -// we want to split the input into 8x16 blocks. -// For 16x32, this means the input is a 2x2 grid of such blocks. -// For 32x16, it means the input is a 4x1 grid. -void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i intl[16], intr[16], inbl[16], inbr[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); 
- fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case FLIPADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case ADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case FLIPADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case IDTX: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case H_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, 
stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; -#endif - default: assert(0); break; - } - write_buffer_16x32(output, intl, intr, inbl, inbr); -} - -static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0, - __m128i *in1, __m128i *in2, __m128i *in3, - int stride, int flipud, int fliplr) { - int i; - if (flipud) { - input += 15 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 16; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } - - 
scale_sqrt2_8x16(in0); - scale_sqrt2_8x16(in1); - scale_sqrt2_8x16(in2); - scale_sqrt2_8x16(in3); -} - -static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0, - __m128i *res1, __m128i *res2, - __m128i *res3) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&res0[i], output + i * 32 + 0); - store_output(&res1[i], output + i * 32 + 8); - store_output(&res2[i], output + i * 32 + 16); - store_output(&res3[i], output + i * 32 + 24); - } -} - -void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16], in2[16], in3[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case ADST_DCT: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_ADST: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - 
round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case IDTX: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case V_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case V_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, 
in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case V_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#endif - default: assert(0); break; - } - write_buffer_32x16(output, in0, in1, in2, in3); -} - -// Note: -// 32x32 hybrid fwd txfm -// 4x2 grids of 8x16 block. Each block is represented by __m128i in[16] -static INLINE void load_buffer_32x32(const int16_t *input, - __m128i *in0 /*in0[32]*/, - __m128i *in1 /*in1[32]*/, - __m128i *in2 /*in2[32]*/, - __m128i *in3 /*in3[32]*/, int stride, - int flipud, int fliplr) { - if (flipud) { - input += 31 * stride; - stride = -stride; - } - - int i; - for (i = 0; i < 32; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 32; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } -} - -static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/, - __m128i *b0r /*b0r[16]*/, - __m128i *b1l /*b1l[16]*/, - __m128i *b1r /*b1r[16]*/) { - int i; - for (i = 0; i < 16; ++i) { - __m128i tmp0 = b1l[i]; - 
__m128i tmp1 = b1r[i]; - b1l[i] = b0l[i]; - b1r[i] = b0r[i]; - b0l[i] = tmp0; - b0r[i] = tmp1; - } -} - -static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fdct32_8col(in0, &in0[16]); - fdct32_8col(in1, &in1[16]); - fdct32_8col(in2, &in2[16]); - fdct32_8col(in3, &in3[16]); - - array_transpose_16x16(in0, in1); - array_transpose_16x16(&in0[16], &in1[16]); - array_transpose_16x16(in2, in3); - array_transpose_16x16(&in2[16], &in3[16]); - - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose); - fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose); - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fidtx32_16col(in0, in1, &in0[16], &in1[16]); - fidtx32_16col(in2, in3, &in2[16], &in3[16]); - swap_16x16(&in0[16], &in1[16], in2, in3); -} -#endif - -static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - round_signed_16x16(in0, in1); - round_signed_16x16(&in0[16], &in1[16]); - round_signed_16x16(in2, in3); - round_signed_16x16(&in2[16], &in3[16]); -} - -static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3, tran_low_t *output) { - int i; - for (i = 0; i < 32; ++i) { - store_output(&in0[i], output + i * 32 + 0); - store_output(&in1[i], output + i * 32 + 8); - store_output(&in2[i], output + i * 32 + 16); - store_output(&in3[i], output + i * 32 + 24); - } -} - -void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[32], in1[32], in2[32], in3[32]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation"); -#endif - - load_buffer_32x32(input, in0, in1, 
in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case ADST_DCT: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_ADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case IDTX: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case V_DCT: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_DCT: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - 
break; - case V_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_ADST: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case V_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#endif - default: assert(0); - } - write_buffer_32x32(in0, in1, in2, in3, output); -} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm index a99db3d6e..b18554818 100644 --- a/third_party/aom/av1/encoder/x86/dct_sse2.asm +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -63,7 +63,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_HIGHBITDEPTH ; sign extension mova m2, m0 mova m3, m1 @@ -79,9 +78,5 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride mova [outputq + 16], m2 mova [outputq + 32], m1 mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif RET diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c new file mode 100644 index 000000000..dedb4d02f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" + +static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = _mm_loadu_si128((__m128i *)(src + 1)); + level[1] = _mm_loadu_si128((__m128i *)(src + stride)); + level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0])); + level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1])); + level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); +} + +static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { + const __m128i const_3 = _mm_set1_epi8(3); + const __m128i const_4 = _mm_set1_epi8(4); + __m128i count; + + count = _mm_min_epu8(level[0], const_3); + level[1] = 
_mm_min_epu8(level[1], const_3); + level[2] = _mm_min_epu8(level[2], const_3); + level[3] = _mm_min_epu8(level[3], const_3); + level[4] = _mm_min_epu8(level[4], const_3); + count = _mm_add_epi8(count, level[1]); + count = _mm_add_epi8(count, level[2]); + count = _mm_add_epi8(count, level[3]); + count = _mm_add_epi8(count, level[4]); + count = _mm_avg_epu8(count, _mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (height == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + row -= 4; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, 
SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(height % 2)); + + if (height == 8) { + pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 
6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (height < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + row -= 2; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + 
coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(width % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 
21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width > real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width < real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11); + } + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + w -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t 
*coeff_contexts) { + const int stride = width + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + levels += TX_PAD_HOR; + } while (--row); +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = 
_mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--row); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = width + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. 
*/ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (width == 4) { + get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (width == 4) { + get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (width == 4) { + get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bwl = get_txb_bwl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (height << bwl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (height << bwl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 000000000..b3a879b0f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 +#include /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" + +void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + memset(levels - TX_PAD_TOP * stride, 0, + sizeof(*levels) * TX_PAD_TOP * stride); + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + const __m128i zeros = _mm_setzero_si128(); + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (width == 4) { + do { + const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); + const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width)); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); + const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); + _mm_storeu_si128((__m128i *)ls, lsAB); + ls += (stride << 1); + cf += (width << 1); + i += 2; + } while (i < height); + } else if (width == 8) { + do { + const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); + const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); + _mm_storeu_si128((__m128i *)ls, absAB8); + 
ls += stride; + cf += width; + i += 1; + } while (i < height); + } else { + do { + int j = 0; + do { + const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); + const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); + const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8)); + const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12)); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absCD = _mm_abs_epi16(coeffCD); + const __m128i absABCD = _mm_packs_epi16(absAB, absCD); + _mm_storeu_si128((__m128i *)(ls + j), absABCD); + j += 16; + cf += 16; + } while (j < width); + *(int32_t *)(ls + width) = 0; + ls += stride; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c index 6599630d0..7d4f69585 100644 --- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c +++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c @@ -11,7 +11,8 @@ #include // AVX2 -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm index 4680f1fab..72e9e22b1 100644 --- a/third_party/aom/av1/encoder/x86/error_sse2.asm +++ b/third_party/aom/av1/encoder/x86/error_sse2.asm @@ -77,49 +77,3 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz movd edx, m5 %endif RET - -; Compute the sum of squared difference between two int16_t vectors. 
-; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff, -; intptr_t block_size) - -INIT_XMM sse2 -cglobal block_error_fp, 3, 3, 6, uqc, dqc, size - pxor m4, m4 ; sse accumulator - pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq -.loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] - psubw m0, m2 - psubw m1, m3 - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - pmaddwd m0, m0 - pmaddwd m1, m1 - ; accumulate in 64bit - punpckldq m3, m0, m5 - punpckhdq m0, m5 - paddq m4, m3 - punpckldq m3, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 - add sizeq, mmsize - jl .loop - - ; accumulate horizontally and store in return value - movhlps m5, m4 - paddq m4, m5 -%if ARCH_X86_64 - movq rax, m4 -%else - pshufd m5, m4, 0x1 - movd eax, m4 - movd edx, m5 -%endif - RET diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c new file mode 100644 index 000000000..65fa46311 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +// Byte-boundary alignment issues +#define ALIGN_SIZE 8 +#define ALIGN_MASK (ALIGN_SIZE - 1) + +#define CALC_CRC(op, crc, type, buf, len) \ + while ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +/** + * Calculates 32-bit CRC for the input buffer + * polynomial is 0x11EDC6F41 + * @return A 32-bit unsigned integer representing the CRC + */ +uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p, + size_t len) { + (void)crc_calculator; + const uint8_t *buf = p; + uint32_t crc = 0xFFFFFFFF; + + // Align the input to the word boundary + for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + +#ifdef __x86_64__ + uint64_t crc64 = crc; + CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len); + crc = (uint32_t)crc64; +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len); + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len); + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len); + return (crc ^= 0xFFFFFFFF); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index b684f7a3a..4cd6371a6 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -11,11 +11,12 @@ #include #include /* SSE4.1 */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -121,72 +122,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { } static void fadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr(bit); - 
const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i s0, s1, s2, s3; + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; - // stage 0 - // stage 1 - // stage 2 - u0 = _mm_mullo_epi32(in[3], cospi8); - u1 = _mm_mullo_epi32(in[0], cospi56); - u2 = _mm_add_epi32(u0, u1); - s0 = _mm_add_epi32(u2, rnding); - s0 = _mm_srai_epi32(s0, bit); - - v0 = _mm_mullo_epi32(in[3], cospi56); - v1 = _mm_mullo_epi32(in[0], cospi8); - v2 = _mm_sub_epi32(v0, v1); - s1 = _mm_add_epi32(v2, rnding); - s1 = _mm_srai_epi32(s1, bit); - - u0 = _mm_mullo_epi32(in[1], cospi40); - u1 = _mm_mullo_epi32(in[2], cospi24); - u2 = _mm_add_epi32(u0, u1); - s2 = _mm_add_epi32(u2, rnding); - s2 = _mm_srai_epi32(s2, bit); - - v0 = _mm_mullo_epi32(in[1], cospi24); - v1 = _mm_mullo_epi32(in[2], cospi40); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - s3 = _mm_srai_epi32(s3, bit); - - // stage 3 - u0 = _mm_add_epi32(s0, s2); - u2 = _mm_sub_epi32(s0, s2); - u1 = _mm_add_epi32(s1, s3); - u3 = _mm_sub_epi32(s1, s3); - - // stage 4 - v0 = _mm_mullo_epi32(u2, cospi32); - v1 = _mm_mullo_epi32(u3, cospi32); - v2 = _mm_add_epi32(v0, v1); - s2 = _mm_add_epi32(v2, rnding); - u2 = _mm_srai_epi32(s2, bit); + s0 = _mm_mullo_epi32(in[0], sinpi1); + s1 = _mm_mullo_epi32(in[0], sinpi4); + s2 = _mm_mullo_epi32(in[1], sinpi2); + s3 = _mm_mullo_epi32(in[1], 
sinpi1); + s4 = _mm_mullo_epi32(in[2], sinpi3); + s5 = _mm_mullo_epi32(in[3], sinpi4); + s6 = _mm_mullo_epi32(in[3], sinpi2); + t = _mm_add_epi32(in[0], in[1]); + s7 = _mm_sub_epi32(t, in[3]); + + t = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(t, s5); + x1 = _mm_mullo_epi32(s7, sinpi3); + t = _mm_sub_epi32(s1, s3); + x2 = _mm_add_epi32(t, s6); + x3 = s4; + + s0 = _mm_add_epi32(x0, x3); + s1 = x1; + s2 = _mm_sub_epi32(x2, x3); + t = _mm_sub_epi32(x2, x0); + s3 = _mm_add_epi32(t, x3); + + u0 = _mm_add_epi32(s0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(s1, rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(s2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(s3, rnding); + u3 = _mm_srai_epi32(u3, bit); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - u3 = _mm_srai_epi32(s3, bit); - - // u0, u1, u2, u3 - u2 = _mm_sub_epi32(kZero, u2); - u1 = _mm_sub_epi32(kZero, u1); - - // u0, u2, u3, u1 - // Transpose 4x4 32-bit - v0 = _mm_unpacklo_epi32(u0, u2); - v1 = _mm_unpackhi_epi32(u0, u2); - v2 = _mm_unpacklo_epi32(u3, u1); - v3 = _mm_unpackhi_epi32(u3, u1); + v0 = _mm_unpacklo_epi32(u0, u1); + v1 = _mm_unpackhi_epi32(u0, u1); + v2 = _mm_unpacklo_epi32(u2, u3); + v3 = _mm_unpackhi_epi32(u2, u3); in[0] = _mm_unpacklo_epi64(v0, v2); in[1] = _mm_unpackhi_epi64(v0, v2); @@ -197,84 +183,65 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) { void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - 
fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + 
fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); 
write_buffer_4x4(in, coeff); break; -#endif default: assert(0); } (void)bd; @@ -624,415 +591,274 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i u[8], v[8], x; - - // Even 8 points: 0, 2, ..., 14 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[14], cospi4); - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[14], cospi60); - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = 
_mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[10], cospi20); - x = _mm_mullo_epi32(in[4], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[10], cospi44); - x = _mm_mullo_epi32(in[4], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[6], cospi36); - x = _mm_mullo_epi32(in[8], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[6], cospi28); - x = _mm_mullo_epi32(in[8], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[2], cospi52); - x = _mm_mullo_epi32(in[12], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[2], cospi12); - x = _mm_mullo_epi32(in[12], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x 
= _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + u0 = in[2 * 0 + col]; + u1 = _mm_sub_epi32(zero, in[2 * 7 + col]); + u2 = _mm_sub_epi32(zero, in[2 * 3 + col]); + u3 = in[2 * 4 + col]; + u4 = _mm_sub_epi32(zero, in[2 * 1 + col]); + u5 = in[2 * 6 + col]; + u6 = in[2 * 2 + col]; + u7 = _mm_sub_epi32(zero, in[2 * 5 + col]); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[0] = u[0]; - out[2] = _mm_sub_epi32(kZero, u[4]); - out[4] = u[6]; - out[6] = _mm_sub_epi32(kZero, u[2]); - out[8] = u[3]; - out[10] = _mm_sub_epi32(kZero, u[7]); - out[12] = u[5]; - out[14] = _mm_sub_epi32(kZero, u[1]); + // stage 2 + v0 = u0; + v1 = u1; - // Odd 8 points: 1, 3, ..., 15 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[15], cospi4); - x = _mm_mullo_epi32(in[1], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); - u[1] = _mm_mullo_epi32(in[15], cospi60); - x = _mm_mullo_epi32(in[1], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); - // (2) - u[2] = _mm_mullo_epi32(in[11], cospi20); - x = _mm_mullo_epi32(in[5], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + v4 = u4; + v5 = u5; - u[3] = _mm_mullo_epi32(in[11], cospi44); - x = _mm_mullo_epi32(in[5], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - 
u[3] = _mm_srai_epi32(u[3], bit); + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); - // (3) - u[4] = _mm_mullo_epi32(in[7], cospi36); - x = _mm_mullo_epi32(in[9], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[7], cospi28); - x = _mm_mullo_epi32(in[9], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[3], cospi52); - x = _mm_mullo_epi32(in[13], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[3], cospi12); - x = _mm_mullo_epi32(in[13], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], 
x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, 
v7); - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[1] = u[0]; - out[3] = _mm_sub_epi32(kZero, u[4]); - out[5] = u[6]; - out[7] = _mm_sub_epi32(kZero, u[2]); - out[9] = u[3]; - out[11] = _mm_sub_epi32(kZero, u[7]); - out[13] = u[5]; - out[15] = _mm_sub_epi32(kZero, 
u[1]); + // stage 7 + out[2 * 0 + col] = v1; + out[2 * 1 + col] = v6; + out[2 * 2 + col] = v3; + out[2 * 3 + col] = v4; + out[2 * 4 + col] = v5; + out[2 * 5 + col] = v2; + out[2 * 6 + col] = v7; + out[2 * 7 + col] = v0; + } } void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, 
col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + 
fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - 
fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; @@ -1402,230 +1228,174 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = 
_mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; - const int col_num = 4; int col; - // Calculate the column 0, 1, 2, 3 - for (col = 0; col < col_num; ++col) { + for (col = 0; col < 4; ++col) { // stage 0 // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + u[0] = in[0 * 4 + col]; + u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); + u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); + u[3] = in[8 * 4 + col]; + u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); + u[5] = in[12 * 4 + col]; + u[6] = in[4 * 4 + col]; + u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); + u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); + u[9] = in[14 * 4 + col]; + u[10] = in[6 * 4 + col]; + u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); + u[12] = in[2 * 4 + col]; + u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); + u[14] = 
_mm_sub_epi32(zero, in[5 * 4 + col]); + u[15] = in[10 * 4 + col]; - v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + // stage 2 + v[0] = u[0]; + v[1] = u[1]; - v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); - v[2] = _mm_add_epi32(v[2], x); + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); - v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); - v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_sub_epi32(x, y); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); - v[6] = _mm_add_epi32(v[6], x); + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); - v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); - v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_sub_epi32(x, y); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = _mm_mullo_epi32(in[7 * col_num + col], 
cospi34); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + v[8] = u[8]; + v[9] = u[9]; - v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); - v[10] = _mm_add_epi32(v[10], x); + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); - v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); - v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_sub_epi32(x, y); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + v[12] = u[12]; + v[13] = u[13]; - v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); - v[14] = _mm_add_epi32(v[14], x); + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); - v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); - v[15] = _mm_sub_epi32(v[15], x); + v[15] = 
_mm_sub_epi32(x, y); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 - u[0] = _mm_add_epi32(v[0], v[8]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[15] = _mm_sub_epi32(v[7], v[15]); + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = 
_mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); - u[4] = _mm_sub_epi32(v[0], v[4]); u[1] = _mm_add_epi32(v[1], v[5]); - u[5] = _mm_sub_epi32(v[1], v[5]); u[2] = _mm_add_epi32(v[2], v[6]); - u[6] = _mm_sub_epi32(v[2], v[6]); u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = 
_mm_sub_epi32(v[2], v[6]); u[7] = _mm_sub_epi32(v[3], v[7]); u[8] = _mm_add_epi32(v[8], v[12]); - u[12] = _mm_sub_epi32(v[8], v[12]); u[9] = _mm_add_epi32(v[9], v[13]); - u[13] = _mm_sub_epi32(v[9], v[13]); u[10] = _mm_add_epi32(v[10], v[14]); - u[14] = _mm_sub_epi32(v[10], v[14]); u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); // stage 6 @@ -1633,148 +1403,72 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = 
_mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - u[0] = _mm_add_epi32(v[0], v[2]); - u[2] = _mm_sub_epi32(v[0], v[2]); - u[1] = _mm_add_epi32(v[1], v[3]); - u[3] = _mm_sub_epi32(v[1], v[3]); - u[4] = _mm_add_epi32(v[4], v[6]); - u[6] = _mm_sub_epi32(v[4], v[6]); - u[5] = _mm_add_epi32(v[5], v[7]); - u[7] = _mm_sub_epi32(v[5], v[7]); - u[8] = _mm_add_epi32(v[8], v[10]); - u[10] = _mm_sub_epi32(v[8], v[10]); - u[9] = _mm_add_epi32(v[9], v[11]); - u[11] = _mm_sub_epi32(v[9], v[11]); - u[12] = _mm_add_epi32(v[12], v[14]); - u[14] = _mm_sub_epi32(v[12], v[14]); - u[13] = _mm_add_epi32(v[13], v[15]); - u[15] = _mm_sub_epi32(v[13], v[15]); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = 
_mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], 
&rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 - out[0 * col_num + col] = v[0]; - out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]); - out[2 * col_num + col] = v[12]; - out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]); - out[4 * col_num + col] = v[6]; - out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]); - out[6 * col_num + col] = v[10]; - out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]); - out[8 * col_num + col] = v[3]; - out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]); - out[10 * col_num + col] = v[15]; - out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]); - out[12 * col_num + col] = v[5]; - out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]); - out[14 * col_num + col] = v[9]; - out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]); + out[0 * 4 + col] = v[1]; + out[1 * 4 + col] = v[14]; + out[2 * 4 + col] = v[3]; + out[3 * 4 + col] = v[12]; + out[4 * 4 + col] = v[5]; + out[5 * 4 + col] = v[10]; + out[6 * 4 + col] = v[7]; + out[7 * 4 + col] = v[8]; + out[8 * 4 + col] = v[9]; + out[9 * 4 + col] = v[6]; + out[10 * 4 + col] = v[11]; + out[11 * 4 + col] = v[4]; + out[12 * 4 + col] = v[13]; + out[13 * 4 + col] = v[2]; + out[14 * 4 + col] = v[15]; + out[15 * 4 + col] = v[0]; } } @@ -1802,111 +1496,91 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) { void 
av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; - + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, 
fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, 
-shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, 
row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c deleted file mode 100644 index 88621c82b..000000000 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ /dev/null @@ -1,1627 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include // avx2 - -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" - -#include "aom_dsp/x86/fwd_txfm_avx2.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_buffer_16x16(const int16_t *input, int stride, - int flipud, int fliplr, __m256i *in) { - if (!flipud) { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - } else { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[7] = _mm256_loadu_si256((const __m256i 
*)(input + 8 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - } - - if (fliplr) { - mm256_reverse_epi16(&in[0]); - mm256_reverse_epi16(&in[1]); - mm256_reverse_epi16(&in[2]); - mm256_reverse_epi16(&in[3]); - mm256_reverse_epi16(&in[4]); - mm256_reverse_epi16(&in[5]); - mm256_reverse_epi16(&in[6]); - mm256_reverse_epi16(&in[7]); - mm256_reverse_epi16(&in[8]); - mm256_reverse_epi16(&in[9]); - mm256_reverse_epi16(&in[10]); - mm256_reverse_epi16(&in[11]); - mm256_reverse_epi16(&in[12]); - mm256_reverse_epi16(&in[13]); - mm256_reverse_epi16(&in[14]); - mm256_reverse_epi16(&in[15]); - } - - in[0] = _mm256_slli_epi16(in[0], 2); - in[1] = _mm256_slli_epi16(in[1], 2); - in[2] = _mm256_slli_epi16(in[2], 2); - in[3] = _mm256_slli_epi16(in[3], 2); - in[4] = _mm256_slli_epi16(in[4], 2); - in[5] = _mm256_slli_epi16(in[5], 2); - in[6] = _mm256_slli_epi16(in[6], 2); - in[7] = _mm256_slli_epi16(in[7], 2); - in[8] = _mm256_slli_epi16(in[8], 2); - in[9] = _mm256_slli_epi16(in[9], 2); - in[10] = _mm256_slli_epi16(in[10], 2); - in[11] = _mm256_slli_epi16(in[11], 2); - in[12] = _mm256_slli_epi16(in[12], 2); - in[13] = _mm256_slli_epi16(in[13], 2); - in[14] = _mm256_slli_epi16(in[14], 2); - in[15] = _mm256_slli_epi16(in[15], 2); -} - -static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) { - int i; - for (i = 0; i < 16; ++i) { - storeu_output_avx2(&in[i], output + (i << 4)); - } -} - -static void right_shift_16x16(__m256i *in) { - const __m256i one = 
_mm256_set1_epi16(1); - __m256i s0 = _mm256_srai_epi16(in[0], 15); - __m256i s1 = _mm256_srai_epi16(in[1], 15); - __m256i s2 = _mm256_srai_epi16(in[2], 15); - __m256i s3 = _mm256_srai_epi16(in[3], 15); - __m256i s4 = _mm256_srai_epi16(in[4], 15); - __m256i s5 = _mm256_srai_epi16(in[5], 15); - __m256i s6 = _mm256_srai_epi16(in[6], 15); - __m256i s7 = _mm256_srai_epi16(in[7], 15); - __m256i s8 = _mm256_srai_epi16(in[8], 15); - __m256i s9 = _mm256_srai_epi16(in[9], 15); - __m256i s10 = _mm256_srai_epi16(in[10], 15); - __m256i s11 = _mm256_srai_epi16(in[11], 15); - __m256i s12 = _mm256_srai_epi16(in[12], 15); - __m256i s13 = _mm256_srai_epi16(in[13], 15); - __m256i s14 = _mm256_srai_epi16(in[14], 15); - __m256i s15 = _mm256_srai_epi16(in[15], 15); - - in[0] = _mm256_add_epi16(in[0], one); - in[1] = _mm256_add_epi16(in[1], one); - in[2] = _mm256_add_epi16(in[2], one); - in[3] = _mm256_add_epi16(in[3], one); - in[4] = _mm256_add_epi16(in[4], one); - in[5] = _mm256_add_epi16(in[5], one); - in[6] = _mm256_add_epi16(in[6], one); - in[7] = _mm256_add_epi16(in[7], one); - in[8] = _mm256_add_epi16(in[8], one); - in[9] = _mm256_add_epi16(in[9], one); - in[10] = _mm256_add_epi16(in[10], one); - in[11] = _mm256_add_epi16(in[11], one); - in[12] = _mm256_add_epi16(in[12], one); - in[13] = _mm256_add_epi16(in[13], one); - in[14] = _mm256_add_epi16(in[14], one); - in[15] = _mm256_add_epi16(in[15], one); - - in[0] = _mm256_sub_epi16(in[0], s0); - in[1] = _mm256_sub_epi16(in[1], s1); - in[2] = _mm256_sub_epi16(in[2], s2); - in[3] = _mm256_sub_epi16(in[3], s3); - in[4] = _mm256_sub_epi16(in[4], s4); - in[5] = _mm256_sub_epi16(in[5], s5); - in[6] = _mm256_sub_epi16(in[6], s6); - in[7] = _mm256_sub_epi16(in[7], s7); - in[8] = _mm256_sub_epi16(in[8], s8); - in[9] = _mm256_sub_epi16(in[9], s9); - in[10] = _mm256_sub_epi16(in[10], s10); - in[11] = _mm256_sub_epi16(in[11], s11); - in[12] = _mm256_sub_epi16(in[12], s12); - in[13] = _mm256_sub_epi16(in[13], s13); - in[14] = 
_mm256_sub_epi16(in[14], s14); - in[15] = _mm256_sub_epi16(in[15], s15); - - in[0] = _mm256_srai_epi16(in[0], 2); - in[1] = _mm256_srai_epi16(in[1], 2); - in[2] = _mm256_srai_epi16(in[2], 2); - in[3] = _mm256_srai_epi16(in[3], 2); - in[4] = _mm256_srai_epi16(in[4], 2); - in[5] = _mm256_srai_epi16(in[5], 2); - in[6] = _mm256_srai_epi16(in[6], 2); - in[7] = _mm256_srai_epi16(in[7], 2); - in[8] = _mm256_srai_epi16(in[8], 2); - in[9] = _mm256_srai_epi16(in[9], 2); - in[10] = _mm256_srai_epi16(in[10], 2); - in[11] = _mm256_srai_epi16(in[11], 2); - in[12] = _mm256_srai_epi16(in[12], 2); - in[13] = _mm256_srai_epi16(in[13], 2); - in[14] = _mm256_srai_epi16(in[14], 2); - in[15] = _mm256_srai_epi16(in[15], 2); -} - -static void fdct16_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - - const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64); - const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - - const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64); - const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - - const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64); - const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - - const __m256i 
cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64); - const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i s0, s1, s2, s3, s4, s5, s6, s7; - __m256i t0, t1, t2, t3, t4, t5, t6, t7; - __m256i v0, v1, v2, v3; - __m256i x0, x1; - - // 0, 4, 8, 12 - u0 = _mm256_add_epi16(in[0], in[15]); - u1 = _mm256_add_epi16(in[1], in[14]); - u2 = _mm256_add_epi16(in[2], in[13]); - u3 = _mm256_add_epi16(in[3], in[12]); - u4 = _mm256_add_epi16(in[4], in[11]); - u5 = _mm256_add_epi16(in[5], in[10]); - u6 = _mm256_add_epi16(in[6], in[9]); - u7 = _mm256_add_epi16(in[7], in[8]); - - s0 = _mm256_add_epi16(u0, u7); - s1 = _mm256_add_epi16(u1, u6); - s2 = _mm256_add_epi16(u2, u5); - s3 = _mm256_add_epi16(u3, u4); - - // 0, 8 - v0 = _mm256_add_epi16(s0, s3); - v1 = _mm256_add_epi16(s1, s2); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t0 = butter_fly(&x0, &x1, &cospi_p16_p16); - t1 = butter_fly(&x0, &x1, &cospi_p16_m16); - - // 4, 12 - v0 = _mm256_sub_epi16(s1, s2); - v1 = _mm256_sub_epi16(s0, s3); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t2 = butter_fly(&x0, &x1, &cospi_p24_p08); - t3 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // 2, 6, 10, 14 - s0 = _mm256_sub_epi16(u3, u4); - s1 = _mm256_sub_epi16(u2, u5); - s2 = _mm256_sub_epi16(u1, u6); - s3 = _mm256_sub_epi16(u0, u7); - - v0 = s0; // output[4] - v3 = s3; // output[7] - - x0 = _mm256_unpacklo_epi16(s2, s1); - x1 = _mm256_unpackhi_epi16(s2, s1); - - v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5] - v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6] - - s0 = _mm256_add_epi16(v0, v1); // step[4] - s1 = _mm256_sub_epi16(v0, v1); // step[5] - s2 = _mm256_sub_epi16(v3, v2); // step[6] - s3 = _mm256_add_epi16(v3, v2); // step[7] - - // 2, 14 - x0 = _mm256_unpacklo_epi16(s0, s3); - x1 = _mm256_unpackhi_epi16(s0, s3); - - t4 = butter_fly(&x0, &x1, &cospi_p28_p04); - t5 = 
butter_fly(&x0, &x1, &cospi_m04_p28); - - // 10, 6 - x0 = _mm256_unpacklo_epi16(s1, s2); - x1 = _mm256_unpackhi_epi16(s1, s2); - t6 = butter_fly(&x0, &x1, &cospi_p12_p20); - t7 = butter_fly(&x0, &x1, &cospi_m20_p12); - - // 1, 3, 5, 7, 9, 11, 13, 15 - s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] - s1 = _mm256_sub_epi16(in[6], in[9]); // step[9] - u2 = _mm256_sub_epi16(in[5], in[10]); - u3 = _mm256_sub_epi16(in[4], in[11]); - u4 = _mm256_sub_epi16(in[3], in[12]); - u5 = _mm256_sub_epi16(in[2], in[13]); - s6 = _mm256_sub_epi16(in[1], in[14]); // step[14] - s7 = _mm256_sub_epi16(in[0], in[15]); // step[15] - - in[0] = t0; - in[8] = t1; - in[4] = t2; - in[12] = t3; - in[2] = t4; - in[14] = t5; - in[10] = t6; - in[6] = t7; - - x0 = _mm256_unpacklo_epi16(u5, u2); - x1 = _mm256_unpackhi_epi16(u5, u2); - - s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13] - s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10] - - x0 = _mm256_unpacklo_epi16(u4, u3); - x1 = _mm256_unpackhi_epi16(u4, u3); - - s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12] - s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11] - - u0 = _mm256_add_epi16(s0, s4); // output[8] - u1 = _mm256_add_epi16(s1, s5); - u2 = _mm256_sub_epi16(s1, s5); - u3 = _mm256_sub_epi16(s0, s4); - u4 = _mm256_sub_epi16(s7, s3); - u5 = _mm256_sub_epi16(s6, s2); - u6 = _mm256_add_epi16(s6, s2); - u7 = _mm256_add_epi16(s7, s3); - - // stage 4 - s0 = u0; - s3 = u3; - s4 = u4; - s7 = u7; - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - - s1 = butter_fly(&x0, &x1, &cospi_m08_p24); - s6 = butter_fly(&x0, &x1, &cospi_p24_p08); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - - s2 = butter_fly(&x0, &x1, &cospi_m24_m08); - s5 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // stage 5 - u0 = _mm256_add_epi16(s0, s1); - u1 = _mm256_sub_epi16(s0, s1); - u2 = _mm256_sub_epi16(s3, s2); - u3 = _mm256_add_epi16(s3, s2); - u4 = _mm256_add_epi16(s4, s5); - u5 = 
_mm256_sub_epi16(s4, s5); - u6 = _mm256_sub_epi16(s7, s6); - u7 = _mm256_add_epi16(s7, s6); - - // stage 6 - x0 = _mm256_unpacklo_epi16(u0, u7); - x1 = _mm256_unpackhi_epi16(u0, u7); - in[1] = butter_fly(&x0, &x1, &cospi_p30_p02); - in[15] = butter_fly(&x0, &x1, &cospi_m02_p30); - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - in[9] = butter_fly(&x0, &x1, &cospi_p14_p18); - in[7] = butter_fly(&x0, &x1, &cospi_m18_p14); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - in[5] = butter_fly(&x0, &x1, &cospi_p22_p10); - in[11] = butter_fly(&x0, &x1, &cospi_m10_p22); - - x0 = _mm256_unpacklo_epi16(u3, u4); - x1 = _mm256_unpackhi_epi16(u3, u4); - in[13] = butter_fly(&x0, &x1, &cospi_p06_p26); - in[3] = butter_fly(&x0, &x1, &cospi_m26_p06); -} - -void fadst16_avx2(__m256i *in) { - const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); - 
const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); - const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); - const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); - const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); - const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); - const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i zero = _mm256_setzero_si256(); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m256i y0, y1; - - // stage 1, s takes low 256 bits; x takes high 256 bits - y0 = _mm256_unpacklo_epi16(in[15], in[0]); - y1 = _mm256_unpackhi_epi16(in[15], in[0]); - s0 = _mm256_madd_epi16(y0, cospi_p01_p31); - x0 = _mm256_madd_epi16(y1, cospi_p01_p31); - s1 = _mm256_madd_epi16(y0, cospi_p31_m01); - x1 = _mm256_madd_epi16(y1, cospi_p31_m01); - - y0 = _mm256_unpacklo_epi16(in[13], in[2]); - y1 = _mm256_unpackhi_epi16(in[13], in[2]); - s2 = _mm256_madd_epi16(y0, cospi_p05_p27); - x2 = 
_mm256_madd_epi16(y1, cospi_p05_p27); - s3 = _mm256_madd_epi16(y0, cospi_p27_m05); - x3 = _mm256_madd_epi16(y1, cospi_p27_m05); - - y0 = _mm256_unpacklo_epi16(in[11], in[4]); - y1 = _mm256_unpackhi_epi16(in[11], in[4]); - s4 = _mm256_madd_epi16(y0, cospi_p09_p23); - x4 = _mm256_madd_epi16(y1, cospi_p09_p23); - s5 = _mm256_madd_epi16(y0, cospi_p23_m09); - x5 = _mm256_madd_epi16(y1, cospi_p23_m09); - - y0 = _mm256_unpacklo_epi16(in[9], in[6]); - y1 = _mm256_unpackhi_epi16(in[9], in[6]); - s6 = _mm256_madd_epi16(y0, cospi_p13_p19); - x6 = _mm256_madd_epi16(y1, cospi_p13_p19); - s7 = _mm256_madd_epi16(y0, cospi_p19_m13); - x7 = _mm256_madd_epi16(y1, cospi_p19_m13); - - y0 = _mm256_unpacklo_epi16(in[7], in[8]); - y1 = _mm256_unpackhi_epi16(in[7], in[8]); - s8 = _mm256_madd_epi16(y0, cospi_p17_p15); - x8 = _mm256_madd_epi16(y1, cospi_p17_p15); - s9 = _mm256_madd_epi16(y0, cospi_p15_m17); - x9 = _mm256_madd_epi16(y1, cospi_p15_m17); - - y0 = _mm256_unpacklo_epi16(in[5], in[10]); - y1 = _mm256_unpackhi_epi16(in[5], in[10]); - s10 = _mm256_madd_epi16(y0, cospi_p21_p11); - x10 = _mm256_madd_epi16(y1, cospi_p21_p11); - s11 = _mm256_madd_epi16(y0, cospi_p11_m21); - x11 = _mm256_madd_epi16(y1, cospi_p11_m21); - - y0 = _mm256_unpacklo_epi16(in[3], in[12]); - y1 = _mm256_unpackhi_epi16(in[3], in[12]); - s12 = _mm256_madd_epi16(y0, cospi_p25_p07); - x12 = _mm256_madd_epi16(y1, cospi_p25_p07); - s13 = _mm256_madd_epi16(y0, cospi_p07_m25); - x13 = _mm256_madd_epi16(y1, cospi_p07_m25); - - y0 = _mm256_unpacklo_epi16(in[1], in[14]); - y1 = _mm256_unpackhi_epi16(in[1], in[14]); - s14 = _mm256_madd_epi16(y0, cospi_p29_p03); - x14 = _mm256_madd_epi16(y1, cospi_p29_p03); - s15 = _mm256_madd_epi16(y0, cospi_p03_m29); - x15 = _mm256_madd_epi16(y1, cospi_p03_m29); - - // u takes low 256 bits; v takes high 256 bits - u0 = _mm256_add_epi32(s0, s8); - u1 = _mm256_add_epi32(s1, s9); - u2 = _mm256_add_epi32(s2, s10); - u3 = _mm256_add_epi32(s3, s11); - u4 = _mm256_add_epi32(s4, s12); - u5 = 
_mm256_add_epi32(s5, s13); - u6 = _mm256_add_epi32(s6, s14); - u7 = _mm256_add_epi32(s7, s15); - - u8 = _mm256_sub_epi32(s0, s8); - u9 = _mm256_sub_epi32(s1, s9); - u10 = _mm256_sub_epi32(s2, s10); - u11 = _mm256_sub_epi32(s3, s11); - u12 = _mm256_sub_epi32(s4, s12); - u13 = _mm256_sub_epi32(s5, s13); - u14 = _mm256_sub_epi32(s6, s14); - u15 = _mm256_sub_epi32(s7, s15); - - v0 = _mm256_add_epi32(x0, x8); - v1 = _mm256_add_epi32(x1, x9); - v2 = _mm256_add_epi32(x2, x10); - v3 = _mm256_add_epi32(x3, x11); - v4 = _mm256_add_epi32(x4, x12); - v5 = _mm256_add_epi32(x5, x13); - v6 = _mm256_add_epi32(x6, x14); - v7 = _mm256_add_epi32(x7, x15); - - v8 = _mm256_sub_epi32(x0, x8); - v9 = _mm256_sub_epi32(x1, x9); - v10 = _mm256_sub_epi32(x2, x10); - v11 = _mm256_sub_epi32(x3, x11); - v12 = _mm256_sub_epi32(x4, x12); - v13 = _mm256_sub_epi32(x5, x13); - v14 = _mm256_sub_epi32(x6, x14); - v15 = _mm256_sub_epi32(x7, x15); - - // low 256 bits rounding - u8 = _mm256_add_epi32(u8, dct_rounding); - u9 = _mm256_add_epi32(u9, dct_rounding); - u10 = _mm256_add_epi32(u10, dct_rounding); - u11 = _mm256_add_epi32(u11, dct_rounding); - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS); - u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS); - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - // high 256 bits rounding - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = 
_mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - // Saturation pack 32-bit to 16-bit - x8 = _mm256_packs_epi32(u8, v8); - x9 = _mm256_packs_epi32(u9, v9); - x10 = _mm256_packs_epi32(u10, v10); - x11 = _mm256_packs_epi32(u11, v11); - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 2 - y0 = _mm256_unpacklo_epi16(x8, x9); - y1 = _mm256_unpackhi_epi16(x8, x9); - s8 = _mm256_madd_epi16(y0, cospi_p04_p28); - x8 = _mm256_madd_epi16(y1, cospi_p04_p28); - s9 = _mm256_madd_epi16(y0, cospi_p28_m04); - x9 = _mm256_madd_epi16(y1, cospi_p28_m04); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p20_p12); - x10 = _mm256_madd_epi16(y1, cospi_p20_p12); - s11 = _mm256_madd_epi16(y0, cospi_p12_m20); - x11 = _mm256_madd_epi16(y1, cospi_p12_m20); - - y0 = _mm256_unpacklo_epi16(x12, x13); - y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_m28_p04); - x12 = _mm256_madd_epi16(y1, cospi_m28_p04); - s13 = _mm256_madd_epi16(y0, cospi_p04_p28); - x13 = _mm256_madd_epi16(y1, cospi_p04_p28); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m12_p20); - x14 = _mm256_madd_epi16(y1, cospi_m12_p20); - s15 = _mm256_madd_epi16(y0, cospi_p20_p12); - x15 = _mm256_madd_epi16(y1, cospi_p20_p12); - - x0 = _mm256_add_epi32(u0, u4); - s0 = _mm256_add_epi32(v0, v4); - x1 = _mm256_add_epi32(u1, u5); - s1 = _mm256_add_epi32(v1, 
v5); - x2 = _mm256_add_epi32(u2, u6); - s2 = _mm256_add_epi32(v2, v6); - x3 = _mm256_add_epi32(u3, u7); - s3 = _mm256_add_epi32(v3, v7); - - v8 = _mm256_sub_epi32(u0, u4); - v9 = _mm256_sub_epi32(v0, v4); - v10 = _mm256_sub_epi32(u1, u5); - v11 = _mm256_sub_epi32(v1, v5); - v12 = _mm256_sub_epi32(u2, u6); - v13 = _mm256_sub_epi32(v2, v6); - v14 = _mm256_sub_epi32(u3, u7); - v15 = _mm256_sub_epi32(v3, v7); - - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(v8, v9); - x5 = _mm256_packs_epi32(v10, v11); - x6 = _mm256_packs_epi32(v12, v13); - x7 = _mm256_packs_epi32(v14, v15); - - u8 = _mm256_add_epi32(s8, s12); - u9 = _mm256_add_epi32(s9, s13); - u10 = _mm256_add_epi32(s10, s14); - u11 = _mm256_add_epi32(s11, s15); - u12 = _mm256_sub_epi32(s8, s12); - u13 = _mm256_sub_epi32(s9, s13); - u14 = _mm256_sub_epi32(s10, s14); - u15 = _mm256_sub_epi32(s11, s15); - - v8 = _mm256_add_epi32(x8, x12); - v9 = _mm256_add_epi32(x9, x13); - v10 = _mm256_add_epi32(x10, x14); - v11 = _mm256_add_epi32(x11, x15); - v12 = _mm256_sub_epi32(x8, x12); - v13 = _mm256_sub_epi32(x9, x13); - v14 = _mm256_sub_epi32(x10, x14); - v15 = _mm256_sub_epi32(x11, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, 
dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 3 - y0 = _mm256_unpacklo_epi16(x4, x5); - y1 = _mm256_unpackhi_epi16(x4, x5); - s4 = _mm256_madd_epi16(y0, cospi_p08_p24); - x4 = _mm256_madd_epi16(y1, cospi_p08_p24); - s5 = _mm256_madd_epi16(y0, cospi_p24_m08); - x5 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_m24_p08); - x6 = _mm256_madd_epi16(y1, cospi_m24_p08); - s7 = _mm256_madd_epi16(y0, cospi_p08_p24); - x7 = _mm256_madd_epi16(y1, cospi_p08_p24); - - y0 = _mm256_unpacklo_epi16(x12, x13); - y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_p08_p24); - x12 = _mm256_madd_epi16(y1, cospi_p08_p24); - s13 = _mm256_madd_epi16(y0, cospi_p24_m08); - x13 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m24_p08); - x14 = _mm256_madd_epi16(y1, cospi_m24_p08); - s15 = _mm256_madd_epi16(y0, cospi_p08_p24); - x15 = _mm256_madd_epi16(y1, cospi_p08_p24); - - u0 = _mm256_add_epi32(x0, x2); - v0 = _mm256_add_epi32(s0, s2); - u1 = _mm256_add_epi32(x1, x3); - v1 = _mm256_add_epi32(s1, s3); - u2 = 
_mm256_sub_epi32(x0, x2); - v2 = _mm256_sub_epi32(s0, s2); - u3 = _mm256_sub_epi32(x1, x3); - v3 = _mm256_sub_epi32(s1, s3); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = _mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = _mm256_packs_epi32(u0, v0); - x1 = _mm256_packs_epi32(u1, v1); - x2 = _mm256_packs_epi32(u2, v2); - x3 = _mm256_packs_epi32(u3, v3); - - // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7 - u4 = _mm256_add_epi32(s4, s6); - u5 = _mm256_add_epi32(s5, s7); - u6 = _mm256_sub_epi32(s4, s6); - u7 = _mm256_sub_epi32(s5, s7); - - v4 = _mm256_add_epi32(x4, x6); - v5 = _mm256_add_epi32(x5, x7); - v6 = _mm256_sub_epi32(x4, x6); - v7 = _mm256_sub_epi32(x5, x7); - - u4 = _mm256_add_epi32(u4, dct_rounding); - u5 = _mm256_add_epi32(u5, dct_rounding); - u6 = _mm256_add_epi32(u6, dct_rounding); - u7 = _mm256_add_epi32(u7, dct_rounding); - - u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS); - u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - v4 = _mm256_add_epi32(v4, dct_rounding); - v5 = _mm256_add_epi32(v5, dct_rounding); - v6 = _mm256_add_epi32(v6, dct_rounding); - v7 = _mm256_add_epi32(v7, dct_rounding); - - v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS); - v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = 
_mm256_srai_epi32(v7, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(u4, v4); - in[12] = _mm256_packs_epi32(u5, v5); - x6 = _mm256_packs_epi32(u6, v6); - x7 = _mm256_packs_epi32(u7, v7); - - u0 = _mm256_add_epi32(u8, u10); - v0 = _mm256_add_epi32(v8, v10); - u1 = _mm256_add_epi32(u9, u11); - v1 = _mm256_add_epi32(v9, v11); - u2 = _mm256_sub_epi32(u8, u10); - v2 = _mm256_sub_epi32(v8, v10); - u3 = _mm256_sub_epi32(u9, u11); - v3 = _mm256_sub_epi32(v9, v11); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = _mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - x8 = _mm256_packs_epi32(u0, v0); - in[14] = _mm256_packs_epi32(u1, v1); - x10 = _mm256_packs_epi32(u2, v2); - x11 = _mm256_packs_epi32(u3, v3); - - // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15 - u12 = _mm256_add_epi32(s12, s14); - u13 = _mm256_add_epi32(s13, s15); - u14 = _mm256_sub_epi32(s12, s14); - u15 = _mm256_sub_epi32(s13, s15); - - v12 = _mm256_add_epi32(x12, x14); - v13 = _mm256_add_epi32(x13, x15); - v14 = _mm256_sub_epi32(x12, x14); - v15 = _mm256_sub_epi32(x13, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, 
DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - in[2] = x12; - - // stage 4 - y0 = _mm256_unpacklo_epi16(x2, x3); - y1 = _mm256_unpackhi_epi16(x2, x3); - s2 = _mm256_madd_epi16(y0, cospi_m16_m16); - x2 = _mm256_madd_epi16(y1, cospi_m16_m16); - s3 = _mm256_madd_epi16(y0, cospi_p16_m16); - x3 = _mm256_madd_epi16(y1, cospi_p16_m16); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_p16_p16); - x6 = _mm256_madd_epi16(y1, cospi_p16_p16); - s7 = _mm256_madd_epi16(y0, cospi_m16_p16); - x7 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p16_p16); - x10 = _mm256_madd_epi16(y1, cospi_p16_p16); - s11 = _mm256_madd_epi16(y0, cospi_m16_p16); - x11 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m16_m16); - x14 = _mm256_madd_epi16(y1, cospi_m16_m16); - s15 = _mm256_madd_epi16(y0, cospi_p16_m16); - x15 = _mm256_madd_epi16(y1, cospi_p16_m16); - - // Rounding - u2 = _mm256_add_epi32(s2, dct_rounding); - u3 = _mm256_add_epi32(s3, dct_rounding); - u6 = _mm256_add_epi32(s6, dct_rounding); - u7 = _mm256_add_epi32(s7, dct_rounding); - - u10 = _mm256_add_epi32(s10, dct_rounding); - u11 = _mm256_add_epi32(s11, dct_rounding); - u14 = _mm256_add_epi32(s14, dct_rounding); - 
u15 = _mm256_add_epi32(s15, dct_rounding); - - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v2 = _mm256_add_epi32(x2, dct_rounding); - v3 = _mm256_add_epi32(x3, dct_rounding); - v6 = _mm256_add_epi32(x6, dct_rounding); - v7 = _mm256_add_epi32(x7, dct_rounding); - - v10 = _mm256_add_epi32(x10, dct_rounding); - v11 = _mm256_add_epi32(x11, dct_rounding); - v14 = _mm256_add_epi32(x14, dct_rounding); - v15 = _mm256_add_epi32(x15, dct_rounding); - - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); - - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - in[7] = _mm256_packs_epi32(u2, v2); - in[8] = _mm256_packs_epi32(u3, v3); - - in[4] = _mm256_packs_epi32(u6, v6); - in[11] = _mm256_packs_epi32(u7, v7); - - in[6] = _mm256_packs_epi32(u10, v10); - in[9] = _mm256_packs_epi32(u11, v11); - - in[5] = _mm256_packs_epi32(u14, v14); - in[10] = _mm256_packs_epi32(u15, v15); - - in[1] = _mm256_sub_epi16(zero, x8); - in[3] = _mm256_sub_epi16(zero, x4); - in[13] = _mm256_sub_epi16(zero, x13); - in[15] = _mm256_sub_epi16(zero, x1); -} - -#if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { - txfm_scaling16_avx2((int16_t)Sqrt2, in); -} -#endif - -void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m256i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for 
tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case ADST_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, stride, 1, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case IDTX: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case V_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_DCT: - load_buffer_16x16(input, stride, 0, 0, 
in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case V_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case V_FLIPADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - mm256_transpose_16x16(in, in); - write_buffer_16x16(in, output); - _mm256_zeroupper(); -} - -static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { - int i = 0; - __m256i temp; - while (i < size) { - temp = a0[i]; - a0[i] = a1[i]; - a1[i] = temp; - i++; - } -} - -static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { - mm256_transpose_16x16(in0, in0); - mm256_transpose_16x16(&in0[16], &in0[16]); - mm256_transpose_16x16(in1, in1); - mm256_transpose_16x16(&in1[16], &in1[16]); - mm256_vectors_swap(&in0[16], in1, 16); -} - -static void prepare_16x16_even(const __m256i *in, __m256i *even) { - even[0] = _mm256_add_epi16(in[0], in[31]); - even[1] = _mm256_add_epi16(in[1], in[30]); - even[2] = _mm256_add_epi16(in[2], in[29]); - even[3] = _mm256_add_epi16(in[3], in[28]); - even[4] = _mm256_add_epi16(in[4], in[27]); - even[5] = _mm256_add_epi16(in[5], in[26]); - even[6] = _mm256_add_epi16(in[6], in[25]); - even[7] = _mm256_add_epi16(in[7], in[24]); - even[8] = _mm256_add_epi16(in[8], in[23]); - even[9] = _mm256_add_epi16(in[9], in[22]); - even[10] = _mm256_add_epi16(in[10], in[21]); - even[11] = 
_mm256_add_epi16(in[11], in[20]); - even[12] = _mm256_add_epi16(in[12], in[19]); - even[13] = _mm256_add_epi16(in[13], in[18]); - even[14] = _mm256_add_epi16(in[14], in[17]); - even[15] = _mm256_add_epi16(in[15], in[16]); -} - -static void prepare_16x16_odd(const __m256i *in, __m256i *odd) { - odd[0] = _mm256_sub_epi16(in[15], in[16]); - odd[1] = _mm256_sub_epi16(in[14], in[17]); - odd[2] = _mm256_sub_epi16(in[13], in[18]); - odd[3] = _mm256_sub_epi16(in[12], in[19]); - odd[4] = _mm256_sub_epi16(in[11], in[20]); - odd[5] = _mm256_sub_epi16(in[10], in[21]); - odd[6] = _mm256_sub_epi16(in[9], in[22]); - odd[7] = _mm256_sub_epi16(in[8], in[23]); - odd[8] = _mm256_sub_epi16(in[7], in[24]); - odd[9] = _mm256_sub_epi16(in[6], in[25]); - odd[10] = _mm256_sub_epi16(in[5], in[26]); - odd[11] = _mm256_sub_epi16(in[4], in[27]); - odd[12] = _mm256_sub_epi16(in[3], in[28]); - odd[13] = _mm256_sub_epi16(in[2], in[29]); - odd[14] = _mm256_sub_epi16(in[1], in[30]); - odd[15] = _mm256_sub_epi16(in[0], in[31]); -} - -static void collect_16col(const __m256i *even, const __m256i *odd, - __m256i *out) { - // fdct16_avx2() already maps the output - out[0] = even[0]; - out[2] = even[1]; - out[4] = even[2]; - out[6] = even[3]; - out[8] = even[4]; - out[10] = even[5]; - out[12] = even[6]; - out[14] = even[7]; - out[16] = even[8]; - out[18] = even[9]; - out[20] = even[10]; - out[22] = even[11]; - out[24] = even[12]; - out[26] = even[13]; - out[28] = even[14]; - out[30] = even[15]; - - out[1] = odd[0]; - out[17] = odd[1]; - out[9] = odd[2]; - out[25] = odd[3]; - out[5] = odd[4]; - out[21] = odd[5]; - out[13] = odd[6]; - out[29] = odd[7]; - out[3] = odd[8]; - out[19] = odd[9]; - out[11] = odd[10]; - out[27] = odd[11]; - out[7] = odd[12]; - out[23] = odd[13]; - out[15] = odd[14]; - out[31] = odd[15]; -} - -static void collect_coeffs(const __m256i *first_16col_even, - const __m256i *first_16col_odd, - const __m256i *second_16col_even, - const __m256i *second_16col_odd, __m256i *in0, - __m256i 
*in1) { - collect_16col(first_16col_even, first_16col_odd, in0); - collect_16col(second_16col_even, second_16col_odd, in1); -} - -static void fdct16_odd_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64); - const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64); - const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64); - const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64); - const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64); - const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); - const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64); - const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); 
- const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64); - const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64); - const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15; - __m256i u0, u1; - - // stage 1 is in prepare_16x16_odd() - - // stage 2 - y0 = in[0]; - y1 = in[1]; - y2 = in[2]; - y3 = in[3]; - - u0 = _mm256_unpacklo_epi16(in[4], in[11]); - u1 = _mm256_unpackhi_epi16(in[4], in[11]); - y4 = butter_fly(&u0, &u1, &cospi_m16_p16); - y11 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[5], in[10]); - u1 = _mm256_unpackhi_epi16(in[5], in[10]); - y5 = butter_fly(&u0, &u1, &cospi_m16_p16); - y10 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[6], in[9]); - u1 = _mm256_unpackhi_epi16(in[6], in[9]); - y6 = butter_fly(&u0, &u1, &cospi_m16_p16); - y9 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[7], in[8]); - u1 = _mm256_unpackhi_epi16(in[7], in[8]); - y7 = butter_fly(&u0, &u1, &cospi_m16_p16); - y8 = butter_fly(&u0, &u1, &cospi_p16_p16); - - y12 = in[12]; - y13 = in[13]; - y14 = in[14]; - y15 = in[15]; - - // stage 3 - x0 = _mm256_add_epi16(y0, y7); - x1 = _mm256_add_epi16(y1, y6); - x2 = _mm256_add_epi16(y2, y5); - x3 = _mm256_add_epi16(y3, y4); - x4 = _mm256_sub_epi16(y3, y4); - x5 = _mm256_sub_epi16(y2, y5); - x6 = _mm256_sub_epi16(y1, y6); - x7 = _mm256_sub_epi16(y0, y7); - x8 = _mm256_sub_epi16(y15, y8); - x9 = _mm256_sub_epi16(y14, y9); - x10 = _mm256_sub_epi16(y13, y10); - x11 = _mm256_sub_epi16(y12, y11); - x12 = _mm256_add_epi16(y12, y11); - x13 = _mm256_add_epi16(y13, y10); - x14 = _mm256_add_epi16(y14, y9); - x15 = _mm256_add_epi16(y15, y8); - - // stage 4 - y0 = x0; - y1 = x1; - y6 = 
x6; - y7 = x7; - y8 = x8; - y9 = x9; - y14 = x14; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m08_p24); - y13 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - y3 = butter_fly(&u0, &u1, &cospi_m08_p24); - y12 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - y4 = butter_fly(&u0, &u1, &cospi_m24_m08); - y11 = butter_fly(&u0, &u1, &cospi_m08_p24); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m24_m08); - y10 = butter_fly(&u0, &u1, &cospi_m08_p24); - - // stage 5 - x0 = _mm256_add_epi16(y0, y3); - x1 = _mm256_add_epi16(y1, y2); - x2 = _mm256_sub_epi16(y1, y2); - x3 = _mm256_sub_epi16(y0, y3); - x4 = _mm256_sub_epi16(y7, y4); - x5 = _mm256_sub_epi16(y6, y5); - x6 = _mm256_add_epi16(y6, y5); - x7 = _mm256_add_epi16(y7, y4); - - x8 = _mm256_add_epi16(y8, y11); - x9 = _mm256_add_epi16(y9, y10); - x10 = _mm256_sub_epi16(y9, y10); - x11 = _mm256_sub_epi16(y8, y11); - x12 = _mm256_sub_epi16(y15, y12); - x13 = _mm256_sub_epi16(y14, y13); - x14 = _mm256_add_epi16(y14, y13); - x15 = _mm256_add_epi16(y15, y12); - - // stage 6 - y0 = x0; - y3 = x3; - y4 = x4; - y7 = x7; - y8 = x8; - y11 = x11; - y12 = x12; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - y1 = butter_fly(&u0, &u1, &cospi_m04_p28); - y14 = butter_fly(&u0, &u1, &cospi_p28_p04); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m28_m04); - y13 = butter_fly(&u0, &u1, &cospi_m04_p28); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m20_p12); - y10 = butter_fly(&u0, &u1, &cospi_p12_p20); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = 
_mm256_unpackhi_epi16(x6, x9); - y6 = butter_fly(&u0, &u1, &cospi_m12_m20); - y9 = butter_fly(&u0, &u1, &cospi_m20_p12); - - // stage 7 - x0 = _mm256_add_epi16(y0, y1); - x1 = _mm256_sub_epi16(y0, y1); - x2 = _mm256_sub_epi16(y3, y2); - x3 = _mm256_add_epi16(y3, y2); - x4 = _mm256_add_epi16(y4, y5); - x5 = _mm256_sub_epi16(y4, y5); - x6 = _mm256_sub_epi16(y7, y6); - x7 = _mm256_add_epi16(y7, y6); - - x8 = _mm256_add_epi16(y8, y9); - x9 = _mm256_sub_epi16(y8, y9); - x10 = _mm256_sub_epi16(y11, y10); - x11 = _mm256_add_epi16(y11, y10); - x12 = _mm256_add_epi16(y12, y13); - x13 = _mm256_sub_epi16(y12, y13); - x14 = _mm256_sub_epi16(y15, y14); - x15 = _mm256_add_epi16(y15, y14); - - // stage 8 - u0 = _mm256_unpacklo_epi16(x0, x15); - u1 = _mm256_unpackhi_epi16(x0, x15); - in[0] = butter_fly(&u0, &u1, &cospi_p31_p01); - in[15] = butter_fly(&u0, &u1, &cospi_m01_p31); - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - in[1] = butter_fly(&u0, &u1, &cospi_p15_p17); - in[14] = butter_fly(&u0, &u1, &cospi_m17_p15); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - in[2] = butter_fly(&u0, &u1, &cospi_p23_p09); - in[13] = butter_fly(&u0, &u1, &cospi_m09_p23); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - in[3] = butter_fly(&u0, &u1, &cospi_p07_p25); - in[12] = butter_fly(&u0, &u1, &cospi_m25_p07); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - in[4] = butter_fly(&u0, &u1, &cospi_p27_p05); - in[11] = butter_fly(&u0, &u1, &cospi_m05_p27); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - in[5] = butter_fly(&u0, &u1, &cospi_p11_p21); - in[10] = butter_fly(&u0, &u1, &cospi_m21_p11); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = _mm256_unpackhi_epi16(x6, x9); - in[6] = butter_fly(&u0, &u1, &cospi_p19_p13); - in[9] = butter_fly(&u0, &u1, &cospi_m13_p19); - - u0 = _mm256_unpacklo_epi16(x7, x8); - u1 = 
_mm256_unpackhi_epi16(x7, x8); - in[7] = butter_fly(&u0, &u1, &cospi_p03_p29); - in[8] = butter_fly(&u0, &u1, &cospi_m29_p03); -} - -static void fdct32_avx2(__m256i *in0, __m256i *in1) { - __m256i even0[16], even1[16], odd0[16], odd1[16]; - prepare_16x16_even(in0, even0); - fdct16_avx2(even0); - - prepare_16x16_odd(in0, odd0); - fdct16_odd_avx2(odd0); - - prepare_16x16_even(in1, even1); - fdct16_avx2(even1); - - prepare_16x16_odd(in1, odd1); - fdct16_odd_avx2(odd1); - - collect_coeffs(even0, odd0, even1, odd1, in0, in1); - - mm256_transpose_32x32(in0, in1); -} - -static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, - tran_low_t *output) { - int i = 0; - const int stride = 32; - tran_low_t *coeff = output; - while (i < 32) { - storeu_output_avx2(&in0[i], coeff); - storeu_output_avx2(&in1[i], coeff + 16); - coeff += stride; - i += 1; - } -} - -#if CONFIG_EXT_TX -static void fhalfright32_16col_avx2(__m256i *in) { - int i = 0; - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i x0, x1; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 2); - x0 = _mm256_unpacklo_epi16(in[i + 16], zero); - x1 = _mm256_unpackhi_epi16(in[i + 16], zero); - x0 = _mm256_madd_epi16(x0, sqrt2); - x1 = _mm256_madd_epi16(x1, sqrt2); - x0 = _mm256_add_epi32(x0, dct_rounding); - x1 = _mm256_add_epi32(x1, dct_rounding); - x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS); - x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS); - in[i + 16] = _mm256_packs_epi32(x0, x1); - i += 1; - } - fdct16_avx2(&in[16]); -} - -static void fhalfright32_avx2(__m256i *in0, __m256i *in1) { - fhalfright32_16col_avx2(in0); - fhalfright32_16col_avx2(in1); - mm256_vectors_swap(in0, &in0[16], 16); - mm256_vectors_swap(in1, &in1[16], 16); - mm256_transpose_32x32(in0, in1); -} -#endif // CONFIG_EXT_TX - -static INLINE void load_buffer_32x32(const int16_t *input, int stride, - int 
flipud, int fliplr, __m256i *in0, - __m256i *in1) { - // Load 4 16x16 blocks - const int16_t *topL = input; - const int16_t *topR = input + 16; - const int16_t *botL = input + 16 * stride; - const int16_t *botR = input + 16 * stride + 16; - - const int16_t *tmp; - - if (flipud) { - // Swap left columns - tmp = topL; - topL = botL; - botL = tmp; - // Swap right columns - tmp = topR; - topR = botR; - botR = tmp; - } - - if (fliplr) { - // Swap top rows - tmp = topL; - topL = topR; - topR = tmp; - // Swap bottom rows - tmp = botL; - botL = botR; - botR = tmp; - } - - // load first 16 columns - load_buffer_16x16(topL, stride, flipud, fliplr, in0); - load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16); - - // load second 16 columns - load_buffer_16x16(topR, stride, flipud, fliplr, in1); - load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16); -} - -static INLINE void right_shift_32x32_16col(int bit, __m256i *in) { - int i = 0; - const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1); - __m256i sign; - while (i < 32) { - sign = _mm256_srai_epi16(in[i], 15); - in[i] = _mm256_add_epi16(in[i], rounding); - in[i] = _mm256_add_epi16(in[i], sign); - in[i] = _mm256_srai_epi16(in[i], bit); - i += 1; - } -} - -// Positive rounding -static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) { - const int bit = 4; - right_shift_32x32_16col(bit, in0); - right_shift_32x32_16col(bit, in1); -} - -#if CONFIG_EXT_TX -static void fidtx32_avx2(__m256i *in0, __m256i *in1) { - int i = 0; - while (i < 32) { - in0[i] = _mm256_slli_epi16(in0[i], 2); - in1[i] = _mm256_slli_epi16(in1[i], 2); - i += 1; - } - mm256_transpose_32x32(in0, in1); -} -#endif - -void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m256i in0[32]; // left 32 columns - __m256i in1[32]; // right 32 columns - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT"); 
-#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case DCT_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case ADST_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_DCT: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case DCT_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_FLIPADST: - load_buffer_32x32(input, stride, 1, 1, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case ADST_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_ADST: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case IDTX: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case V_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - 
right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case V_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case V_FLIPADST: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - write_buffer_32x32(in0, in1, output); - _mm256_zeroupper(); -} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm index 7186b6b92..30983d1c1 100644 --- a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm +++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm @@ -14,6 +14,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ; void av1_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c index bf233ca4d..4d2e99f25 100644 --- a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c +++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); - const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); -- cgit v1.2.3